{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 23984, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "loss": 21.045429229736328, "step": 0 }, { "ce_loss": 3.982180595397949, "epoch": 0, "step": 0 }, { "distill_loss": 1.7960407733917236, "epoch": 0, "step": 0 }, { "epoch": 0, "ref_ce_loss": 2.866819381713867, "step": 0 }, { "epoch": 0, "loss": 20.749279022216797, "step": 0 }, { "ce_loss": 4.234129905700684, "epoch": 0, "step": 0 }, { "distill_loss": 1.7374516725540161, "epoch": 0, "step": 0 }, { "epoch": 0, "ref_ce_loss": 3.2119572162628174, "step": 0 }, { "epoch": 0, "loss": 19.981489181518555, "step": 0 }, { "ce_loss": 3.8592026233673096, "epoch": 0, "step": 0 }, { "distill_loss": 1.896045446395874, "epoch": 0, "step": 0 }, { "epoch": 0, "ref_ce_loss": 3.0282931327819824, "step": 0 }, { "epoch": 0, "loss": 20.778669357299805, "step": 0 }, { "ce_loss": 3.793614149093628, "epoch": 0, "step": 0 }, { "distill_loss": 1.8912239074707031, "epoch": 0, "step": 0 }, { "epoch": 0, "ref_ce_loss": 2.9311015605926514, "step": 0 }, { "epoch": 0.00333555703802535, "loss": 18.5052, "step": 10 }, { "epoch": 0.00333555703802535, "grad_norm": 762.3750610351562, "step": 10 }, { "epoch": 0.00333555703802535, "learning_rate": 4.166666666666666e-06, "step": 10 }, { "epoch": 0.00333555703802535, "loss": 14.035091400146484, "step": 10 }, { "ce_loss": 3.1100096702575684, "epoch": 0.00333555703802535, "step": 10 }, { "distill_loss": 1.7577776908874512, "epoch": 0.00333555703802535, "step": 10 }, { "epoch": 0.00333555703802535, "ref_ce_loss": 3.2110393047332764, "step": 10 }, { "epoch": 0.00333555703802535, "loss": 13.886899948120117, "step": 10 }, { "ce_loss": 3.069828510284424, "epoch": 0.00333555703802535, "step": 10 }, { "distill_loss": 1.7487568855285645, "epoch": 0.00333555703802535, "step": 10 }, { "epoch": 0.00333555703802535, "ref_ce_loss": 3.2038192749023438, "step": 10 }, { "epoch": 0.00333555703802535, "loss": 12.481767654418945, "step": 10 }, { "ce_loss": 3.101062059402466, "epoch": 0.00333555703802535, "step": 10 }, { "distill_loss": 1.7259626388549805, "epoch": 0.00333555703802535, "step": 10 }, { "epoch": 0.00333555703802535, "ref_ce_loss": 3.0520899295806885, "step": 10 }, { "epoch": 0.00333555703802535, "loss": 13.172759056091309, "step": 10 }, { "ce_loss": 3.231196880340576, "epoch": 0.00333555703802535, "step": 10 }, { "distill_loss": 1.8049229383468628, "epoch": 0.00333555703802535, "step": 10 }, { "epoch": 0.00333555703802535, "ref_ce_loss": 2.805325984954834, "step": 10 }, { "epoch": 0.0066711140760507, "loss": 9.7871, "step": 20 }, { "epoch": 0.0066711140760507, "grad_norm": 227.34100341796875, "step": 20 }, { "epoch": 0.0066711140760507, "learning_rate": 8.333333333333332e-06, "step": 20 }, { "epoch": 0.0066711140760507, "loss": 6.437663555145264, "step": 20 }, { "ce_loss": 1.5575122833251953, "epoch": 0.0066711140760507, "step": 20 }, { "distill_loss": 1.659112572669983, "epoch": 0.0066711140760507, "step": 20 }, { "epoch": 0.0066711140760507, "ref_ce_loss": 3.047043800354004, "step": 20 }, { "epoch": 0.0066711140760507, "loss": 6.436098575592041, "step": 20 }, { "ce_loss": 1.5137897729873657, "epoch": 0.0066711140760507, "step": 20 }, { "distill_loss": 1.670945644378662, "epoch": 0.0066711140760507, "step": 20 }, { "epoch": 0.0066711140760507, "ref_ce_loss": 3.0977792739868164, "step": 20 }, { "epoch": 0.0066711140760507, "loss": 6.919541835784912, "step": 20 }, { "ce_loss": 1.7808164358139038, "epoch": 0.0066711140760507, "step": 20 }, { "distill_loss": 1.7737149000167847, "epoch": 0.0066711140760507, "step": 20 }, { "epoch": 0.0066711140760507, "ref_ce_loss": 2.914146900177002, "step": 20 }, { "epoch": 0.0066711140760507, "loss": 7.18682861328125, "step": 20 }, { "ce_loss": 1.5944644212722778, "epoch": 0.0066711140760507, "step": 20 }, { "distill_loss": 1.83235764503479, "epoch": 0.0066711140760507, "step": 20 }, { "epoch": 0.0066711140760507, "ref_ce_loss": 2.944377899169922, "step": 20 }, { "epoch": 0.01000667111407605, "loss": 6.1788, "step": 30 }, { "epoch": 0.01000667111407605, "grad_norm": 74.83038330078125, "step": 30 }, { "epoch": 0.01000667111407605, "learning_rate": 1.2499999999999999e-05, "step": 30 }, { "epoch": 0.01000667111407605, "loss": 5.411984920501709, "step": 30 }, { "ce_loss": 0.5704580545425415, "epoch": 0.01000667111407605, "step": 30 }, { "distill_loss": 1.3909449577331543, "epoch": 0.01000667111407605, "step": 30 }, { "epoch": 0.01000667111407605, "ref_ce_loss": 2.3826277256011963, "step": 30 }, { "epoch": 0.01000667111407605, "loss": 4.33409309387207, "step": 30 }, { "ce_loss": 0.5164247751235962, "epoch": 0.01000667111407605, "step": 30 }, { "distill_loss": 1.3752120733261108, "epoch": 0.01000667111407605, "step": 30 }, { "epoch": 0.01000667111407605, "ref_ce_loss": 2.437467336654663, "step": 30 }, { "epoch": 0.01000667111407605, "loss": 4.793662071228027, "step": 30 }, { "ce_loss": 0.4295244514942169, "epoch": 0.01000667111407605, "step": 30 }, { "distill_loss": 1.3164716958999634, "epoch": 0.01000667111407605, "step": 30 }, { "epoch": 0.01000667111407605, "ref_ce_loss": 2.5118391513824463, "step": 30 }, { "epoch": 0.01000667111407605, "loss": 6.409175872802734, "step": 30 }, { "ce_loss": 0.7220518589019775, "epoch": 0.01000667111407605, "step": 30 }, { "distill_loss": 1.6983754634857178, "epoch": 0.01000667111407605, "step": 30 }, { "epoch": 0.01000667111407605, "ref_ce_loss": 2.3008272647857666, "step": 30 }, { "epoch": 0.0133422281521014, "loss": 4.2478, "step": 40 }, { "epoch": 0.0133422281521014, "grad_norm": 12.155257225036621, "step": 40 }, { "epoch": 0.0133422281521014, "learning_rate": 1.6666666666666664e-05, "step": 40 }, { "epoch": 0.0133422281521014, "loss": 3.65868878364563, "step": 40 }, { "ce_loss": 0.7195524573326111, "epoch": 0.0133422281521014, "step": 40 }, { "distill_loss": 1.1870580911636353, "epoch": 0.0133422281521014, "step": 40 }, { "epoch": 0.0133422281521014, "ref_ce_loss": 1.3222304582595825, "step": 40 }, { "epoch": 0.0133422281521014, "loss": 3.2196686267852783, "step": 40 }, { "ce_loss": 0.6631148457527161, "epoch": 0.0133422281521014, "step": 40 }, { "distill_loss": 1.2712286710739136, "epoch": 0.0133422281521014, "step": 40 }, { "epoch": 0.0133422281521014, "ref_ce_loss": 1.283046841621399, "step": 40 }, { "epoch": 0.0133422281521014, "loss": 3.057570457458496, "step": 40 }, { "ce_loss": 0.42720213532447815, "epoch": 0.0133422281521014, "step": 40 }, { "distill_loss": 1.1877330541610718, "epoch": 0.0133422281521014, "step": 40 }, { "epoch": 0.0133422281521014, "ref_ce_loss": 1.2598295211791992, "step": 40 }, { "epoch": 0.0133422281521014, "loss": 3.893627643585205, "step": 40 }, { "ce_loss": 0.5937362909317017, "epoch": 0.0133422281521014, "step": 40 }, { "distill_loss": 1.1929004192352295, "epoch": 0.0133422281521014, "step": 40 }, { "epoch": 0.0133422281521014, "ref_ce_loss": 1.3428030014038086, "step": 40 }, { "epoch": 0.01667778519012675, "loss": 3.0172, "step": 50 }, { "epoch": 0.01667778519012675, "grad_norm": 11.977120399475098, "step": 50 }, { "epoch": 0.01667778519012675, "learning_rate": 2.0833333333333333e-05, "step": 50 }, { "epoch": 0.01667778519012675, "loss": 2.141342878341675, "step": 50 }, { "ce_loss": 0.47699522972106934, "epoch": 0.01667778519012675, "step": 50 }, { "distill_loss": 0.9509934186935425, "epoch": 0.01667778519012675, "step": 50 }, { "epoch": 0.01667778519012675, "ref_ce_loss": 0.5504632592201233, "step": 50 }, { "epoch": 0.01667778519012675, "loss": 2.638903856277466, "step": 50 }, { "ce_loss": 0.5003000497817993, "epoch": 0.01667778519012675, "step": 50 }, { "distill_loss": 0.9422565698623657, "epoch": 0.01667778519012675, "step": 50 }, { "epoch": 0.01667778519012675, "ref_ce_loss": 0.5490682721138, "step": 50 }, { "epoch": 0.01667778519012675, "loss": 2.7825381755828857, "step": 50 }, { "ce_loss": 0.49777916073799133, "epoch": 0.01667778519012675, "step": 50 }, { "distill_loss": 1.050254225730896, "epoch": 0.01667778519012675, "step": 50 }, { "epoch": 0.01667778519012675, "ref_ce_loss": 0.4461013078689575, "step": 50 }, { "epoch": 0.01667778519012675, "loss": 2.273829460144043, "step": 50 }, { "ce_loss": 0.5292913317680359, "epoch": 0.01667778519012675, "step": 50 }, { "distill_loss": 1.0160512924194336, "epoch": 0.01667778519012675, "step": 50 }, { "epoch": 0.01667778519012675, "ref_ce_loss": 0.49012917280197144, "step": 50 }, { "epoch": 0.0200133422281521, "loss": 2.2235, "step": 60 }, { "epoch": 0.0200133422281521, "grad_norm": 8.11925983428955, "step": 60 }, { "epoch": 0.0200133422281521, "learning_rate": 2.4999999999999998e-05, "step": 60 }, { "epoch": 0.0200133422281521, "loss": 1.9671134948730469, "step": 60 }, { "ce_loss": 0.5590649247169495, "epoch": 0.0200133422281521, "step": 60 }, { "distill_loss": 0.8475483059883118, "epoch": 0.0200133422281521, "step": 60 }, { "epoch": 0.0200133422281521, "ref_ce_loss": 0.3405017554759979, "step": 60 }, { "epoch": 0.0200133422281521, "loss": 2.8774218559265137, "step": 60 }, { "ce_loss": 0.5681470036506653, "epoch": 0.0200133422281521, "step": 60 }, { "distill_loss": 0.865785539150238, "epoch": 0.0200133422281521, "step": 60 }, { "epoch": 0.0200133422281521, "ref_ce_loss": 0.4073866009712219, "step": 60 }, { "epoch": 0.0200133422281521, "loss": 1.493269681930542, "step": 60 }, { "ce_loss": 0.3890523612499237, "epoch": 0.0200133422281521, "step": 60 }, { "distill_loss": 0.7810712456703186, "epoch": 0.0200133422281521, "step": 60 }, { "epoch": 0.0200133422281521, "ref_ce_loss": 0.322986900806427, "step": 60 }, { "epoch": 0.0200133422281521, "loss": 2.767509937286377, "step": 60 }, { "ce_loss": 0.4819281995296478, "epoch": 0.0200133422281521, "step": 60 }, { "distill_loss": 0.8184367418289185, "epoch": 0.0200133422281521, "step": 60 }, { "epoch": 0.0200133422281521, "ref_ce_loss": 0.2890125811100006, "step": 60 }, { "epoch": 0.02334889926617745, "loss": 2.0951, "step": 70 }, { "epoch": 0.02334889926617745, "grad_norm": 8.139989852905273, "step": 70 }, { "epoch": 0.02334889926617745, "learning_rate": 2.9166666666666663e-05, "step": 70 }, { "epoch": 0.02334889926617745, "loss": 1.4588215351104736, "step": 70 }, { "ce_loss": 0.4765501320362091, "epoch": 0.02334889926617745, "step": 70 }, { "distill_loss": 0.7598084807395935, "epoch": 0.02334889926617745, "step": 70 }, { "epoch": 0.02334889926617745, "ref_ce_loss": 0.22233976423740387, "step": 70 }, { "epoch": 0.02334889926617745, "loss": 2.102530002593994, "step": 70 }, { "ce_loss": 0.4892614185810089, "epoch": 0.02334889926617745, "step": 70 }, { "distill_loss": 0.7742966413497925, "epoch": 0.02334889926617745, "step": 70 }, { "epoch": 0.02334889926617745, "ref_ce_loss": 0.2892897427082062, "step": 70 }, { "epoch": 0.02334889926617745, "loss": 1.932617425918579, "step": 70 }, { "ce_loss": 0.5458316802978516, "epoch": 0.02334889926617745, "step": 70 }, { "distill_loss": 0.8504544496536255, "epoch": 0.02334889926617745, "step": 70 }, { "epoch": 0.02334889926617745, "ref_ce_loss": 0.3213707506656647, "step": 70 }, { "epoch": 0.02334889926617745, "loss": 1.812137246131897, "step": 70 }, { "ce_loss": 0.5298717021942139, "epoch": 0.02334889926617745, "step": 70 }, { "distill_loss": 0.7876577377319336, "epoch": 0.02334889926617745, "step": 70 }, { "epoch": 0.02334889926617745, "ref_ce_loss": 0.29828882217407227, "step": 70 }, { "epoch": 0.0266844563042028, "loss": 1.9725, "step": 80 }, { "epoch": 0.0266844563042028, "grad_norm": 5.928866863250732, "step": 80 }, { "epoch": 0.0266844563042028, "learning_rate": 3.333333333333333e-05, "step": 80 }, { "epoch": 0.0266844563042028, "loss": 1.7023259401321411, "step": 80 }, { "ce_loss": 0.5784029364585876, "epoch": 0.0266844563042028, "step": 80 }, { "distill_loss": 0.6333774328231812, "epoch": 0.0266844563042028, "step": 80 }, { "epoch": 0.0266844563042028, "ref_ce_loss": 0.3491305112838745, "step": 80 }, { "epoch": 0.0266844563042028, "loss": 1.8371233940124512, "step": 80 }, { "ce_loss": 0.5097202658653259, "epoch": 0.0266844563042028, "step": 80 }, { "distill_loss": 0.6676629781723022, "epoch": 0.0266844563042028, "step": 80 }, { "epoch": 0.0266844563042028, "ref_ce_loss": 0.24543851613998413, "step": 80 }, { "epoch": 0.0266844563042028, "loss": 2.143604278564453, "step": 80 }, { "ce_loss": 0.6652351021766663, "epoch": 0.0266844563042028, "step": 80 }, { "distill_loss": 0.7542927265167236, "epoch": 0.0266844563042028, "step": 80 }, { "epoch": 0.0266844563042028, "ref_ce_loss": 0.31177300214767456, "step": 80 }, { "epoch": 0.0266844563042028, "loss": 2.2320713996887207, "step": 80 }, { "ce_loss": 0.47276201844215393, "epoch": 0.0266844563042028, "step": 80 }, { "distill_loss": 0.6124601364135742, "epoch": 0.0266844563042028, "step": 80 }, { "epoch": 0.0266844563042028, "ref_ce_loss": 0.23781082034111023, "step": 80 }, { "epoch": 0.030020013342228154, "loss": 1.9708, "step": 90 }, { "epoch": 0.030020013342228154, "grad_norm": 9.603582382202148, "step": 90 }, { "epoch": 0.030020013342228154, "learning_rate": 3.75e-05, "step": 90 }, { "epoch": 0.030020013342228154, "loss": 2.0661861896514893, "step": 90 }, { "ce_loss": 0.5394359827041626, "epoch": 0.030020013342228154, "step": 90 }, { "distill_loss": 0.6982392072677612, "epoch": 0.030020013342228154, "step": 90 }, { "epoch": 0.030020013342228154, "ref_ce_loss": 0.24196895956993103, "step": 90 }, { "epoch": 0.030020013342228154, "loss": 2.078852653503418, "step": 90 }, { "ce_loss": 0.5509630441665649, "epoch": 0.030020013342228154, "step": 90 }, { "distill_loss": 0.600722074508667, "epoch": 0.030020013342228154, "step": 90 }, { "epoch": 0.030020013342228154, "ref_ce_loss": 0.2933999001979828, "step": 90 }, { "epoch": 0.030020013342228154, "loss": 1.5637493133544922, "step": 90 }, { "ce_loss": 0.46072623133659363, "epoch": 0.030020013342228154, "step": 90 }, { "distill_loss": 0.588234543800354, "epoch": 0.030020013342228154, "step": 90 }, { "epoch": 0.030020013342228154, "ref_ce_loss": 0.2861691117286682, "step": 90 }, { "epoch": 0.030020013342228154, "loss": 1.6592103242874146, "step": 90 }, { "ce_loss": 0.5185455679893494, "epoch": 0.030020013342228154, "step": 90 }, { "distill_loss": 0.673694908618927, "epoch": 0.030020013342228154, "step": 90 }, { "epoch": 0.030020013342228154, "ref_ce_loss": 0.24703171849250793, "step": 90 }, { "epoch": 0.0333555703802535, "loss": 1.8399, "step": 100 }, { "epoch": 0.0333555703802535, "grad_norm": 7.311580657958984, "step": 100 }, { "epoch": 0.0333555703802535, "learning_rate": 4.1666666666666665e-05, "step": 100 }, { "epoch": 0.0333555703802535, "loss": 1.2659127712249756, "step": 100 }, { "ce_loss": 0.48525702953338623, "epoch": 0.0333555703802535, "step": 100 }, { "distill_loss": 0.5368037819862366, "epoch": 0.0333555703802535, "step": 100 }, { "epoch": 0.0333555703802535, "ref_ce_loss": 0.24336490035057068, "step": 100 }, { "epoch": 0.0333555703802535, "loss": 1.4343409538269043, "step": 100 }, { "ce_loss": 0.44810131192207336, "epoch": 0.0333555703802535, "step": 100 }, { "distill_loss": 0.47137510776519775, "epoch": 0.0333555703802535, "step": 100 }, { "epoch": 0.0333555703802535, "ref_ce_loss": 0.3219914138317108, "step": 100 }, { "epoch": 0.0333555703802535, "loss": 1.8354941606521606, "step": 100 }, { "ce_loss": 0.5599071383476257, "epoch": 0.0333555703802535, "step": 100 }, { "distill_loss": 0.6153331995010376, "epoch": 0.0333555703802535, "step": 100 }, { "epoch": 0.0333555703802535, "ref_ce_loss": 0.22322463989257812, "step": 100 }, { "epoch": 0.0333555703802535, "loss": 1.9053720235824585, "step": 100 }, { "ce_loss": 0.5626506805419922, "epoch": 0.0333555703802535, "step": 100 }, { "distill_loss": 0.6455965042114258, "epoch": 0.0333555703802535, "step": 100 }, { "epoch": 0.0333555703802535, "ref_ce_loss": 0.2309160977602005, "step": 100 }, { "epoch": 0.03669112741827885, "loss": 1.617, "step": 110 }, { "epoch": 0.03669112741827885, "grad_norm": 3.8720383644104004, "step": 110 }, { "epoch": 0.03669112741827885, "learning_rate": 4.5833333333333334e-05, "step": 110 }, { "epoch": 0.03669112741827885, "loss": 1.8273422718048096, "step": 110 }, { "ce_loss": 0.4972834289073944, "epoch": 0.03669112741827885, "step": 110 }, { "distill_loss": 0.5478036999702454, "epoch": 0.03669112741827885, "step": 110 }, { "epoch": 0.03669112741827885, "ref_ce_loss": 0.24270348250865936, "step": 110 }, { "epoch": 0.03669112741827885, "loss": 1.5569121837615967, "step": 110 }, { "ce_loss": 0.47104451060295105, "epoch": 0.03669112741827885, "step": 110 }, { "distill_loss": 0.5678485631942749, "epoch": 0.03669112741827885, "step": 110 }, { "epoch": 0.03669112741827885, "ref_ce_loss": 0.2063506692647934, "step": 110 }, { "epoch": 0.03669112741827885, "loss": 1.791825771331787, "step": 110 }, { "ce_loss": 0.5053634643554688, "epoch": 0.03669112741827885, "step": 110 }, { "distill_loss": 0.5453431606292725, "epoch": 0.03669112741827885, "step": 110 }, { "epoch": 0.03669112741827885, "ref_ce_loss": 0.2539502680301666, "step": 110 }, { "epoch": 0.03669112741827885, "loss": 1.4480103254318237, "step": 110 }, { "ce_loss": 0.5998255014419556, "epoch": 0.03669112741827885, "step": 110 }, { "distill_loss": 0.5709897875785828, "epoch": 0.03669112741827885, "step": 110 }, { "epoch": 0.03669112741827885, "ref_ce_loss": 0.2771782875061035, "step": 110 }, { "epoch": 0.0400266844563042, "loss": 1.7635, "step": 120 }, { "epoch": 0.0400266844563042, "grad_norm": 11.676295280456543, "step": 120 }, { "epoch": 0.0400266844563042, "learning_rate": 4.9999999999999996e-05, "step": 120 }, { "epoch": 0.0400266844563042, "loss": 1.4083495140075684, "step": 120 }, { "ce_loss": 0.5004300475120544, "epoch": 0.0400266844563042, "step": 120 }, { "distill_loss": 0.4932730197906494, "epoch": 0.0400266844563042, "step": 120 }, { "epoch": 0.0400266844563042, "ref_ce_loss": 0.23548881709575653, "step": 120 }, { "epoch": 0.0400266844563042, "loss": 1.7517530918121338, "step": 120 }, { "ce_loss": 0.5206258893013, "epoch": 0.0400266844563042, "step": 120 }, { "distill_loss": 0.5133766531944275, "epoch": 0.0400266844563042, "step": 120 }, { "epoch": 0.0400266844563042, "ref_ce_loss": 0.30411529541015625, "step": 120 }, { "epoch": 0.0400266844563042, "loss": 2.425381898880005, "step": 120 }, { "ce_loss": 0.5057188272476196, "epoch": 0.0400266844563042, "step": 120 }, { "distill_loss": 0.45857954025268555, "epoch": 0.0400266844563042, "step": 120 }, { "epoch": 0.0400266844563042, "ref_ce_loss": 0.24030372500419617, "step": 120 }, { "epoch": 0.0400266844563042, "loss": 1.2033119201660156, "step": 120 }, { "ce_loss": 0.4649990200996399, "epoch": 0.0400266844563042, "step": 120 }, { "distill_loss": 0.43947190046310425, "epoch": 0.0400266844563042, "step": 120 }, { "epoch": 0.0400266844563042, "ref_ce_loss": 0.29883599281311035, "step": 120 }, { "epoch": 0.04336224149432955, "loss": 1.6532, "step": 130 }, { "epoch": 0.04336224149432955, "grad_norm": 6.527844429016113, "step": 130 }, { "epoch": 0.04336224149432955, "learning_rate": 5.4166666666666664e-05, "step": 130 }, { "epoch": 0.04336224149432955, "loss": 1.7000669240951538, "step": 130 }, { "ce_loss": 0.46197932958602905, "epoch": 0.04336224149432955, "step": 130 }, { "distill_loss": 0.46860989928245544, "epoch": 0.04336224149432955, "step": 130 }, { "epoch": 0.04336224149432955, "ref_ce_loss": 0.18592247366905212, "step": 130 }, { "epoch": 0.04336224149432955, "loss": 1.247503638267517, "step": 130 }, { "ce_loss": 0.5011730194091797, "epoch": 0.04336224149432955, "step": 130 }, { "distill_loss": 0.5982078313827515, "epoch": 0.04336224149432955, "step": 130 }, { "epoch": 0.04336224149432955, "ref_ce_loss": 0.1478995680809021, "step": 130 }, { "epoch": 0.04336224149432955, "loss": 1.0552470684051514, "step": 130 }, { "ce_loss": 0.44217559695243835, "epoch": 0.04336224149432955, "step": 130 }, { "distill_loss": 0.41241592168807983, "epoch": 0.04336224149432955, "step": 130 }, { "epoch": 0.04336224149432955, "ref_ce_loss": 0.20050190389156342, "step": 130 }, { "epoch": 0.04336224149432955, "loss": 1.273942470550537, "step": 130 }, { "ce_loss": 0.44072940945625305, "epoch": 0.04336224149432955, "step": 130 }, { "distill_loss": 0.3735302686691284, "epoch": 0.04336224149432955, "step": 130 }, { "epoch": 0.04336224149432955, "ref_ce_loss": 0.28775811195373535, "step": 130 }, { "epoch": 0.0466977985323549, "loss": 1.6767, "step": 140 }, { "epoch": 0.0466977985323549, "grad_norm": 14.396843910217285, "step": 140 }, { "epoch": 0.0466977985323549, "learning_rate": 5.8333333333333326e-05, "step": 140 }, { "epoch": 0.0466977985323549, "loss": 3.0713770389556885, "step": 140 }, { "ce_loss": 0.4275344908237457, "epoch": 0.0466977985323549, "step": 140 }, { "distill_loss": 0.5006403923034668, "epoch": 0.0466977985323549, "step": 140 }, { "epoch": 0.0466977985323549, "ref_ce_loss": 0.26904621720314026, "step": 140 }, { "epoch": 0.0466977985323549, "loss": 2.02996826171875, "step": 140 }, { "ce_loss": 0.5514727234840393, "epoch": 0.0466977985323549, "step": 140 }, { "distill_loss": 0.5769180655479431, "epoch": 0.0466977985323549, "step": 140 }, { "epoch": 0.0466977985323549, "ref_ce_loss": 0.1895466148853302, "step": 140 }, { "epoch": 0.0466977985323549, "loss": 2.803758382797241, "step": 140 }, { "ce_loss": 0.6055456399917603, "epoch": 0.0466977985323549, "step": 140 }, { "distill_loss": 0.624858558177948, "epoch": 0.0466977985323549, "step": 140 }, { "epoch": 0.0466977985323549, "ref_ce_loss": 0.26189401745796204, "step": 140 }, { "epoch": 0.0466977985323549, "loss": 1.83541738986969, "step": 140 }, { "ce_loss": 0.5861349105834961, "epoch": 0.0466977985323549, "step": 140 }, { "distill_loss": 0.6315572261810303, "epoch": 0.0466977985323549, "step": 140 }, { "epoch": 0.0466977985323549, "ref_ce_loss": 0.24624021351337433, "step": 140 }, { "epoch": 0.05003335557038025, "loss": 1.8934, "step": 150 }, { "epoch": 0.05003335557038025, "grad_norm": 3.9040794372558594, "step": 150 }, { "epoch": 0.05003335557038025, "learning_rate": 6.25e-05, "step": 150 }, { "epoch": 0.05003335557038025, "loss": 2.2169854640960693, "step": 150 }, { "ce_loss": 0.4155034124851227, "epoch": 0.05003335557038025, "step": 150 }, { "distill_loss": 0.5503289103507996, "epoch": 0.05003335557038025, "step": 150 }, { "epoch": 0.05003335557038025, "ref_ce_loss": 0.1302298605442047, "step": 150 }, { "epoch": 0.05003335557038025, "loss": 2.067556858062744, "step": 150 }, { "ce_loss": 0.5323479771614075, "epoch": 0.05003335557038025, "step": 150 }, { "distill_loss": 0.5300868153572083, "epoch": 0.05003335557038025, "step": 150 }, { "epoch": 0.05003335557038025, "ref_ce_loss": 0.25406020879745483, "step": 150 }, { "epoch": 0.05003335557038025, "loss": 1.628650426864624, "step": 150 }, { "ce_loss": 0.5438556671142578, "epoch": 0.05003335557038025, "step": 150 }, { "distill_loss": 0.5136424899101257, "epoch": 0.05003335557038025, "step": 150 }, { "epoch": 0.05003335557038025, "ref_ce_loss": 0.25217124819755554, "step": 150 }, { "epoch": 0.05003335557038025, "loss": 1.442086100578308, "step": 150 }, { "ce_loss": 0.47215816378593445, "epoch": 0.05003335557038025, "step": 150 }, { "distill_loss": 0.47653883695602417, "epoch": 0.05003335557038025, "step": 150 }, { "epoch": 0.05003335557038025, "ref_ce_loss": 0.24462643265724182, "step": 150 }, { "epoch": 0.0533689126084056, "loss": 1.8171, "step": 160 }, { "epoch": 0.0533689126084056, "grad_norm": 6.843990325927734, "step": 160 }, { "epoch": 0.0533689126084056, "learning_rate": 6.666666666666666e-05, "step": 160 }, { "epoch": 0.0533689126084056, "loss": 1.6495225429534912, "step": 160 }, { "ce_loss": 0.5015798807144165, "epoch": 0.0533689126084056, "step": 160 }, { "distill_loss": 0.5507673025131226, "epoch": 0.0533689126084056, "step": 160 }, { "epoch": 0.0533689126084056, "ref_ce_loss": 0.21071067452430725, "step": 160 }, { "epoch": 0.0533689126084056, "loss": 1.8539624214172363, "step": 160 }, { "ce_loss": 0.5268104076385498, "epoch": 0.0533689126084056, "step": 160 }, { "distill_loss": 0.582175612449646, "epoch": 0.0533689126084056, "step": 160 }, { "epoch": 0.0533689126084056, "ref_ce_loss": 0.1892932802438736, "step": 160 }, { "epoch": 0.0533689126084056, "loss": 1.9361590147018433, "step": 160 }, { "ce_loss": 0.5014770030975342, "epoch": 0.0533689126084056, "step": 160 }, { "distill_loss": 0.5757460594177246, "epoch": 0.0533689126084056, "step": 160 }, { "epoch": 0.0533689126084056, "ref_ce_loss": 0.27097243070602417, "step": 160 }, { "epoch": 0.0533689126084056, "loss": 2.1110458374023438, "step": 160 }, { "ce_loss": 0.39067360758781433, "epoch": 0.0533689126084056, "step": 160 }, { "distill_loss": 0.4890226125717163, "epoch": 0.0533689126084056, "step": 160 }, { "epoch": 0.0533689126084056, "ref_ce_loss": 0.16871194541454315, "step": 160 }, { "epoch": 0.05670446964643095, "loss": 1.7352, "step": 170 }, { "epoch": 0.05670446964643095, "grad_norm": 11.573543548583984, "step": 170 }, { "epoch": 0.05670446964643095, "learning_rate": 7.083333333333332e-05, "step": 170 }, { "epoch": 0.05670446964643095, "loss": 1.8267295360565186, "step": 170 }, { "ce_loss": 0.5845050811767578, "epoch": 0.05670446964643095, "step": 170 }, { "distill_loss": 0.5946557521820068, "epoch": 0.05670446964643095, "step": 170 }, { "epoch": 0.05670446964643095, "ref_ce_loss": 0.3130984604358673, "step": 170 }, { "epoch": 0.05670446964643095, "loss": 1.8540599346160889, "step": 170 }, { "ce_loss": 0.44277799129486084, "epoch": 0.05670446964643095, "step": 170 }, { "distill_loss": 0.5504587888717651, "epoch": 0.05670446964643095, "step": 170 }, { "epoch": 0.05670446964643095, "ref_ce_loss": 0.18251009285449982, "step": 170 }, { "epoch": 0.05670446964643095, "loss": 1.7149605751037598, "step": 170 }, { "ce_loss": 0.4688924551010132, "epoch": 0.05670446964643095, "step": 170 }, { "distill_loss": 0.446077436208725, "epoch": 0.05670446964643095, "step": 170 }, { "epoch": 0.05670446964643095, "ref_ce_loss": 0.21932320296764374, "step": 170 }, { "epoch": 0.05670446964643095, "loss": 1.5233681201934814, "step": 170 }, { "ce_loss": 0.4129520654678345, "epoch": 0.05670446964643095, "step": 170 }, { "distill_loss": 0.5524225234985352, "epoch": 0.05670446964643095, "step": 170 }, { "epoch": 0.05670446964643095, "ref_ce_loss": 0.17084111273288727, "step": 170 }, { "epoch": 0.06004002668445631, "loss": 1.5744, "step": 180 }, { "epoch": 0.06004002668445631, "grad_norm": 7.211359977722168, "step": 180 }, { "epoch": 0.06004002668445631, "learning_rate": 7.5e-05, "step": 180 }, { "epoch": 0.06004002668445631, "loss": 1.1495260000228882, "step": 180 }, { "ce_loss": 0.4412047863006592, "epoch": 0.06004002668445631, "step": 180 }, { "distill_loss": 0.5178871750831604, "epoch": 0.06004002668445631, "step": 180 }, { "epoch": 0.06004002668445631, "ref_ce_loss": 0.19014258682727814, "step": 180 }, { "epoch": 0.06004002668445631, "loss": 1.5029988288879395, "step": 180 }, { "ce_loss": 0.5065923929214478, "epoch": 0.06004002668445631, "step": 180 }, { "distill_loss": 0.5425624251365662, "epoch": 0.06004002668445631, "step": 180 }, { "epoch": 0.06004002668445631, "ref_ce_loss": 0.2572532892227173, "step": 180 }, { "epoch": 0.06004002668445631, "loss": 1.4675511121749878, "step": 180 }, { "ce_loss": 0.44596901535987854, "epoch": 0.06004002668445631, "step": 180 }, { "distill_loss": 0.4775707423686981, "epoch": 0.06004002668445631, "step": 180 }, { "epoch": 0.06004002668445631, "ref_ce_loss": 0.22245991230010986, "step": 180 }, { "epoch": 0.06004002668445631, "loss": 2.0268988609313965, "step": 180 }, { "ce_loss": 0.5191164016723633, "epoch": 0.06004002668445631, "step": 180 }, { "distill_loss": 0.6096824407577515, "epoch": 0.06004002668445631, "step": 180 }, { "epoch": 0.06004002668445631, "ref_ce_loss": 0.24708403646945953, "step": 180 }, { "epoch": 0.06337558372248166, "loss": 1.6854, "step": 190 }, { "epoch": 0.06337558372248166, "grad_norm": 5.338657379150391, "step": 190 }, { "epoch": 0.06337558372248166, "learning_rate": 7.916666666666666e-05, "step": 190 }, { "epoch": 0.06337558372248166, "loss": 2.0079145431518555, "step": 190 }, { "ce_loss": 0.45281872153282166, "epoch": 0.06337558372248166, "step": 190 }, { "distill_loss": 0.585824191570282, "epoch": 0.06337558372248166, "step": 190 }, { "epoch": 0.06337558372248166, "ref_ce_loss": 0.18356651067733765, "step": 190 }, { "epoch": 0.06337558372248166, "loss": 2.0530364513397217, "step": 190 }, { "ce_loss": 0.5388656854629517, "epoch": 0.06337558372248166, "step": 190 }, { "distill_loss": 0.5773449540138245, "epoch": 0.06337558372248166, "step": 190 }, { "epoch": 0.06337558372248166, "ref_ce_loss": 0.23279789090156555, "step": 190 }, { "epoch": 0.06337558372248166, "loss": 1.787460207939148, "step": 190 }, { "ce_loss": 0.5597324371337891, "epoch": 0.06337558372248166, "step": 190 }, { "distill_loss": 0.6426137685775757, "epoch": 0.06337558372248166, "step": 190 }, { "epoch": 0.06337558372248166, "ref_ce_loss": 0.1847495138645172, "step": 190 }, { "epoch": 0.06337558372248166, "loss": 2.38955020904541, "step": 190 }, { "ce_loss": 0.542634129524231, "epoch": 0.06337558372248166, "step": 190 }, { "distill_loss": 0.6852626204490662, "epoch": 0.06337558372248166, "step": 190 }, { "epoch": 0.06337558372248166, "ref_ce_loss": 0.22592175006866455, "step": 190 }, { "epoch": 0.066711140760507, "loss": 1.7286, "step": 200 }, { "epoch": 0.066711140760507, "grad_norm": 4.373966693878174, "step": 200 }, { "epoch": 0.066711140760507, "learning_rate": 8.333333333333333e-05, "step": 200 }, { "epoch": 0.066711140760507, "loss": 1.9177393913269043, "step": 200 }, { "ce_loss": 0.5175270438194275, "epoch": 0.066711140760507, "step": 200 }, { "distill_loss": 0.5832851529121399, "epoch": 0.066711140760507, "step": 200 }, { "epoch": 0.066711140760507, "ref_ce_loss": 0.203318253159523, "step": 200 }, { "epoch": 0.066711140760507, "loss": 1.9105541706085205, "step": 200 }, { "ce_loss": 0.4564187526702881, "epoch": 0.066711140760507, "step": 200 }, { "distill_loss": 0.6109424829483032, "epoch": 0.066711140760507, "step": 200 }, { "epoch": 0.066711140760507, "ref_ce_loss": 0.22662192583084106, "step": 200 }, { "epoch": 0.066711140760507, "loss": 2.0437614917755127, "step": 200 }, { "ce_loss": 0.5581548810005188, "epoch": 0.066711140760507, "step": 200 }, { "distill_loss": 0.660732090473175, "epoch": 0.066711140760507, "step": 200 }, { "epoch": 0.066711140760507, "ref_ce_loss": 0.2035897672176361, "step": 200 }, { "epoch": 0.066711140760507, "loss": 1.8254846334457397, "step": 200 }, { "ce_loss": 0.49772366881370544, "epoch": 0.066711140760507, "step": 200 }, { "distill_loss": 0.6139116287231445, "epoch": 0.066711140760507, "step": 200 }, { "epoch": 0.066711140760507, "ref_ce_loss": 0.14511890709400177, "step": 200 }, { "epoch": 0.07004669779853236, "loss": 1.7987, "step": 210 }, { "epoch": 0.07004669779853236, "grad_norm": 8.68053150177002, "step": 210 }, { "epoch": 0.07004669779853236, "learning_rate": 8.75e-05, "step": 210 }, { "epoch": 0.07004669779853236, "loss": 1.711827039718628, "step": 210 }, { "ce_loss": 0.4407254457473755, "epoch": 0.07004669779853236, "step": 210 }, { "distill_loss": 0.6045986413955688, "epoch": 0.07004669779853236, "step": 210 }, { "epoch": 0.07004669779853236, "ref_ce_loss": 0.1859685778617859, "step": 210 }, { "epoch": 0.07004669779853236, "loss": 1.258393406867981, "step": 210 }, { "ce_loss": 0.396217405796051, "epoch": 0.07004669779853236, "step": 210 }, { "distill_loss": 0.53984534740448, "epoch": 0.07004669779853236, "step": 210 }, { "epoch": 0.07004669779853236, "ref_ce_loss": 0.15166202187538147, "step": 210 }, { "epoch": 0.07004669779853236, "loss": 1.666795015335083, "step": 210 }, { "ce_loss": 0.4788994789123535, "epoch": 0.07004669779853236, "step": 210 }, { "distill_loss": 0.6096857190132141, "epoch": 0.07004669779853236, "step": 210 }, { "epoch": 0.07004669779853236, "ref_ce_loss": 0.20014192163944244, "step": 210 }, { "epoch": 0.07004669779853236, "loss": 1.3152960538864136, "step": 210 }, { "ce_loss": 0.4885876178741455, "epoch": 0.07004669779853236, "step": 210 }, { "distill_loss": 0.5094537734985352, "epoch": 0.07004669779853236, "step": 210 }, { "epoch": 0.07004669779853236, "ref_ce_loss": 0.3172496259212494, "step": 210 }, { "epoch": 0.0733822548365577, "loss": 1.6134, "step": 220 }, { "epoch": 0.0733822548365577, "grad_norm": 7.969250679016113, "step": 220 }, { "epoch": 0.0733822548365577, "learning_rate": 9.166666666666667e-05, "step": 220 }, { "epoch": 0.0733822548365577, "loss": 1.9584766626358032, "step": 220 }, { "ce_loss": 0.5503972768783569, "epoch": 0.0733822548365577, "step": 220 }, { "distill_loss": 0.846336841583252, "epoch": 0.0733822548365577, "step": 220 }, { "epoch": 0.0733822548365577, "ref_ce_loss": 0.272134006023407, "step": 220 }, { "epoch": 0.0733822548365577, "loss": 3.0287160873413086, "step": 220 }, { "ce_loss": 0.48527926206588745, "epoch": 0.0733822548365577, "step": 220 }, { "distill_loss": 0.7273819446563721, "epoch": 0.0733822548365577, "step": 220 }, { "epoch": 0.0733822548365577, "ref_ce_loss": 0.24912559986114502, "step": 220 }, { "epoch": 0.0733822548365577, "loss": 2.417612314224243, "step": 220 }, { "ce_loss": 0.49760445952415466, "epoch": 0.0733822548365577, "step": 220 }, { "distill_loss": 0.7104735970497131, "epoch": 0.0733822548365577, "step": 220 }, { "epoch": 0.0733822548365577, "ref_ce_loss": 0.20668411254882812, "step": 220 }, { "epoch": 0.0733822548365577, "loss": 1.9464311599731445, "step": 220 }, { "ce_loss": 0.38983479142189026, "epoch": 0.0733822548365577, "step": 220 }, { "distill_loss": 0.5419527888298035, "epoch": 0.0733822548365577, "step": 220 }, { "epoch": 0.0733822548365577, "ref_ce_loss": 0.20586061477661133, "step": 220 }, { "epoch": 0.07671781187458306, "loss": 1.8046, "step": 230 }, { "epoch": 0.07671781187458306, "grad_norm": 4.189472198486328, "step": 230 }, { "epoch": 0.07671781187458306, "learning_rate": 9.583333333333332e-05, "step": 230 }, { "epoch": 0.07671781187458306, "loss": 1.4793825149536133, "step": 230 }, { "ce_loss": 0.5580512285232544, "epoch": 0.07671781187458306, "step": 230 }, { "distill_loss": 0.6622768044471741, "epoch": 0.07671781187458306, "step": 230 }, { "epoch": 0.07671781187458306, "ref_ce_loss": 0.2589597702026367, "step": 230 }, { "epoch": 0.07671781187458306, "loss": 1.6697819232940674, "step": 230 }, { "ce_loss": 0.4094105064868927, "epoch": 0.07671781187458306, "step": 230 }, { "distill_loss": 0.6387210488319397, "epoch": 0.07671781187458306, "step": 230 }, { "epoch": 0.07671781187458306, "ref_ce_loss": 0.17742785811424255, "step": 230 }, { "epoch": 0.07671781187458306, "loss": 2.397416830062866, "step": 230 }, { "ce_loss": 0.5211621522903442, "epoch": 0.07671781187458306, "step": 230 }, { "distill_loss": 0.6582202911376953, "epoch": 0.07671781187458306, "step": 230 }, { "epoch": 0.07671781187458306, "ref_ce_loss": 0.22356149554252625, "step": 230 }, { "epoch": 0.07671781187458306, "loss": 1.714322805404663, "step": 230 }, { "ce_loss": 0.5506291389465332, "epoch": 0.07671781187458306, "step": 230 }, { "distill_loss": 0.7077121138572693, "epoch": 0.07671781187458306, "step": 230 }, { "epoch": 0.07671781187458306, "ref_ce_loss": 0.22887969017028809, "step": 230 }, { "epoch": 0.0800533689126084, "loss": 1.6997, "step": 240 }, { "epoch": 0.0800533689126084, "grad_norm": 4.067202568054199, "step": 240 }, { "epoch": 0.0800533689126084, "learning_rate": 9.999999999999999e-05, "step": 240 }, { "epoch": 0.0800533689126084, "loss": 1.3863872289657593, "step": 240 }, { "ce_loss": 0.40065205097198486, "epoch": 0.0800533689126084, "step": 240 }, { "distill_loss": 0.5652359127998352, "epoch": 0.0800533689126084, "step": 240 }, { "epoch": 0.0800533689126084, "ref_ce_loss": 0.17888511717319489, "step": 240 }, { "epoch": 0.0800533689126084, "loss": 1.5082571506500244, "step": 240 }, { "ce_loss": 0.4735325872898102, "epoch": 0.0800533689126084, "step": 240 }, { "distill_loss": 0.6025104522705078, "epoch": 0.0800533689126084, "step": 240 }, { "epoch": 0.0800533689126084, "ref_ce_loss": 0.20674863457679749, "step": 240 }, { "epoch": 0.0800533689126084, "loss": 1.3987960815429688, "step": 240 }, { "ce_loss": 0.3731807470321655, "epoch": 0.0800533689126084, "step": 240 }, { "distill_loss": 0.5945234894752502, "epoch": 0.0800533689126084, "step": 240 }, { "epoch": 0.0800533689126084, "ref_ce_loss": 0.1682978868484497, "step": 240 }, { "epoch": 0.0800533689126084, "loss": 1.8205552101135254, "step": 240 }, { "ce_loss": 0.3344307839870453, "epoch": 0.0800533689126084, "step": 240 }, { "distill_loss": 0.6057762503623962, "epoch": 0.0800533689126084, "step": 240 }, { "epoch": 0.0800533689126084, "ref_ce_loss": 0.1428370624780655, "step": 240 }, { "epoch": 0.08338892595063375, "loss": 1.702, "step": 250 }, { "epoch": 0.08338892595063375, "grad_norm": 3.4190220832824707, "step": 250 }, { "epoch": 0.08338892595063375, "learning_rate": 0.00010416666666666666, "step": 250 }, { "epoch": 0.08338892595063375, "loss": 2.404367446899414, "step": 250 }, { "ce_loss": 0.5212981700897217, "epoch": 0.08338892595063375, "step": 250 }, { "distill_loss": 0.6047353148460388, "epoch": 0.08338892595063375, "step": 250 }, { "epoch": 0.08338892595063375, "ref_ce_loss": 0.18801236152648926, "step": 250 }, { "epoch": 0.08338892595063375, "loss": 1.6255981922149658, "step": 250 }, { "ce_loss": 0.3867962658405304, "epoch": 0.08338892595063375, "step": 250 }, { "distill_loss": 0.5201380848884583, "epoch": 0.08338892595063375, "step": 250 }, { "epoch": 0.08338892595063375, "ref_ce_loss": 0.19661235809326172, "step": 250 }, { "epoch": 0.08338892595063375, "loss": 1.7828060388565063, "step": 250 }, { "ce_loss": 0.4401053786277771, "epoch": 0.08338892595063375, "step": 250 }, { "distill_loss": 0.5746744871139526, "epoch": 0.08338892595063375, "step": 250 }, { "epoch": 0.08338892595063375, "ref_ce_loss": 0.17614340782165527, "step": 250 }, { "epoch": 0.08338892595063375, "loss": 1.0926663875579834, "step": 250 }, { "ce_loss": 0.363411545753479, "epoch": 0.08338892595063375, "step": 250 }, { "distill_loss": 0.44517719745635986, "epoch": 0.08338892595063375, "step": 250 }, { "epoch": 0.08338892595063375, "ref_ce_loss": 0.27430668473243713, "step": 250 }, { "epoch": 0.0867244829886591, "loss": 1.5861, "step": 260 }, { "epoch": 0.0867244829886591, "grad_norm": 4.730551719665527, "step": 260 }, { "epoch": 0.0867244829886591, "learning_rate": 0.00010833333333333333, "step": 260 }, { "epoch": 0.0867244829886591, "loss": 2.1394453048706055, "step": 260 }, { "ce_loss": 0.48058784008026123, "epoch": 0.0867244829886591, "step": 260 }, { "distill_loss": 0.5192797780036926, "epoch": 0.0867244829886591, "step": 260 }, { "epoch": 0.0867244829886591, "ref_ce_loss": 0.1934836208820343, "step": 260 }, { "epoch": 0.0867244829886591, "loss": 1.4074976444244385, "step": 260 }, { "ce_loss": 0.4552706182003021, "epoch": 0.0867244829886591, "step": 260 }, { "distill_loss": 0.48851698637008667, "epoch": 0.0867244829886591, "step": 260 }, { "epoch": 0.0867244829886591, "ref_ce_loss": 0.15941733121871948, "step": 260 }, { "epoch": 0.0867244829886591, "loss": 1.5271190404891968, "step": 260 }, { "ce_loss": 0.4061850905418396, "epoch": 0.0867244829886591, "step": 260 }, { "distill_loss": 0.49891141057014465, "epoch": 0.0867244829886591, "step": 260 }, { "epoch": 0.0867244829886591, "ref_ce_loss": 0.23973435163497925, "step": 260 }, { "epoch": 0.0867244829886591, "loss": 1.4721564054489136, "step": 260 }, { "ce_loss": 0.4186480939388275, "epoch": 0.0867244829886591, "step": 260 }, { "distill_loss": 0.46537312865257263, "epoch": 0.0867244829886591, "step": 260 }, { "epoch": 0.0867244829886591, "ref_ce_loss": 0.22249318659305573, "step": 260 }, { "epoch": 0.09006004002668445, "loss": 1.598, "step": 270 }, { "epoch": 0.09006004002668445, "grad_norm": 6.989984035491943, "step": 270 }, { "epoch": 0.09006004002668445, "learning_rate": 0.0001125, "step": 270 }, { "epoch": 0.09006004002668445, "loss": 1.7460898160934448, "step": 270 }, { "ce_loss": 0.5612281560897827, "epoch": 0.09006004002668445, "step": 270 }, { "distill_loss": 0.7256549596786499, "epoch": 0.09006004002668445, "step": 270 }, { "epoch": 0.09006004002668445, "ref_ce_loss": 0.15846964716911316, "step": 270 }, { "epoch": 0.09006004002668445, "loss": 1.6589117050170898, "step": 270 }, { "ce_loss": 0.3386031985282898, "epoch": 0.09006004002668445, "step": 270 }, { "distill_loss": 0.5948140621185303, "epoch": 0.09006004002668445, "step": 270 }, { "epoch": 0.09006004002668445, "ref_ce_loss": 0.1583528220653534, "step": 270 }, { "epoch": 0.09006004002668445, "loss": 1.8619314432144165, "step": 270 }, { "ce_loss": 0.4433027505874634, "epoch": 0.09006004002668445, "step": 270 }, { "distill_loss": 0.6727502346038818, "epoch": 0.09006004002668445, "step": 270 }, { "epoch": 0.09006004002668445, "ref_ce_loss": 0.19358614087104797, "step": 270 }, { "epoch": 0.09006004002668445, "loss": 1.4343819618225098, "step": 270 }, { "ce_loss": 0.31405356526374817, "epoch": 0.09006004002668445, "step": 270 }, { "distill_loss": 0.5503586530685425, "epoch": 0.09006004002668445, "step": 270 }, { "epoch": 0.09006004002668445, "ref_ce_loss": 0.16053526103496552, "step": 270 }, { "epoch": 0.0933955970647098, "loss": 1.5236, "step": 280 }, { "epoch": 0.0933955970647098, "grad_norm": 2.97304368019104, "step": 280 }, { "epoch": 0.0933955970647098, "learning_rate": 0.00011666666666666665, "step": 280 }, { "epoch": 0.0933955970647098, "loss": 1.3160141706466675, "step": 280 }, { "ce_loss": 0.4134749174118042, "epoch": 0.0933955970647098, "step": 280 }, { "distill_loss": 0.5966621041297913, "epoch": 0.0933955970647098, "step": 280 }, { "epoch": 0.0933955970647098, "ref_ce_loss": 0.13658492267131805, "step": 280 }, { "epoch": 0.0933955970647098, "loss": 1.6723679304122925, "step": 280 }, { "ce_loss": 0.5664156675338745, "epoch": 0.0933955970647098, "step": 280 }, { "distill_loss": 0.6793075203895569, "epoch": 0.0933955970647098, "step": 280 }, { "epoch": 0.0933955970647098, "ref_ce_loss": 0.2132052481174469, "step": 280 }, { "epoch": 0.0933955970647098, "loss": 1.7364765405654907, "step": 280 }, { "ce_loss": 0.3761884570121765, "epoch": 0.0933955970647098, "step": 280 }, { "distill_loss": 0.6366319060325623, "epoch": 0.0933955970647098, "step": 280 }, { "epoch": 0.0933955970647098, "ref_ce_loss": 0.19218598306179047, "step": 280 }, { "epoch": 0.0933955970647098, "loss": 1.455072283744812, "step": 280 }, { "ce_loss": 0.402457594871521, "epoch": 0.0933955970647098, "step": 280 }, { "distill_loss": 0.6173094511032104, "epoch": 0.0933955970647098, "step": 280 }, { "epoch": 0.0933955970647098, "ref_ce_loss": 0.18485473096370697, "step": 280 }, { "epoch": 0.09673115410273515, "loss": 1.5508, "step": 290 }, { "epoch": 0.09673115410273515, "grad_norm": 6.759135723114014, "step": 290 }, { "epoch": 0.09673115410273515, "learning_rate": 0.00012083333333333332, "step": 290 }, { "epoch": 0.09673115410273515, "loss": 2.49653959274292, "step": 290 }, { "ce_loss": 0.5873475074768066, "epoch": 0.09673115410273515, "step": 290 }, { "distill_loss": 0.47205284237861633, "epoch": 0.09673115410273515, "step": 290 }, { "epoch": 0.09673115410273515, "ref_ce_loss": 0.2220914363861084, "step": 290 }, { "epoch": 0.09673115410273515, "loss": 1.023373007774353, "step": 290 }, { "ce_loss": 0.42132118344306946, "epoch": 0.09673115410273515, "step": 290 }, { "distill_loss": 0.48384732007980347, "epoch": 0.09673115410273515, "step": 290 }, { "epoch": 0.09673115410273515, "ref_ce_loss": 0.11812644451856613, "step": 290 }, { "epoch": 0.09673115410273515, "loss": 1.0123854875564575, "step": 290 }, { "ce_loss": 0.32844048738479614, "epoch": 0.09673115410273515, "step": 290 }, { "distill_loss": 0.3588325083255768, "epoch": 0.09673115410273515, "step": 290 }, { "epoch": 0.09673115410273515, "ref_ce_loss": 0.18135057389736176, "step": 290 }, { "epoch": 0.09673115410273515, "loss": 1.6200289726257324, "step": 290 }, { "ce_loss": 0.4336802065372467, "epoch": 0.09673115410273515, "step": 290 }, { "distill_loss": 0.517190158367157, "epoch": 0.09673115410273515, "step": 290 }, { "epoch": 0.09673115410273515, "ref_ce_loss": 0.18303078413009644, "step": 290 }, { "epoch": 0.1000667111407605, "loss": 1.6154, "step": 300 }, { "epoch": 0.1000667111407605, "grad_norm": 4.066988468170166, "step": 300 }, { "epoch": 0.1000667111407605, "learning_rate": 0.000125, "step": 300 }, { "epoch": 0.1000667111407605, "loss": 1.648413062095642, "step": 300 }, { "ce_loss": 0.3631472587585449, "epoch": 0.1000667111407605, "step": 300 }, { "distill_loss": 0.32697027921676636, "epoch": 0.1000667111407605, "step": 300 }, { "epoch": 0.1000667111407605, "ref_ce_loss": 0.2214168757200241, "step": 300 }, { "epoch": 0.1000667111407605, "loss": 1.8053052425384521, "step": 300 }, { "ce_loss": 0.47860148549079895, "epoch": 0.1000667111407605, "step": 300 }, { "distill_loss": 0.43630748987197876, "epoch": 0.1000667111407605, "step": 300 }, { "epoch": 0.1000667111407605, "ref_ce_loss": 0.20420612394809723, "step": 300 }, { "epoch": 0.1000667111407605, "loss": 0.9513359665870667, "step": 300 }, { "ce_loss": 0.39622437953948975, "epoch": 0.1000667111407605, "step": 300 }, { "distill_loss": 0.365519642829895, "epoch": 0.1000667111407605, "step": 300 }, { "epoch": 0.1000667111407605, "ref_ce_loss": 0.18928639590740204, "step": 300 }, { "epoch": 0.1000667111407605, "loss": 1.047836184501648, "step": 300 }, { "ce_loss": 0.44009682536125183, "epoch": 0.1000667111407605, "step": 300 }, { "distill_loss": 0.3900669515132904, "epoch": 0.1000667111407605, "step": 300 }, { "epoch": 0.1000667111407605, "ref_ce_loss": 0.2174147367477417, "step": 300 }, { "epoch": 0.10340226817878585, "loss": 1.5139, "step": 310 }, { "epoch": 0.10340226817878585, "grad_norm": 2.1779675483703613, "step": 310 }, { "epoch": 0.10340226817878585, "learning_rate": 0.00012916666666666667, "step": 310 }, { "epoch": 0.10340226817878585, "loss": 1.72920823097229, "step": 310 }, { "ce_loss": 0.40507739782333374, "epoch": 0.10340226817878585, "step": 310 }, { "distill_loss": 0.4770023822784424, "epoch": 0.10340226817878585, "step": 310 }, { "epoch": 0.10340226817878585, "ref_ce_loss": 0.19400697946548462, "step": 310 }, { "epoch": 0.10340226817878585, "loss": 1.1937280893325806, "step": 310 }, { "ce_loss": 0.3553615212440491, "epoch": 0.10340226817878585, "step": 310 }, { "distill_loss": 0.4500002861022949, "epoch": 0.10340226817878585, "step": 310 }, { "epoch": 0.10340226817878585, "ref_ce_loss": 0.1656891256570816, "step": 310 }, { "epoch": 0.10340226817878585, "loss": 1.708717703819275, "step": 310 }, { "ce_loss": 0.36175283789634705, "epoch": 0.10340226817878585, "step": 310 }, { "distill_loss": 0.5048600435256958, "epoch": 0.10340226817878585, "step": 310 }, { "epoch": 0.10340226817878585, "ref_ce_loss": 0.13430316746234894, "step": 310 }, { "epoch": 0.10340226817878585, "loss": 1.4646344184875488, "step": 310 }, { "ce_loss": 0.535317599773407, "epoch": 0.10340226817878585, "step": 310 }, { "distill_loss": 0.4663164019584656, "epoch": 0.10340226817878585, "step": 310 }, { "epoch": 0.10340226817878585, "ref_ce_loss": 0.2614893615245819, "step": 310 }, { "epoch": 0.1067378252168112, "loss": 1.5879, "step": 320 }, { "epoch": 0.1067378252168112, "grad_norm": 3.444401979446411, "step": 320 }, { "epoch": 0.1067378252168112, "learning_rate": 0.0001333333333333333, "step": 320 }, { "epoch": 0.1067378252168112, "loss": 1.3173494338989258, "step": 320 }, { "ce_loss": 0.4536479711532593, "epoch": 0.1067378252168112, "step": 320 }, { "distill_loss": 0.38794639706611633, "epoch": 0.1067378252168112, "step": 320 }, { "epoch": 0.1067378252168112, "ref_ce_loss": 0.20736144483089447, "step": 320 }, { "epoch": 0.1067378252168112, "loss": 1.2465583086013794, "step": 320 }, { "ce_loss": 0.4199206531047821, "epoch": 0.1067378252168112, "step": 320 }, { "distill_loss": 0.4158381223678589, "epoch": 0.1067378252168112, "step": 320 }, { "epoch": 0.1067378252168112, "ref_ce_loss": 0.15094277262687683, "step": 320 }, { "epoch": 0.1067378252168112, "loss": 1.271466612815857, "step": 320 }, { "ce_loss": 0.3549860119819641, "epoch": 0.1067378252168112, "step": 320 }, { "distill_loss": 0.2894030511379242, "epoch": 0.1067378252168112, "step": 320 }, { "epoch": 0.1067378252168112, "ref_ce_loss": 0.24361640214920044, "step": 320 }, { "epoch": 0.1067378252168112, "loss": 1.6522722244262695, "step": 320 }, { "ce_loss": 0.41289564967155457, "epoch": 0.1067378252168112, "step": 320 }, { "distill_loss": 0.4370846152305603, "epoch": 0.1067378252168112, "step": 320 }, { "epoch": 0.1067378252168112, "ref_ce_loss": 0.222516730427742, "step": 320 }, { "epoch": 0.11007338225483655, "loss": 1.4188, "step": 330 }, { "epoch": 0.11007338225483655, "grad_norm": 4.895388126373291, "step": 330 }, { "epoch": 0.11007338225483655, "learning_rate": 0.00013749999999999998, "step": 330 }, { "epoch": 0.11007338225483655, "loss": 1.594529628753662, "step": 330 }, { "ce_loss": 0.378388375043869, "epoch": 0.11007338225483655, "step": 330 }, { "distill_loss": 0.32350635528564453, "epoch": 0.11007338225483655, "step": 330 }, { "epoch": 0.11007338225483655, "ref_ce_loss": 0.19054223597049713, "step": 330 }, { "epoch": 0.11007338225483655, "loss": 1.8214046955108643, "step": 330 }, { "ce_loss": 0.4791013300418854, "epoch": 0.11007338225483655, "step": 330 }, { "distill_loss": 0.3267383873462677, "epoch": 0.11007338225483655, "step": 330 }, { "epoch": 0.11007338225483655, "ref_ce_loss": 0.23376412689685822, "step": 330 }, { "epoch": 0.11007338225483655, "loss": 0.8836880326271057, "step": 330 }, { "ce_loss": 0.3631823658943176, "epoch": 0.11007338225483655, "step": 330 }, { "distill_loss": 0.3676852881908417, "epoch": 0.11007338225483655, "step": 330 }, { "epoch": 0.11007338225483655, "ref_ce_loss": 0.1527552306652069, "step": 330 }, { "epoch": 0.11007338225483655, "loss": 1.3370083570480347, "step": 330 }, { "ce_loss": 0.4256639778614044, "epoch": 0.11007338225483655, "step": 330 }, { "distill_loss": 0.41985267400741577, "epoch": 0.11007338225483655, "step": 330 }, { "epoch": 0.11007338225483655, "ref_ce_loss": 0.1425405740737915, "step": 330 }, { "epoch": 0.1134089392928619, "loss": 1.4184, "step": 340 }, { "epoch": 0.1134089392928619, "grad_norm": 6.696072578430176, "step": 340 }, { "epoch": 0.1134089392928619, "learning_rate": 0.00014166666666666665, "step": 340 }, { "epoch": 0.1134089392928619, "loss": 1.4294278621673584, "step": 340 }, { "ce_loss": 0.4337483048439026, "epoch": 0.1134089392928619, "step": 340 }, { "distill_loss": 0.42644885182380676, "epoch": 0.1134089392928619, "step": 340 }, { "epoch": 0.1134089392928619, "ref_ce_loss": 0.14427414536476135, "step": 340 }, { "epoch": 0.1134089392928619, "loss": 1.463660717010498, "step": 340 }, { "ce_loss": 0.5838468074798584, "epoch": 0.1134089392928619, "step": 340 }, { "distill_loss": 0.46177640557289124, "epoch": 0.1134089392928619, "step": 340 }, { "epoch": 0.1134089392928619, "ref_ce_loss": 0.2553325593471527, "step": 340 }, { "epoch": 0.1134089392928619, "loss": 1.427936315536499, "step": 340 }, { "ce_loss": 0.48335981369018555, "epoch": 0.1134089392928619, "step": 340 }, { "distill_loss": 0.4261768162250519, "epoch": 0.1134089392928619, "step": 340 }, { "epoch": 0.1134089392928619, "ref_ce_loss": 0.24112361669540405, "step": 340 }, { "epoch": 0.1134089392928619, "loss": 1.4521446228027344, "step": 340 }, { "ce_loss": 0.4573907256126404, "epoch": 0.1134089392928619, "step": 340 }, { "distill_loss": 0.39685431122779846, "epoch": 0.1134089392928619, "step": 340 }, { "epoch": 0.1134089392928619, "ref_ce_loss": 0.16895025968551636, "step": 340 }, { "epoch": 0.11674449633088725, "loss": 1.4988, "step": 350 }, { "epoch": 0.11674449633088725, "grad_norm": 4.563868045806885, "step": 350 }, { "epoch": 0.11674449633088725, "learning_rate": 0.00014583333333333332, "step": 350 }, { "epoch": 0.11674449633088725, "loss": 1.0791542530059814, "step": 350 }, { "ce_loss": 0.35604622960090637, "epoch": 0.11674449633088725, "step": 350 }, { "distill_loss": 0.3385554552078247, "epoch": 0.11674449633088725, "step": 350 }, { "epoch": 0.11674449633088725, "ref_ce_loss": 0.1298314332962036, "step": 350 }, { "epoch": 0.11674449633088725, "loss": 1.874572515487671, "step": 350 }, { "ce_loss": 0.5706193447113037, "epoch": 0.11674449633088725, "step": 350 }, { "distill_loss": 0.462056040763855, "epoch": 0.11674449633088725, "step": 350 }, { "epoch": 0.11674449633088725, "ref_ce_loss": 0.18519967794418335, "step": 350 }, { "epoch": 0.11674449633088725, "loss": 1.3227744102478027, "step": 350 }, { "ce_loss": 0.4325740933418274, "epoch": 0.11674449633088725, "step": 350 }, { "distill_loss": 0.4918447732925415, "epoch": 0.11674449633088725, "step": 350 }, { "epoch": 0.11674449633088725, "ref_ce_loss": 0.18212299048900604, "step": 350 }, { "epoch": 0.11674449633088725, "loss": 1.2638686895370483, "step": 350 }, { "ce_loss": 0.4000357687473297, "epoch": 0.11674449633088725, "step": 350 }, { "distill_loss": 0.38934990763664246, "epoch": 0.11674449633088725, "step": 350 }, { "epoch": 0.11674449633088725, "ref_ce_loss": 0.16199086606502533, "step": 350 }, { "epoch": 0.12008005336891261, "loss": 1.4421, "step": 360 }, { "epoch": 0.12008005336891261, "grad_norm": 3.7695729732513428, "step": 360 }, { "epoch": 0.12008005336891261, "learning_rate": 0.00015, "step": 360 }, { "epoch": 0.12008005336891261, "loss": 1.2791365385055542, "step": 360 }, { "ce_loss": 0.4072173535823822, "epoch": 0.12008005336891261, "step": 360 }, { "distill_loss": 0.5058611631393433, "epoch": 0.12008005336891261, "step": 360 }, { "epoch": 0.12008005336891261, "ref_ce_loss": 0.17743639647960663, "step": 360 }, { "epoch": 0.12008005336891261, "loss": 1.5815454721450806, "step": 360 }, { "ce_loss": 0.48840242624282837, "epoch": 0.12008005336891261, "step": 360 }, { "distill_loss": 0.546277642250061, "epoch": 0.12008005336891261, "step": 360 }, { "epoch": 0.12008005336891261, "ref_ce_loss": 0.26060250401496887, "step": 360 }, { "epoch": 0.12008005336891261, "loss": 1.5207912921905518, "step": 360 }, { "ce_loss": 0.352418452501297, "epoch": 0.12008005336891261, "step": 360 }, { "distill_loss": 0.5164685845375061, "epoch": 0.12008005336891261, "step": 360 }, { "epoch": 0.12008005336891261, "ref_ce_loss": 0.1214674562215805, "step": 360 }, { "epoch": 0.12008005336891261, "loss": 1.307206153869629, "step": 360 }, { "ce_loss": 0.3980710804462433, "epoch": 0.12008005336891261, "step": 360 }, { "distill_loss": 0.5026689171791077, "epoch": 0.12008005336891261, "step": 360 }, { "epoch": 0.12008005336891261, "ref_ce_loss": 0.1789749711751938, "step": 360 }, { "epoch": 0.12341561040693796, "loss": 1.2719, "step": 370 }, { "epoch": 0.12341561040693796, "grad_norm": 2.6082639694213867, "step": 370 }, { "epoch": 0.12341561040693796, "learning_rate": 0.00015416666666666663, "step": 370 }, { "epoch": 0.12341561040693796, "loss": 1.9527366161346436, "step": 370 }, { "ce_loss": 0.4247026741504669, "epoch": 0.12341561040693796, "step": 370 }, { "distill_loss": 0.44470375776290894, "epoch": 0.12341561040693796, "step": 370 }, { "epoch": 0.12341561040693796, "ref_ce_loss": 0.23504160344600677, "step": 370 }, { "epoch": 0.12341561040693796, "loss": 1.3341894149780273, "step": 370 }, { "ce_loss": 0.47517120838165283, "epoch": 0.12341561040693796, "step": 370 }, { "distill_loss": 0.4574129581451416, "epoch": 0.12341561040693796, "step": 370 }, { "epoch": 0.12341561040693796, "ref_ce_loss": 0.24176302552223206, "step": 370 }, { "epoch": 0.12341561040693796, "loss": 1.0476963520050049, "step": 370 }, { "ce_loss": 0.32721778750419617, "epoch": 0.12341561040693796, "step": 370 }, { "distill_loss": 0.44194135069847107, "epoch": 0.12341561040693796, "step": 370 }, { "epoch": 0.12341561040693796, "ref_ce_loss": 0.10548078268766403, "step": 370 }, { "epoch": 0.12341561040693796, "loss": 1.5318841934204102, "step": 370 }, { "ce_loss": 0.38235610723495483, "epoch": 0.12341561040693796, "step": 370 }, { "distill_loss": 0.4024381935596466, "epoch": 0.12341561040693796, "step": 370 }, { "epoch": 0.12341561040693796, "ref_ce_loss": 0.2406122088432312, "step": 370 }, { "epoch": 0.12675116744496331, "loss": 1.3198, "step": 380 }, { "epoch": 0.12675116744496331, "grad_norm": 3.5721678733825684, "step": 380 }, { "epoch": 0.12675116744496331, "learning_rate": 0.00015833333333333332, "step": 380 }, { "epoch": 0.12675116744496331, "loss": 1.3019144535064697, "step": 380 }, { "ce_loss": 0.47807297110557556, "epoch": 0.12675116744496331, "step": 380 }, { "distill_loss": 0.3548854887485504, "epoch": 0.12675116744496331, "step": 380 }, { "epoch": 0.12675116744496331, "ref_ce_loss": 0.2511584162712097, "step": 380 }, { "epoch": 0.12675116744496331, "loss": 0.7955793142318726, "step": 380 }, { "ce_loss": 0.31110042333602905, "epoch": 0.12675116744496331, "step": 380 }, { "distill_loss": 0.2900923788547516, "epoch": 0.12675116744496331, "step": 380 }, { "epoch": 0.12675116744496331, "ref_ce_loss": 0.19429923593997955, "step": 380 }, { "epoch": 0.12675116744496331, "loss": 1.0563925504684448, "step": 380 }, { "ce_loss": 0.4805901348590851, "epoch": 0.12675116744496331, "step": 380 }, { "distill_loss": 0.3224341571331024, "epoch": 0.12675116744496331, "step": 380 }, { "epoch": 0.12675116744496331, "ref_ce_loss": 0.25321510434150696, "step": 380 }, { "epoch": 0.12675116744496331, "loss": 1.2074453830718994, "step": 380 }, { "ce_loss": 0.39232152700424194, "epoch": 0.12675116744496331, "step": 380 }, { "distill_loss": 0.3258730173110962, "epoch": 0.12675116744496331, "step": 380 }, { "epoch": 0.12675116744496331, "ref_ce_loss": 0.2012324184179306, "step": 380 }, { "epoch": 0.13008672448298866, "loss": 1.256, "step": 390 }, { "epoch": 0.13008672448298866, "grad_norm": 5.881503105163574, "step": 390 }, { "epoch": 0.13008672448298866, "learning_rate": 0.00016249999999999997, "step": 390 }, { "epoch": 0.13008672448298866, "loss": 1.3629982471466064, "step": 390 }, { "ce_loss": 0.36652955412864685, "epoch": 0.13008672448298866, "step": 390 }, { "distill_loss": 0.29035764932632446, "epoch": 0.13008672448298866, "step": 390 }, { "epoch": 0.13008672448298866, "ref_ce_loss": 0.22455023229122162, "step": 390 }, { "epoch": 0.13008672448298866, "loss": 0.9247293472290039, "step": 390 }, { "ce_loss": 0.4429352283477783, "epoch": 0.13008672448298866, "step": 390 }, { "distill_loss": 0.29146555066108704, "epoch": 0.13008672448298866, "step": 390 }, { "epoch": 0.13008672448298866, "ref_ce_loss": 0.19032831490039825, "step": 390 }, { "epoch": 0.13008672448298866, "loss": 1.3504780530929565, "step": 390 }, { "ce_loss": 0.3869108557701111, "epoch": 0.13008672448298866, "step": 390 }, { "distill_loss": 0.2632961869239807, "epoch": 0.13008672448298866, "step": 390 }, { "epoch": 0.13008672448298866, "ref_ce_loss": 0.1716235727071762, "step": 390 }, { "epoch": 0.13008672448298866, "loss": 0.904478907585144, "step": 390 }, { "ce_loss": 0.3317762017250061, "epoch": 0.13008672448298866, "step": 390 }, { "distill_loss": 0.2193835973739624, "epoch": 0.13008672448298866, "step": 390 }, { "epoch": 0.13008672448298866, "ref_ce_loss": 0.1852221041917801, "step": 390 }, { "epoch": 0.133422281521014, "loss": 1.3014, "step": 400 }, { "epoch": 0.133422281521014, "grad_norm": 4.005307674407959, "step": 400 }, { "epoch": 0.133422281521014, "learning_rate": 0.00016666666666666666, "step": 400 }, { "epoch": 0.133422281521014, "loss": 1.7505016326904297, "step": 400 }, { "ce_loss": 0.44826963543891907, "epoch": 0.133422281521014, "step": 400 }, { "distill_loss": 0.23693206906318665, "epoch": 0.133422281521014, "step": 400 }, { "epoch": 0.133422281521014, "ref_ce_loss": 0.26399722695350647, "step": 400 }, { "epoch": 0.133422281521014, "loss": 0.8870535492897034, "step": 400 }, { "ce_loss": 0.3827820122241974, "epoch": 0.133422281521014, "step": 400 }, { "distill_loss": 0.20150505006313324, "epoch": 0.133422281521014, "step": 400 }, { "epoch": 0.133422281521014, "ref_ce_loss": 0.15887323021888733, "step": 400 }, { "epoch": 0.133422281521014, "loss": 1.160254955291748, "step": 400 }, { "ce_loss": 0.37770774960517883, "epoch": 0.133422281521014, "step": 400 }, { "distill_loss": 0.24424059689044952, "epoch": 0.133422281521014, "step": 400 }, { "epoch": 0.133422281521014, "ref_ce_loss": 0.2287183552980423, "step": 400 }, { "epoch": 0.133422281521014, "loss": 1.241231083869934, "step": 400 }, { "ce_loss": 0.4069371819496155, "epoch": 0.133422281521014, "step": 400 }, { "distill_loss": 0.22539016604423523, "epoch": 0.133422281521014, "step": 400 }, { "epoch": 0.133422281521014, "ref_ce_loss": 0.21300897002220154, "step": 400 }, { "epoch": 0.13675783855903936, "loss": 1.3101, "step": 410 }, { "epoch": 0.13675783855903936, "grad_norm": 5.411527156829834, "step": 410 }, { "epoch": 0.13675783855903936, "learning_rate": 0.0001708333333333333, "step": 410 }, { "epoch": 0.13675783855903936, "loss": 1.7810142040252686, "step": 410 }, { "ce_loss": 0.45244982838630676, "epoch": 0.13675783855903936, "step": 410 }, { "distill_loss": 0.2896427810192108, "epoch": 0.13675783855903936, "step": 410 }, { "epoch": 0.13675783855903936, "ref_ce_loss": 0.1715669482946396, "step": 410 }, { "epoch": 0.13675783855903936, "loss": 1.1003687381744385, "step": 410 }, { "ce_loss": 0.3501242399215698, "epoch": 0.13675783855903936, "step": 410 }, { "distill_loss": 0.26800230145454407, "epoch": 0.13675783855903936, "step": 410 }, { "epoch": 0.13675783855903936, "ref_ce_loss": 0.14147433638572693, "step": 410 }, { "epoch": 0.13675783855903936, "loss": 1.1758261919021606, "step": 410 }, { "ce_loss": 0.4695630371570587, "epoch": 0.13675783855903936, "step": 410 }, { "distill_loss": 0.3363664746284485, "epoch": 0.13675783855903936, "step": 410 }, { "epoch": 0.13675783855903936, "ref_ce_loss": 0.1658724695444107, "step": 410 }, { "epoch": 0.13675783855903936, "loss": 1.3013876676559448, "step": 410 }, { "ce_loss": 0.4676929712295532, "epoch": 0.13675783855903936, "step": 410 }, { "distill_loss": 0.2978712022304535, "epoch": 0.13675783855903936, "step": 410 }, { "epoch": 0.13675783855903936, "ref_ce_loss": 0.18323028087615967, "step": 410 }, { "epoch": 0.1400933955970647, "loss": 1.1958, "step": 420 }, { "epoch": 0.1400933955970647, "grad_norm": 4.778338432312012, "step": 420 }, { "epoch": 0.1400933955970647, "learning_rate": 0.000175, "step": 420 }, { "epoch": 0.1400933955970647, "loss": 0.9920318722724915, "step": 420 }, { "ce_loss": 0.5251628160476685, "epoch": 0.1400933955970647, "step": 420 }, { "distill_loss": 0.2395399510860443, "epoch": 0.1400933955970647, "step": 420 }, { "epoch": 0.1400933955970647, "ref_ce_loss": 0.22730065882205963, "step": 420 }, { "epoch": 0.1400933955970647, "loss": 1.2365477085113525, "step": 420 }, { "ce_loss": 0.4431440532207489, "epoch": 0.1400933955970647, "step": 420 }, { "distill_loss": 0.2159503847360611, "epoch": 0.1400933955970647, "step": 420 }, { "epoch": 0.1400933955970647, "ref_ce_loss": 0.24663586914539337, "step": 420 }, { "epoch": 0.1400933955970647, "loss": 0.8285670876502991, "step": 420 }, { "ce_loss": 0.42659738659858704, "epoch": 0.1400933955970647, "step": 420 }, { "distill_loss": 0.20237162709236145, "epoch": 0.1400933955970647, "step": 420 }, { "epoch": 0.1400933955970647, "ref_ce_loss": 0.19959665834903717, "step": 420 }, { "epoch": 0.1400933955970647, "loss": 1.406829833984375, "step": 420 }, { "ce_loss": 0.3699018955230713, "epoch": 0.1400933955970647, "step": 420 }, { "distill_loss": 0.19302500784397125, "epoch": 0.1400933955970647, "step": 420 }, { "epoch": 0.1400933955970647, "ref_ce_loss": 0.21495334804058075, "step": 420 }, { "epoch": 0.14342895263509006, "loss": 1.2285, "step": 430 }, { "epoch": 0.14342895263509006, "grad_norm": 8.007396697998047, "step": 430 }, { "epoch": 0.14342895263509006, "learning_rate": 0.00017916666666666664, "step": 430 }, { "epoch": 0.14342895263509006, "loss": 1.0800083875656128, "step": 430 }, { "ce_loss": 0.34309080243110657, "epoch": 0.14342895263509006, "step": 430 }, { "distill_loss": 0.17490103840827942, "epoch": 0.14342895263509006, "step": 430 }, { "epoch": 0.14342895263509006, "ref_ce_loss": 0.1360379010438919, "step": 430 }, { "epoch": 0.14342895263509006, "loss": 1.1706424951553345, "step": 430 }, { "ce_loss": 0.4648958742618561, "epoch": 0.14342895263509006, "step": 430 }, { "distill_loss": 0.21292121708393097, "epoch": 0.14342895263509006, "step": 430 }, { "epoch": 0.14342895263509006, "ref_ce_loss": 0.23670294880867004, "step": 430 }, { "epoch": 0.14342895263509006, "loss": 1.0521199703216553, "step": 430 }, { "ce_loss": 0.4007576107978821, "epoch": 0.14342895263509006, "step": 430 }, { "distill_loss": 0.18228591978549957, "epoch": 0.14342895263509006, "step": 430 }, { "epoch": 0.14342895263509006, "ref_ce_loss": 0.2405649870634079, "step": 430 }, { "epoch": 0.14342895263509006, "loss": 0.9265434741973877, "step": 430 }, { "ce_loss": 0.3507271409034729, "epoch": 0.14342895263509006, "step": 430 }, { "distill_loss": 0.1507226973772049, "epoch": 0.14342895263509006, "step": 430 }, { "epoch": 0.14342895263509006, "ref_ce_loss": 0.15023833513259888, "step": 430 }, { "epoch": 0.1467645096731154, "loss": 1.1471, "step": 440 }, { "epoch": 0.1467645096731154, "grad_norm": 4.192609786987305, "step": 440 }, { "epoch": 0.1467645096731154, "learning_rate": 0.00018333333333333334, "step": 440 }, { "epoch": 0.1467645096731154, "loss": 0.79221510887146, "step": 440 }, { "ce_loss": 0.3981459140777588, "epoch": 0.1467645096731154, "step": 440 }, { "distill_loss": 0.22157087922096252, "epoch": 0.1467645096731154, "step": 440 }, { "epoch": 0.1467645096731154, "ref_ce_loss": 0.17235179245471954, "step": 440 }, { "epoch": 0.1467645096731154, "loss": 0.863082230091095, "step": 440 }, { "ce_loss": 0.36590737104415894, "epoch": 0.1467645096731154, "step": 440 }, { "distill_loss": 0.1941511183977127, "epoch": 0.1467645096731154, "step": 440 }, { "epoch": 0.1467645096731154, "ref_ce_loss": 0.17397010326385498, "step": 440 }, { "epoch": 0.1467645096731154, "loss": 0.7825886011123657, "step": 440 }, { "ce_loss": 0.3600039780139923, "epoch": 0.1467645096731154, "step": 440 }, { "distill_loss": 0.1648310422897339, "epoch": 0.1467645096731154, "step": 440 }, { "epoch": 0.1467645096731154, "ref_ce_loss": 0.2572781443595886, "step": 440 }, { "epoch": 0.1467645096731154, "loss": 1.2959476709365845, "step": 440 }, { "ce_loss": 0.42079582810401917, "epoch": 0.1467645096731154, "step": 440 }, { "distill_loss": 0.18587611615657806, "epoch": 0.1467645096731154, "step": 440 }, { "epoch": 0.1467645096731154, "ref_ce_loss": 0.23832887411117554, "step": 440 }, { "epoch": 0.15010006671114076, "loss": 1.1871, "step": 450 }, { "epoch": 0.15010006671114076, "grad_norm": 3.1583211421966553, "step": 450 }, { "epoch": 0.15010006671114076, "learning_rate": 0.00018749999999999998, "step": 450 }, { "epoch": 0.15010006671114076, "loss": 1.155402660369873, "step": 450 }, { "ce_loss": 0.3351343870162964, "epoch": 0.15010006671114076, "step": 450 }, { "distill_loss": 0.21110394597053528, "epoch": 0.15010006671114076, "step": 450 }, { "epoch": 0.15010006671114076, "ref_ce_loss": 0.17169038951396942, "step": 450 }, { "epoch": 0.15010006671114076, "loss": 0.8734752535820007, "step": 450 }, { "ce_loss": 0.390616238117218, "epoch": 0.15010006671114076, "step": 450 }, { "distill_loss": 0.2063642293214798, "epoch": 0.15010006671114076, "step": 450 }, { "epoch": 0.15010006671114076, "ref_ce_loss": 0.15543001890182495, "step": 450 }, { "epoch": 0.15010006671114076, "loss": 1.4934319257736206, "step": 450 }, { "ce_loss": 0.366504967212677, "epoch": 0.15010006671114076, "step": 450 }, { "distill_loss": 0.2531406581401825, "epoch": 0.15010006671114076, "step": 450 }, { "epoch": 0.15010006671114076, "ref_ce_loss": 0.15823999047279358, "step": 450 }, { "epoch": 0.15010006671114076, "loss": 1.0348056554794312, "step": 450 }, { "ce_loss": 0.4287627041339874, "epoch": 0.15010006671114076, "step": 450 }, { "distill_loss": 0.23162701725959778, "epoch": 0.15010006671114076, "step": 450 }, { "epoch": 0.15010006671114076, "ref_ce_loss": 0.21052688360214233, "step": 450 }, { "epoch": 0.1534356237491661, "loss": 1.1691, "step": 460 }, { "epoch": 0.1534356237491661, "grad_norm": 2.9276604652404785, "step": 460 }, { "epoch": 0.1534356237491661, "learning_rate": 0.00019166666666666665, "step": 460 }, { "epoch": 0.1534356237491661, "loss": 0.9366768598556519, "step": 460 }, { "ce_loss": 0.3069738745689392, "epoch": 0.1534356237491661, "step": 460 }, { "distill_loss": 0.1811765879392624, "epoch": 0.1534356237491661, "step": 460 }, { "epoch": 0.1534356237491661, "ref_ce_loss": 0.12331288307905197, "step": 460 }, { "epoch": 0.1534356237491661, "loss": 0.792156994342804, "step": 460 }, { "ce_loss": 0.3994184136390686, "epoch": 0.1534356237491661, "step": 460 }, { "distill_loss": 0.2316671758890152, "epoch": 0.1534356237491661, "step": 460 }, { "epoch": 0.1534356237491661, "ref_ce_loss": 0.1596464067697525, "step": 460 }, { "epoch": 0.1534356237491661, "loss": 1.0726959705352783, "step": 460 }, { "ce_loss": 0.44579944014549255, "epoch": 0.1534356237491661, "step": 460 }, { "distill_loss": 0.24677278101444244, "epoch": 0.1534356237491661, "step": 460 }, { "epoch": 0.1534356237491661, "ref_ce_loss": 0.20206046104431152, "step": 460 }, { "epoch": 0.1534356237491661, "loss": 0.8775638341903687, "step": 460 }, { "ce_loss": 0.344332754611969, "epoch": 0.1534356237491661, "step": 460 }, { "distill_loss": 0.2588733732700348, "epoch": 0.1534356237491661, "step": 460 }, { "epoch": 0.1534356237491661, "ref_ce_loss": 0.12321383506059647, "step": 460 }, { "epoch": 0.15677118078719146, "loss": 1.0912, "step": 470 }, { "epoch": 0.15677118078719146, "grad_norm": 3.4049301147460938, "step": 470 }, { "epoch": 0.15677118078719146, "learning_rate": 0.00019583333333333331, "step": 470 }, { "epoch": 0.15677118078719146, "loss": 1.1278324127197266, "step": 470 }, { "ce_loss": 0.4387667775154114, "epoch": 0.15677118078719146, "step": 470 }, { "distill_loss": 0.23371437191963196, "epoch": 0.15677118078719146, "step": 470 }, { "epoch": 0.15677118078719146, "ref_ce_loss": 0.16273000836372375, "step": 470 }, { "epoch": 0.15677118078719146, "loss": 1.114906668663025, "step": 470 }, { "ce_loss": 0.4878920316696167, "epoch": 0.15677118078719146, "step": 470 }, { "distill_loss": 0.2571012079715729, "epoch": 0.15677118078719146, "step": 470 }, { "epoch": 0.15677118078719146, "ref_ce_loss": 0.22147198021411896, "step": 470 }, { "epoch": 0.15677118078719146, "loss": 0.914812445640564, "step": 470 }, { "ce_loss": 0.35870662331581116, "epoch": 0.15677118078719146, "step": 470 }, { "distill_loss": 0.2166028916835785, "epoch": 0.15677118078719146, "step": 470 }, { "epoch": 0.15677118078719146, "ref_ce_loss": 0.18772242963314056, "step": 470 }, { "epoch": 0.15677118078719146, "loss": 0.9531413912773132, "step": 470 }, { "ce_loss": 0.4369311034679413, "epoch": 0.15677118078719146, "step": 470 }, { "distill_loss": 0.23241980373859406, "epoch": 0.15677118078719146, "step": 470 }, { "epoch": 0.15677118078719146, "ref_ce_loss": 0.13508175313472748, "step": 470 }, { "epoch": 0.1601067378252168, "loss": 1.101, "step": 480 }, { "epoch": 0.1601067378252168, "grad_norm": 3.4803426265716553, "step": 480 }, { "epoch": 0.1601067378252168, "learning_rate": 0.00019999999999999998, "step": 480 }, { "epoch": 0.1601067378252168, "loss": 0.9322621822357178, "step": 480 }, { "ce_loss": 0.33916839957237244, "epoch": 0.1601067378252168, "step": 480 }, { "distill_loss": 0.19862687587738037, "epoch": 0.1601067378252168, "step": 480 }, { "epoch": 0.1601067378252168, "ref_ce_loss": 0.16116870939731598, "step": 480 }, { "epoch": 0.1601067378252168, "loss": 0.8902009129524231, "step": 480 }, { "ce_loss": 0.32456398010253906, "epoch": 0.1601067378252168, "step": 480 }, { "distill_loss": 0.17489784955978394, "epoch": 0.1601067378252168, "step": 480 }, { "epoch": 0.1601067378252168, "ref_ce_loss": 0.22662509977817535, "step": 480 }, { "epoch": 0.1601067378252168, "loss": 1.1041722297668457, "step": 480 }, { "ce_loss": 0.31113237142562866, "epoch": 0.1601067378252168, "step": 480 }, { "distill_loss": 0.17790481448173523, "epoch": 0.1601067378252168, "step": 480 }, { "epoch": 0.1601067378252168, "ref_ce_loss": 0.15077772736549377, "step": 480 }, { "epoch": 0.1601067378252168, "loss": 0.8598635196685791, "step": 480 }, { "ce_loss": 0.418891042470932, "epoch": 0.1601067378252168, "step": 480 }, { "distill_loss": 0.18824335932731628, "epoch": 0.1601067378252168, "step": 480 }, { "epoch": 0.1601067378252168, "ref_ce_loss": 0.14995791018009186, "step": 480 }, { "epoch": 0.16344229486324216, "loss": 1.0941, "step": 490 }, { "epoch": 0.16344229486324216, "grad_norm": 3.9171133041381836, "step": 490 }, { "epoch": 0.16344229486324216, "learning_rate": 0.00020416666666666665, "step": 490 }, { "epoch": 0.16344229486324216, "loss": 1.0149155855178833, "step": 490 }, { "ce_loss": 0.45266851782798767, "epoch": 0.16344229486324216, "step": 490 }, { "distill_loss": 0.1602305769920349, "epoch": 0.16344229486324216, "step": 490 }, { "epoch": 0.16344229486324216, "ref_ce_loss": 0.20514211058616638, "step": 490 }, { "epoch": 0.16344229486324216, "loss": 1.6531803607940674, "step": 490 }, { "ce_loss": 0.3911672830581665, "epoch": 0.16344229486324216, "step": 490 }, { "distill_loss": 0.14901621639728546, "epoch": 0.16344229486324216, "step": 490 }, { "epoch": 0.16344229486324216, "ref_ce_loss": 0.2036956399679184, "step": 490 }, { "epoch": 0.16344229486324216, "loss": 0.800538182258606, "step": 490 }, { "ce_loss": 0.27649155259132385, "epoch": 0.16344229486324216, "step": 490 }, { "distill_loss": 0.13379120826721191, "epoch": 0.16344229486324216, "step": 490 }, { "epoch": 0.16344229486324216, "ref_ce_loss": 0.24327047169208527, "step": 490 }, { "epoch": 0.16344229486324216, "loss": 1.8821849822998047, "step": 490 }, { "ce_loss": 0.38936328887939453, "epoch": 0.16344229486324216, "step": 490 }, { "distill_loss": 0.1746000498533249, "epoch": 0.16344229486324216, "step": 490 }, { "epoch": 0.16344229486324216, "ref_ce_loss": 0.17974820733070374, "step": 490 }, { "epoch": 0.1667778519012675, "loss": 1.0735, "step": 500 }, { "epoch": 0.1667778519012675, "grad_norm": 3.739142417907715, "step": 500 }, { "epoch": 0.1667778519012675, "learning_rate": 0.00020833333333333332, "step": 500 }, { "epoch": 0.1667778519012675, "loss": 0.7304775714874268, "step": 500 }, { "ce_loss": 0.34158605337142944, "epoch": 0.1667778519012675, "step": 500 }, { "distill_loss": 0.11969535052776337, "epoch": 0.1667778519012675, "step": 500 }, { "epoch": 0.1667778519012675, "ref_ce_loss": 0.1446818709373474, "step": 500 }, { "epoch": 0.1667778519012675, "loss": 1.7273657321929932, "step": 500 }, { "ce_loss": 0.40708187222480774, "epoch": 0.1667778519012675, "step": 500 }, { "distill_loss": 0.12989500164985657, "epoch": 0.1667778519012675, "step": 500 }, { "epoch": 0.1667778519012675, "ref_ce_loss": 0.22536632418632507, "step": 500 }, { "epoch": 0.1667778519012675, "loss": 1.4432916641235352, "step": 500 }, { "ce_loss": 0.33398616313934326, "epoch": 0.1667778519012675, "step": 500 }, { "distill_loss": 0.1335625797510147, "epoch": 0.1667778519012675, "step": 500 }, { "epoch": 0.1667778519012675, "ref_ce_loss": 0.14719289541244507, "step": 500 }, { "epoch": 0.1667778519012675, "loss": 1.2549631595611572, "step": 500 }, { "ce_loss": 0.43018224835395813, "epoch": 0.1667778519012675, "step": 500 }, { "distill_loss": 0.10706700384616852, "epoch": 0.1667778519012675, "step": 500 }, { "epoch": 0.1667778519012675, "ref_ce_loss": 0.24464663863182068, "step": 500 }, { "epoch": 0.17011340893929286, "loss": 1.0963, "step": 510 }, { "epoch": 0.17011340893929286, "grad_norm": 2.8536627292633057, "step": 510 }, { "epoch": 0.17011340893929286, "learning_rate": 0.0002125, "step": 510 }, { "epoch": 0.17011340893929286, "loss": 0.7844828367233276, "step": 510 }, { "ce_loss": 0.34540170431137085, "epoch": 0.17011340893929286, "step": 510 }, { "distill_loss": 0.1451631784439087, "epoch": 0.17011340893929286, "step": 510 }, { "epoch": 0.17011340893929286, "ref_ce_loss": 0.14832229912281036, "step": 510 }, { "epoch": 0.17011340893929286, "loss": 1.0483617782592773, "step": 510 }, { "ce_loss": 0.5033784508705139, "epoch": 0.17011340893929286, "step": 510 }, { "distill_loss": 0.13474875688552856, "epoch": 0.17011340893929286, "step": 510 }, { "epoch": 0.17011340893929286, "ref_ce_loss": 0.2690085768699646, "step": 510 }, { "epoch": 0.17011340893929286, "loss": 1.3711141347885132, "step": 510 }, { "ce_loss": 0.5016711354255676, "epoch": 0.17011340893929286, "step": 510 }, { "distill_loss": 0.15381048619747162, "epoch": 0.17011340893929286, "step": 510 }, { "epoch": 0.17011340893929286, "ref_ce_loss": 0.254338800907135, "step": 510 }, { "epoch": 0.17011340893929286, "loss": 0.7214844822883606, "step": 510 }, { "ce_loss": 0.35924723744392395, "epoch": 0.17011340893929286, "step": 510 }, { "distill_loss": 0.13219282031059265, "epoch": 0.17011340893929286, "step": 510 }, { "epoch": 0.17011340893929286, "ref_ce_loss": 0.22950786352157593, "step": 510 }, { "epoch": 0.1734489659773182, "loss": 1.072, "step": 520 }, { "epoch": 0.1734489659773182, "grad_norm": 7.5163092613220215, "step": 520 }, { "epoch": 0.1734489659773182, "learning_rate": 0.00021666666666666666, "step": 520 }, { "epoch": 0.1734489659773182, "loss": 1.590034008026123, "step": 520 }, { "ce_loss": 0.5328056812286377, "epoch": 0.1734489659773182, "step": 520 }, { "distill_loss": 0.1510966718196869, "epoch": 0.1734489659773182, "step": 520 }, { "epoch": 0.1734489659773182, "ref_ce_loss": 0.2138897031545639, "step": 520 }, { "epoch": 0.1734489659773182, "loss": 1.0373549461364746, "step": 520 }, { "ce_loss": 0.3898574709892273, "epoch": 0.1734489659773182, "step": 520 }, { "distill_loss": 0.1519639492034912, "epoch": 0.1734489659773182, "step": 520 }, { "epoch": 0.1734489659773182, "ref_ce_loss": 0.24447669088840485, "step": 520 }, { "epoch": 0.1734489659773182, "loss": 1.6956017017364502, "step": 520 }, { "ce_loss": 0.4165439009666443, "epoch": 0.1734489659773182, "step": 520 }, { "distill_loss": 0.1603700965642929, "epoch": 0.1734489659773182, "step": 520 }, { "epoch": 0.1734489659773182, "ref_ce_loss": 0.10957357287406921, "step": 520 }, { "epoch": 0.1734489659773182, "loss": 1.230751633644104, "step": 520 }, { "ce_loss": 0.4296768307685852, "epoch": 0.1734489659773182, "step": 520 }, { "distill_loss": 0.17311978340148926, "epoch": 0.1734489659773182, "step": 520 }, { "epoch": 0.1734489659773182, "ref_ce_loss": 0.17816101014614105, "step": 520 }, { "epoch": 0.17678452301534356, "loss": 1.2013, "step": 530 }, { "epoch": 0.17678452301534356, "grad_norm": 4.662562370300293, "step": 530 }, { "epoch": 0.17678452301534356, "learning_rate": 0.00022083333333333333, "step": 530 }, { "epoch": 0.17678452301534356, "loss": 1.1115416288375854, "step": 530 }, { "ce_loss": 0.4564555883407593, "epoch": 0.17678452301534356, "step": 530 }, { "distill_loss": 0.12773512303829193, "epoch": 0.17678452301534356, "step": 530 }, { "epoch": 0.17678452301534356, "ref_ce_loss": 0.19250312447547913, "step": 530 }, { "epoch": 0.17678452301534356, "loss": 0.6754852533340454, "step": 530 }, { "ce_loss": 0.39747318625450134, "epoch": 0.17678452301534356, "step": 530 }, { "distill_loss": 0.1195475161075592, "epoch": 0.17678452301534356, "step": 530 }, { "epoch": 0.17678452301534356, "ref_ce_loss": 0.15837079286575317, "step": 530 }, { "epoch": 0.17678452301534356, "loss": 0.7832016944885254, "step": 530 }, { "ce_loss": 0.42709383368492126, "epoch": 0.17678452301534356, "step": 530 }, { "distill_loss": 0.13008397817611694, "epoch": 0.17678452301534356, "step": 530 }, { "epoch": 0.17678452301534356, "ref_ce_loss": 0.22590842843055725, "step": 530 }, { "epoch": 0.17678452301534356, "loss": 1.5747027397155762, "step": 530 }, { "ce_loss": 0.38369321823120117, "epoch": 0.17678452301534356, "step": 530 }, { "distill_loss": 0.121522918343544, "epoch": 0.17678452301534356, "step": 530 }, { "epoch": 0.17678452301534356, "ref_ce_loss": 0.1542671173810959, "step": 530 }, { "epoch": 0.1801200800533689, "loss": 1.0564, "step": 540 }, { "epoch": 0.1801200800533689, "grad_norm": 3.5200061798095703, "step": 540 }, { "epoch": 0.1801200800533689, "learning_rate": 0.000225, "step": 540 }, { "epoch": 0.1801200800533689, "loss": 1.145336389541626, "step": 540 }, { "ce_loss": 0.4209885895252228, "epoch": 0.1801200800533689, "step": 540 }, { "distill_loss": 0.13889750838279724, "epoch": 0.1801200800533689, "step": 540 }, { "epoch": 0.1801200800533689, "ref_ce_loss": 0.1695491373538971, "step": 540 }, { "epoch": 0.1801200800533689, "loss": 0.8331067562103271, "step": 540 }, { "ce_loss": 0.3781631290912628, "epoch": 0.1801200800533689, "step": 540 }, { "distill_loss": 0.1160370260477066, "epoch": 0.1801200800533689, "step": 540 }, { "epoch": 0.1801200800533689, "ref_ce_loss": 0.21670031547546387, "step": 540 }, { "epoch": 0.1801200800533689, "loss": 0.7429934740066528, "step": 540 }, { "ce_loss": 0.4223313331604004, "epoch": 0.1801200800533689, "step": 540 }, { "distill_loss": 0.1255422979593277, "epoch": 0.1801200800533689, "step": 540 }, { "epoch": 0.1801200800533689, "ref_ce_loss": 0.1942986696958542, "step": 540 }, { "epoch": 0.1801200800533689, "loss": 0.8002361059188843, "step": 540 }, { "ce_loss": 0.3722546398639679, "epoch": 0.1801200800533689, "step": 540 }, { "distill_loss": 0.13500186800956726, "epoch": 0.1801200800533689, "step": 540 }, { "epoch": 0.1801200800533689, "ref_ce_loss": 0.14406229555606842, "step": 540 }, { "epoch": 0.18345563709139426, "loss": 1.0981, "step": 550 }, { "epoch": 0.18345563709139426, "grad_norm": 7.129374980926514, "step": 550 }, { "epoch": 0.18345563709139426, "learning_rate": 0.00022916666666666664, "step": 550 }, { "epoch": 0.18345563709139426, "loss": 0.6342393159866333, "step": 550 }, { "ce_loss": 0.37034982442855835, "epoch": 0.18345563709139426, "step": 550 }, { "distill_loss": 0.12606871128082275, "epoch": 0.18345563709139426, "step": 550 }, { "epoch": 0.18345563709139426, "ref_ce_loss": 0.13699214160442352, "step": 550 }, { "epoch": 0.18345563709139426, "loss": 0.8261252641677856, "step": 550 }, { "ce_loss": 0.394279420375824, "epoch": 0.18345563709139426, "step": 550 }, { "distill_loss": 0.14237342774868011, "epoch": 0.18345563709139426, "step": 550 }, { "epoch": 0.18345563709139426, "ref_ce_loss": 0.17166593670845032, "step": 550 }, { "epoch": 0.18345563709139426, "loss": 0.9936450719833374, "step": 550 }, { "ce_loss": 0.4587689936161041, "epoch": 0.18345563709139426, "step": 550 }, { "distill_loss": 0.13692550361156464, "epoch": 0.18345563709139426, "step": 550 }, { "epoch": 0.18345563709139426, "ref_ce_loss": 0.26241040229797363, "step": 550 }, { "epoch": 0.18345563709139426, "loss": 0.7960110902786255, "step": 550 }, { "ce_loss": 0.3535270094871521, "epoch": 0.18345563709139426, "step": 550 }, { "distill_loss": 0.13536620140075684, "epoch": 0.18345563709139426, "step": 550 }, { "epoch": 0.18345563709139426, "ref_ce_loss": 0.1410633772611618, "step": 550 }, { "epoch": 0.1867911941294196, "loss": 0.9803, "step": 560 }, { "epoch": 0.1867911941294196, "grad_norm": 3.267789602279663, "step": 560 }, { "epoch": 0.1867911941294196, "learning_rate": 0.0002333333333333333, "step": 560 }, { "epoch": 0.1867911941294196, "loss": 0.9305562973022461, "step": 560 }, { "ce_loss": 0.3439280390739441, "epoch": 0.1867911941294196, "step": 560 }, { "distill_loss": 0.1248282864689827, "epoch": 0.1867911941294196, "step": 560 }, { "epoch": 0.1867911941294196, "ref_ce_loss": 0.24716688692569733, "step": 560 }, { "epoch": 0.1867911941294196, "loss": 0.537109911441803, "step": 560 }, { "ce_loss": 0.31539902091026306, "epoch": 0.1867911941294196, "step": 560 }, { "distill_loss": 0.13532862067222595, "epoch": 0.1867911941294196, "step": 560 }, { "epoch": 0.1867911941294196, "ref_ce_loss": 0.08404301851987839, "step": 560 }, { "epoch": 0.1867911941294196, "loss": 0.8709670901298523, "step": 560 }, { "ce_loss": 0.39863720536231995, "epoch": 0.1867911941294196, "step": 560 }, { "distill_loss": 0.14135649800300598, "epoch": 0.1867911941294196, "step": 560 }, { "epoch": 0.1867911941294196, "ref_ce_loss": 0.2198602557182312, "step": 560 }, { "epoch": 0.1867911941294196, "loss": 1.1067215204238892, "step": 560 }, { "ce_loss": 0.42708060145378113, "epoch": 0.1867911941294196, "step": 560 }, { "distill_loss": 0.13257859647274017, "epoch": 0.1867911941294196, "step": 560 }, { "epoch": 0.1867911941294196, "ref_ce_loss": 0.2437812089920044, "step": 560 }, { "epoch": 0.19012675116744496, "loss": 0.9827, "step": 570 }, { "epoch": 0.19012675116744496, "grad_norm": 2.548273801803589, "step": 570 }, { "epoch": 0.19012675116744496, "learning_rate": 0.00023749999999999997, "step": 570 }, { "epoch": 0.19012675116744496, "loss": 1.6599349975585938, "step": 570 }, { "ce_loss": 0.4800955653190613, "epoch": 0.19012675116744496, "step": 570 }, { "distill_loss": 0.16220739483833313, "epoch": 0.19012675116744496, "step": 570 }, { "epoch": 0.19012675116744496, "ref_ce_loss": 0.17117564380168915, "step": 570 }, { "epoch": 0.19012675116744496, "loss": 0.6720627546310425, "step": 570 }, { "ce_loss": 0.3572465479373932, "epoch": 0.19012675116744496, "step": 570 }, { "distill_loss": 0.12056800723075867, "epoch": 0.19012675116744496, "step": 570 }, { "epoch": 0.19012675116744496, "ref_ce_loss": 0.19394969940185547, "step": 570 }, { "epoch": 0.19012675116744496, "loss": 1.1998469829559326, "step": 570 }, { "ce_loss": 0.44026824831962585, "epoch": 0.19012675116744496, "step": 570 }, { "distill_loss": 0.1431874930858612, "epoch": 0.19012675116744496, "step": 570 }, { "epoch": 0.19012675116744496, "ref_ce_loss": 0.1669449657201767, "step": 570 }, { "epoch": 0.19012675116744496, "loss": 1.1252778768539429, "step": 570 }, { "ce_loss": 0.36948126554489136, "epoch": 0.19012675116744496, "step": 570 }, { "distill_loss": 0.13540102541446686, "epoch": 0.19012675116744496, "step": 570 }, { "epoch": 0.19012675116744496, "ref_ce_loss": 0.1495905965566635, "step": 570 }, { "epoch": 0.1934623082054703, "loss": 1.0028, "step": 580 }, { "epoch": 0.1934623082054703, "grad_norm": 2.2215306758880615, "step": 580 }, { "epoch": 0.1934623082054703, "learning_rate": 0.00024166666666666664, "step": 580 }, { "epoch": 0.1934623082054703, "loss": 0.7160823941230774, "step": 580 }, { "ce_loss": 0.4436323046684265, "epoch": 0.1934623082054703, "step": 580 }, { "distill_loss": 0.1151004433631897, "epoch": 0.1934623082054703, "step": 580 }, { "epoch": 0.1934623082054703, "ref_ce_loss": 0.1572589874267578, "step": 580 }, { "epoch": 0.1934623082054703, "loss": 0.8652709722518921, "step": 580 }, { "ce_loss": 0.4033513367176056, "epoch": 0.1934623082054703, "step": 580 }, { "distill_loss": 0.11897167563438416, "epoch": 0.1934623082054703, "step": 580 }, { "epoch": 0.1934623082054703, "ref_ce_loss": 0.1534203588962555, "step": 580 }, { "epoch": 0.1934623082054703, "loss": 0.930794358253479, "step": 580 }, { "ce_loss": 0.3531077206134796, "epoch": 0.1934623082054703, "step": 580 }, { "distill_loss": 0.11472751200199127, "epoch": 0.1934623082054703, "step": 580 }, { "epoch": 0.1934623082054703, "ref_ce_loss": 0.15658818185329437, "step": 580 }, { "epoch": 0.1934623082054703, "loss": 1.2479609251022339, "step": 580 }, { "ce_loss": 0.31892189383506775, "epoch": 0.1934623082054703, "step": 580 }, { "distill_loss": 0.12005530297756195, "epoch": 0.1934623082054703, "step": 580 }, { "epoch": 0.1934623082054703, "ref_ce_loss": 0.1711868792772293, "step": 580 }, { "epoch": 0.19679786524349566, "loss": 0.9568, "step": 590 }, { "epoch": 0.19679786524349566, "grad_norm": 2.501621723175049, "step": 590 }, { "epoch": 0.19679786524349566, "learning_rate": 0.0002458333333333333, "step": 590 }, { "epoch": 0.19679786524349566, "loss": 0.8282630443572998, "step": 590 }, { "ce_loss": 0.44370871782302856, "epoch": 0.19679786524349566, "step": 590 }, { "distill_loss": 0.11649394035339355, "epoch": 0.19679786524349566, "step": 590 }, { "epoch": 0.19679786524349566, "ref_ce_loss": 0.1873226761817932, "step": 590 }, { "epoch": 0.19679786524349566, "loss": 0.9475926160812378, "step": 590 }, { "ce_loss": 0.42322638630867004, "epoch": 0.19679786524349566, "step": 590 }, { "distill_loss": 0.10067001730203629, "epoch": 0.19679786524349566, "step": 590 }, { "epoch": 0.19679786524349566, "ref_ce_loss": 0.2594554126262665, "step": 590 }, { "epoch": 0.19679786524349566, "loss": 1.049269437789917, "step": 590 }, { "ce_loss": 0.35470736026763916, "epoch": 0.19679786524349566, "step": 590 }, { "distill_loss": 0.09396913647651672, "epoch": 0.19679786524349566, "step": 590 }, { "epoch": 0.19679786524349566, "ref_ce_loss": 0.23191304504871368, "step": 590 }, { "epoch": 0.19679786524349566, "loss": 0.9053672552108765, "step": 590 }, { "ce_loss": 0.43494299054145813, "epoch": 0.19679786524349566, "step": 590 }, { "distill_loss": 0.10486435890197754, "epoch": 0.19679786524349566, "step": 590 }, { "epoch": 0.19679786524349566, "ref_ce_loss": 0.21197301149368286, "step": 590 }, { "epoch": 0.200133422281521, "loss": 1.0136, "step": 600 }, { "epoch": 0.200133422281521, "grad_norm": 5.192570209503174, "step": 600 }, { "epoch": 0.200133422281521, "learning_rate": 0.00025, "step": 600 }, { "epoch": 0.200133422281521, "loss": 0.9283386468887329, "step": 600 }, { "ce_loss": 0.44486886262893677, "epoch": 0.200133422281521, "step": 600 }, { "distill_loss": 0.10690561681985855, "epoch": 0.200133422281521, "step": 600 }, { "epoch": 0.200133422281521, "ref_ce_loss": 0.17460843920707703, "step": 600 }, { "epoch": 0.200133422281521, "loss": 1.5753425359725952, "step": 600 }, { "ce_loss": 0.3931669592857361, "epoch": 0.200133422281521, "step": 600 }, { "distill_loss": 0.09033727645874023, "epoch": 0.200133422281521, "step": 600 }, { "epoch": 0.200133422281521, "ref_ce_loss": 0.1294490545988083, "step": 600 }, { "epoch": 0.200133422281521, "loss": 0.9043337106704712, "step": 600 }, { "ce_loss": 0.4034048318862915, "epoch": 0.200133422281521, "step": 600 }, { "distill_loss": 0.1026686429977417, "epoch": 0.200133422281521, "step": 600 }, { "epoch": 0.200133422281521, "ref_ce_loss": 0.22998681664466858, "step": 600 }, { "epoch": 0.200133422281521, "loss": 0.9695954322814941, "step": 600 }, { "ce_loss": 0.514153778553009, "epoch": 0.200133422281521, "step": 600 }, { "distill_loss": 0.10593391954898834, "epoch": 0.200133422281521, "step": 600 }, { "epoch": 0.200133422281521, "ref_ce_loss": 0.21736301481723785, "step": 600 }, { "epoch": 0.20346897931954636, "loss": 0.9875, "step": 610 }, { "epoch": 0.20346897931954636, "grad_norm": 4.711723327636719, "step": 610 }, { "epoch": 0.20346897931954636, "learning_rate": 0.00025416666666666665, "step": 610 }, { "epoch": 0.20346897931954636, "loss": 1.0265181064605713, "step": 610 }, { "ce_loss": 0.27343955636024475, "epoch": 0.20346897931954636, "step": 610 }, { "distill_loss": 0.11366772651672363, "epoch": 0.20346897931954636, "step": 610 }, { "epoch": 0.20346897931954636, "ref_ce_loss": 0.25925183296203613, "step": 610 }, { "epoch": 0.20346897931954636, "loss": 1.0089905261993408, "step": 610 }, { "ce_loss": 0.6128965020179749, "epoch": 0.20346897931954636, "step": 610 }, { "distill_loss": 0.13944478332996368, "epoch": 0.20346897931954636, "step": 610 }, { "epoch": 0.20346897931954636, "ref_ce_loss": 0.25663983821868896, "step": 610 }, { "epoch": 0.20346897931954636, "loss": 0.7002043724060059, "step": 610 }, { "ce_loss": 0.37525349855422974, "epoch": 0.20346897931954636, "step": 610 }, { "distill_loss": 0.11815596371889114, "epoch": 0.20346897931954636, "step": 610 }, { "epoch": 0.20346897931954636, "ref_ce_loss": 0.20678366720676422, "step": 610 }, { "epoch": 0.20346897931954636, "loss": 1.0840277671813965, "step": 610 }, { "ce_loss": 0.3404729664325714, "epoch": 0.20346897931954636, "step": 610 }, { "distill_loss": 0.13921114802360535, "epoch": 0.20346897931954636, "step": 610 }, { "epoch": 0.20346897931954636, "ref_ce_loss": 0.19724339246749878, "step": 610 }, { "epoch": 0.2068045363575717, "loss": 1.1648, "step": 620 }, { "epoch": 0.2068045363575717, "grad_norm": 15.244519233703613, "step": 620 }, { "epoch": 0.2068045363575717, "learning_rate": 0.00025833333333333334, "step": 620 }, { "epoch": 0.2068045363575717, "loss": 0.7298845052719116, "step": 620 }, { "ce_loss": 0.41301995515823364, "epoch": 0.2068045363575717, "step": 620 }, { "distill_loss": 0.12899278104305267, "epoch": 0.2068045363575717, "step": 620 }, { "epoch": 0.2068045363575717, "ref_ce_loss": 0.1292322278022766, "step": 620 }, { "epoch": 0.2068045363575717, "loss": 0.9002193212509155, "step": 620 }, { "ce_loss": 0.2968343198299408, "epoch": 0.2068045363575717, "step": 620 }, { "distill_loss": 0.11593446880578995, "epoch": 0.2068045363575717, "step": 620 }, { "epoch": 0.2068045363575717, "ref_ce_loss": 0.17411155998706818, "step": 620 }, { "epoch": 0.2068045363575717, "loss": 0.7344695329666138, "step": 620 }, { "ce_loss": 0.29947271943092346, "epoch": 0.2068045363575717, "step": 620 }, { "distill_loss": 0.14619185030460358, "epoch": 0.2068045363575717, "step": 620 }, { "epoch": 0.2068045363575717, "ref_ce_loss": 0.10363147407770157, "step": 620 }, { "epoch": 0.2068045363575717, "loss": 1.373020887374878, "step": 620 }, { "ce_loss": 0.40310031175613403, "epoch": 0.2068045363575717, "step": 620 }, { "distill_loss": 0.15024641156196594, "epoch": 0.2068045363575717, "step": 620 }, { "epoch": 0.2068045363575717, "ref_ce_loss": 0.17175301909446716, "step": 620 }, { "epoch": 0.21014009339559706, "loss": 1.0749, "step": 630 }, { "epoch": 0.21014009339559706, "grad_norm": 8.212667465209961, "step": 630 }, { "epoch": 0.21014009339559706, "learning_rate": 0.0002625, "step": 630 }, { "epoch": 0.21014009339559706, "loss": 0.8715625405311584, "step": 630 }, { "ce_loss": 0.35520607233047485, "epoch": 0.21014009339559706, "step": 630 }, { "distill_loss": 0.1376957893371582, "epoch": 0.21014009339559706, "step": 630 }, { "epoch": 0.21014009339559706, "ref_ce_loss": 0.19228589534759521, "step": 630 }, { "epoch": 0.21014009339559706, "loss": 0.74106764793396, "step": 630 }, { "ce_loss": 0.3812822997570038, "epoch": 0.21014009339559706, "step": 630 }, { "distill_loss": 0.104681096971035, "epoch": 0.21014009339559706, "step": 630 }, { "epoch": 0.21014009339559706, "ref_ce_loss": 0.2551042437553406, "step": 630 }, { "epoch": 0.21014009339559706, "loss": 1.4336597919464111, "step": 630 }, { "ce_loss": 0.425566166639328, "epoch": 0.21014009339559706, "step": 630 }, { "distill_loss": 0.13053010404109955, "epoch": 0.21014009339559706, "step": 630 }, { "epoch": 0.21014009339559706, "ref_ce_loss": 0.1743241250514984, "step": 630 }, { "epoch": 0.21014009339559706, "loss": 0.8435543775558472, "step": 630 }, { "ce_loss": 0.3314042091369629, "epoch": 0.21014009339559706, "step": 630 }, { "distill_loss": 0.12516427040100098, "epoch": 0.21014009339559706, "step": 630 }, { "epoch": 0.21014009339559706, "ref_ce_loss": 0.1005224660038948, "step": 630 }, { "epoch": 0.2134756504336224, "loss": 1.0402, "step": 640 }, { "epoch": 0.2134756504336224, "grad_norm": 5.008931636810303, "step": 640 }, { "epoch": 0.2134756504336224, "learning_rate": 0.0002666666666666666, "step": 640 }, { "epoch": 0.2134756504336224, "loss": 0.9882537722587585, "step": 640 }, { "ce_loss": 0.3984259366989136, "epoch": 0.2134756504336224, "step": 640 }, { "distill_loss": 0.10313582420349121, "epoch": 0.2134756504336224, "step": 640 }, { "epoch": 0.2134756504336224, "ref_ce_loss": 0.2773117125034332, "step": 640 }, { "epoch": 0.2134756504336224, "loss": 0.9406678080558777, "step": 640 }, { "ce_loss": 0.40200868248939514, "epoch": 0.2134756504336224, "step": 640 }, { "distill_loss": 0.09374626725912094, "epoch": 0.2134756504336224, "step": 640 }, { "epoch": 0.2134756504336224, "ref_ce_loss": 0.22686932981014252, "step": 640 }, { "epoch": 0.2134756504336224, "loss": 0.7817229628562927, "step": 640 }, { "ce_loss": 0.37328627705574036, "epoch": 0.2134756504336224, "step": 640 }, { "distill_loss": 0.10063036531209946, "epoch": 0.2134756504336224, "step": 640 }, { "epoch": 0.2134756504336224, "ref_ce_loss": 0.21581311523914337, "step": 640 }, { "epoch": 0.2134756504336224, "loss": 0.9889649152755737, "step": 640 }, { "ce_loss": 0.3842744827270508, "epoch": 0.2134756504336224, "step": 640 }, { "distill_loss": 0.10192930698394775, "epoch": 0.2134756504336224, "step": 640 }, { "epoch": 0.2134756504336224, "ref_ce_loss": 0.17884069681167603, "step": 640 }, { "epoch": 0.21681120747164775, "loss": 1.0506, "step": 650 }, { "epoch": 0.21681120747164775, "grad_norm": 4.354966640472412, "step": 650 }, { "epoch": 0.21681120747164775, "learning_rate": 0.0002708333333333333, "step": 650 }, { "epoch": 0.21681120747164775, "loss": 0.7693982720375061, "step": 650 }, { "ce_loss": 0.3526293933391571, "epoch": 0.21681120747164775, "step": 650 }, { "distill_loss": 0.11822477728128433, "epoch": 0.21681120747164775, "step": 650 }, { "epoch": 0.21681120747164775, "ref_ce_loss": 0.15306073427200317, "step": 650 }, { "epoch": 0.21681120747164775, "loss": 0.7925047874450684, "step": 650 }, { "ce_loss": 0.3771379888057709, "epoch": 0.21681120747164775, "step": 650 }, { "distill_loss": 0.13101331889629364, "epoch": 0.21681120747164775, "step": 650 }, { "epoch": 0.21681120747164775, "ref_ce_loss": 0.15947893261909485, "step": 650 }, { "epoch": 0.21681120747164775, "loss": 0.7453879117965698, "step": 650 }, { "ce_loss": 0.3240864872932434, "epoch": 0.21681120747164775, "step": 650 }, { "distill_loss": 0.1208181232213974, "epoch": 0.21681120747164775, "step": 650 }, { "epoch": 0.21681120747164775, "ref_ce_loss": 0.16739091277122498, "step": 650 }, { "epoch": 0.21681120747164775, "loss": 1.2144562005996704, "step": 650 }, { "ce_loss": 0.3694230020046234, "epoch": 0.21681120747164775, "step": 650 }, { "distill_loss": 0.12667544186115265, "epoch": 0.21681120747164775, "step": 650 }, { "epoch": 0.21681120747164775, "ref_ce_loss": 0.1709766983985901, "step": 650 }, { "epoch": 0.2201467645096731, "loss": 1.0487, "step": 660 }, { "epoch": 0.2201467645096731, "grad_norm": 4.961103439331055, "step": 660 }, { "epoch": 0.2201467645096731, "learning_rate": 0.00027499999999999996, "step": 660 }, { "epoch": 0.2201467645096731, "loss": 0.696092426776886, "step": 660 }, { "ce_loss": 0.41480615735054016, "epoch": 0.2201467645096731, "step": 660 }, { "distill_loss": 0.09278301894664764, "epoch": 0.2201467645096731, "step": 660 }, { "epoch": 0.2201467645096731, "ref_ce_loss": 0.18769042193889618, "step": 660 }, { "epoch": 0.2201467645096731, "loss": 1.1655255556106567, "step": 660 }, { "ce_loss": 0.38108381628990173, "epoch": 0.2201467645096731, "step": 660 }, { "distill_loss": 0.10685959458351135, "epoch": 0.2201467645096731, "step": 660 }, { "epoch": 0.2201467645096731, "ref_ce_loss": 0.2142338752746582, "step": 660 }, { "epoch": 0.2201467645096731, "loss": 1.8354722261428833, "step": 660 }, { "ce_loss": 0.35269877314567566, "epoch": 0.2201467645096731, "step": 660 }, { "distill_loss": 0.0982913225889206, "epoch": 0.2201467645096731, "step": 660 }, { "epoch": 0.2201467645096731, "ref_ce_loss": 0.18363001942634583, "step": 660 }, { "epoch": 0.2201467645096731, "loss": 0.9739862680435181, "step": 660 }, { "ce_loss": 0.487895667552948, "epoch": 0.2201467645096731, "step": 660 }, { "distill_loss": 0.11279213428497314, "epoch": 0.2201467645096731, "step": 660 }, { "epoch": 0.2201467645096731, "ref_ce_loss": 0.21531665325164795, "step": 660 }, { "epoch": 0.22348232154769845, "loss": 0.9574, "step": 670 }, { "epoch": 0.22348232154769845, "grad_norm": 2.3174378871917725, "step": 670 }, { "epoch": 0.22348232154769845, "learning_rate": 0.00027916666666666666, "step": 670 }, { "epoch": 0.22348232154769845, "loss": 1.0270733833312988, "step": 670 }, { "ce_loss": 0.4073409140110016, "epoch": 0.22348232154769845, "step": 670 }, { "distill_loss": 0.08957529067993164, "epoch": 0.22348232154769845, "step": 670 }, { "epoch": 0.22348232154769845, "ref_ce_loss": 0.22530145943164825, "step": 670 }, { "epoch": 0.22348232154769845, "loss": 0.8506837487220764, "step": 670 }, { "ce_loss": 0.3899139165878296, "epoch": 0.22348232154769845, "step": 670 }, { "distill_loss": 0.08442845940589905, "epoch": 0.22348232154769845, "step": 670 }, { "epoch": 0.22348232154769845, "ref_ce_loss": 0.15433235466480255, "step": 670 }, { "epoch": 0.22348232154769845, "loss": 1.01683509349823, "step": 670 }, { "ce_loss": 0.37784865498542786, "epoch": 0.22348232154769845, "step": 670 }, { "distill_loss": 0.10341054946184158, "epoch": 0.22348232154769845, "step": 670 }, { "epoch": 0.22348232154769845, "ref_ce_loss": 0.1931767463684082, "step": 670 }, { "epoch": 0.22348232154769845, "loss": 0.8917780518531799, "step": 670 }, { "ce_loss": 0.4849671423435211, "epoch": 0.22348232154769845, "step": 670 }, { "distill_loss": 0.09322772920131683, "epoch": 0.22348232154769845, "step": 670 }, { "epoch": 0.22348232154769845, "ref_ce_loss": 0.17773941159248352, "step": 670 }, { "epoch": 0.2268178785857238, "loss": 0.9316, "step": 680 }, { "epoch": 0.2268178785857238, "grad_norm": 2.05058217048645, "step": 680 }, { "epoch": 0.2268178785857238, "learning_rate": 0.0002833333333333333, "step": 680 }, { "epoch": 0.2268178785857238, "loss": 0.7885412573814392, "step": 680 }, { "ce_loss": 0.4074735939502716, "epoch": 0.2268178785857238, "step": 680 }, { "distill_loss": 0.1123306155204773, "epoch": 0.2268178785857238, "step": 680 }, { "epoch": 0.2268178785857238, "ref_ce_loss": 0.1499427706003189, "step": 680 }, { "epoch": 0.2268178785857238, "loss": 1.295933485031128, "step": 680 }, { "ce_loss": 0.3566689193248749, "epoch": 0.2268178785857238, "step": 680 }, { "distill_loss": 0.10098881274461746, "epoch": 0.2268178785857238, "step": 680 }, { "epoch": 0.2268178785857238, "ref_ce_loss": 0.21255967020988464, "step": 680 }, { "epoch": 0.2268178785857238, "loss": 2.1791958808898926, "step": 680 }, { "ce_loss": 0.45526185631752014, "epoch": 0.2268178785857238, "step": 680 }, { "distill_loss": 0.10729824006557465, "epoch": 0.2268178785857238, "step": 680 }, { "epoch": 0.2268178785857238, "ref_ce_loss": 0.20944558084011078, "step": 680 }, { "epoch": 0.2268178785857238, "loss": 1.860978364944458, "step": 680 }, { "ce_loss": 0.5708116292953491, "epoch": 0.2268178785857238, "step": 680 }, { "distill_loss": 0.12268486618995667, "epoch": 0.2268178785857238, "step": 680 }, { "epoch": 0.2268178785857238, "ref_ce_loss": 0.20244957506656647, "step": 680 }, { "epoch": 0.23015343562374915, "loss": 1.1451, "step": 690 }, { "epoch": 0.23015343562374915, "grad_norm": 5.780163288116455, "step": 690 }, { "epoch": 0.23015343562374915, "learning_rate": 0.0002875, "step": 690 }, { "epoch": 0.23015343562374915, "loss": 0.6632769107818604, "step": 690 }, { "ce_loss": 0.356410950422287, "epoch": 0.23015343562374915, "step": 690 }, { "distill_loss": 0.11251110583543777, "epoch": 0.23015343562374915, "step": 690 }, { "epoch": 0.23015343562374915, "ref_ce_loss": 0.1943245828151703, "step": 690 }, { "epoch": 0.23015343562374915, "loss": 0.8603688478469849, "step": 690 }, { "ce_loss": 0.46949803829193115, "epoch": 0.23015343562374915, "step": 690 }, { "distill_loss": 0.12070250511169434, "epoch": 0.23015343562374915, "step": 690 }, { "epoch": 0.23015343562374915, "ref_ce_loss": 0.20107735693454742, "step": 690 }, { "epoch": 0.23015343562374915, "loss": 0.6247305870056152, "step": 690 }, { "ce_loss": 0.38976994156837463, "epoch": 0.23015343562374915, "step": 690 }, { "distill_loss": 0.11423636227846146, "epoch": 0.23015343562374915, "step": 690 }, { "epoch": 0.23015343562374915, "ref_ce_loss": 0.12047644704580307, "step": 690 }, { "epoch": 0.23015343562374915, "loss": 1.4882597923278809, "step": 690 }, { "ce_loss": 0.426405131816864, "epoch": 0.23015343562374915, "step": 690 }, { "distill_loss": 0.09722738713026047, "epoch": 0.23015343562374915, "step": 690 }, { "epoch": 0.23015343562374915, "ref_ce_loss": 0.2013929933309555, "step": 690 }, { "epoch": 0.2334889926617745, "loss": 1.0173, "step": 700 }, { "epoch": 0.2334889926617745, "grad_norm": 3.844639301300049, "step": 700 }, { "epoch": 0.2334889926617745, "learning_rate": 0.00029166666666666664, "step": 700 }, { "epoch": 0.2334889926617745, "loss": 1.1349132061004639, "step": 700 }, { "ce_loss": 0.3564150333404541, "epoch": 0.2334889926617745, "step": 700 }, { "distill_loss": 0.09892964363098145, "epoch": 0.2334889926617745, "step": 700 }, { "epoch": 0.2334889926617745, "ref_ce_loss": 0.15781459212303162, "step": 700 }, { "epoch": 0.2334889926617745, "loss": 0.7385631203651428, "step": 700 }, { "ce_loss": 0.3456648886203766, "epoch": 0.2334889926617745, "step": 700 }, { "distill_loss": 0.09562936425209045, "epoch": 0.2334889926617745, "step": 700 }, { "epoch": 0.2334889926617745, "ref_ce_loss": 0.19819499552249908, "step": 700 }, { "epoch": 0.2334889926617745, "loss": 1.3640516996383667, "step": 700 }, { "ce_loss": 0.44929933547973633, "epoch": 0.2334889926617745, "step": 700 }, { "distill_loss": 0.09775910526514053, "epoch": 0.2334889926617745, "step": 700 }, { "epoch": 0.2334889926617745, "ref_ce_loss": 0.2679179906845093, "step": 700 }, { "epoch": 0.2334889926617745, "loss": 0.7409743666648865, "step": 700 }, { "ce_loss": 0.322372704744339, "epoch": 0.2334889926617745, "step": 700 }, { "distill_loss": 0.09759743511676788, "epoch": 0.2334889926617745, "step": 700 }, { "epoch": 0.2334889926617745, "ref_ce_loss": 0.19925224781036377, "step": 700 }, { "epoch": 0.23682454969979988, "loss": 1.076, "step": 710 }, { "epoch": 0.23682454969979988, "grad_norm": 3.4155962467193604, "step": 710 }, { "epoch": 0.23682454969979988, "learning_rate": 0.00029583333333333333, "step": 710 }, { "epoch": 0.23682454969979988, "loss": 0.8137826919555664, "step": 710 }, { "ce_loss": 0.3987344205379486, "epoch": 0.23682454969979988, "step": 710 }, { "distill_loss": 0.12683498859405518, "epoch": 0.23682454969979988, "step": 710 }, { "epoch": 0.23682454969979988, "ref_ce_loss": 0.17653390765190125, "step": 710 }, { "epoch": 0.23682454969979988, "loss": 0.8795384764671326, "step": 710 }, { "ce_loss": 0.4477671980857849, "epoch": 0.23682454969979988, "step": 710 }, { "distill_loss": 0.1383477747440338, "epoch": 0.23682454969979988, "step": 710 }, { "epoch": 0.23682454969979988, "ref_ce_loss": 0.1700534224510193, "step": 710 }, { "epoch": 0.23682454969979988, "loss": 0.7244356274604797, "step": 710 }, { "ce_loss": 0.3994516134262085, "epoch": 0.23682454969979988, "step": 710 }, { "distill_loss": 0.11458105593919754, "epoch": 0.23682454969979988, "step": 710 }, { "epoch": 0.23682454969979988, "ref_ce_loss": 0.16034944355487823, "step": 710 }, { "epoch": 0.23682454969979988, "loss": 1.2054007053375244, "step": 710 }, { "ce_loss": 0.31495407223701477, "epoch": 0.23682454969979988, "step": 710 }, { "distill_loss": 0.11541106551885605, "epoch": 0.23682454969979988, "step": 710 }, { "epoch": 0.23682454969979988, "ref_ce_loss": 0.15973275899887085, "step": 710 }, { "epoch": 0.24016010673782523, "loss": 0.9533, "step": 720 }, { "epoch": 0.24016010673782523, "grad_norm": 2.230461597442627, "step": 720 }, { "epoch": 0.24016010673782523, "learning_rate": 0.0003, "step": 720 }, { "epoch": 0.24016010673782523, "loss": 1.5545556545257568, "step": 720 }, { "ce_loss": 0.43651020526885986, "epoch": 0.24016010673782523, "step": 720 }, { "distill_loss": 0.09635943919420242, "epoch": 0.24016010673782523, "step": 720 }, { "epoch": 0.24016010673782523, "ref_ce_loss": 0.1669834852218628, "step": 720 }, { "epoch": 0.24016010673782523, "loss": 0.8339600563049316, "step": 720 }, { "ce_loss": 0.37342774868011475, "epoch": 0.24016010673782523, "step": 720 }, { "distill_loss": 0.10773156583309174, "epoch": 0.24016010673782523, "step": 720 }, { "epoch": 0.24016010673782523, "ref_ce_loss": 0.11810831725597382, "step": 720 }, { "epoch": 0.24016010673782523, "loss": 1.0487580299377441, "step": 720 }, { "ce_loss": 0.38172048330307007, "epoch": 0.24016010673782523, "step": 720 }, { "distill_loss": 0.09679360687732697, "epoch": 0.24016010673782523, "step": 720 }, { "epoch": 0.24016010673782523, "ref_ce_loss": 0.2521325349807739, "step": 720 }, { "epoch": 0.24016010673782523, "loss": 0.7091606259346008, "step": 720 }, { "ce_loss": 0.3137376010417938, "epoch": 0.24016010673782523, "step": 720 }, { "distill_loss": 0.08482564985752106, "epoch": 0.24016010673782523, "step": 720 }, { "epoch": 0.24016010673782523, "ref_ce_loss": 0.13457347452640533, "step": 720 }, { "epoch": 0.24349566377585058, "loss": 0.9342, "step": 730 }, { "epoch": 0.24349566377585058, "grad_norm": 6.439743518829346, "step": 730 }, { "epoch": 0.24349566377585058, "learning_rate": 0.00029999986322958505, "step": 730 }, { "epoch": 0.24349566377585058, "loss": 0.6478908061981201, "step": 730 }, { "ce_loss": 0.3937546908855438, "epoch": 0.24349566377585058, "step": 730 }, { "distill_loss": 0.11202051490545273, "epoch": 0.24349566377585058, "step": 730 }, { "epoch": 0.24349566377585058, "ref_ce_loss": 0.14208683371543884, "step": 730 }, { "epoch": 0.24349566377585058, "loss": 0.864059329032898, "step": 730 }, { "ce_loss": 0.3594917356967926, "epoch": 0.24349566377585058, "step": 730 }, { "distill_loss": 0.11220365017652512, "epoch": 0.24349566377585058, "step": 730 }, { "epoch": 0.24349566377585058, "ref_ce_loss": 0.17011480033397675, "step": 730 }, { "epoch": 0.24349566377585058, "loss": 1.1030160188674927, "step": 730 }, { "ce_loss": 0.2855687141418457, "epoch": 0.24349566377585058, "step": 730 }, { "distill_loss": 0.09748795628547668, "epoch": 0.24349566377585058, "step": 730 }, { "epoch": 0.24349566377585058, "ref_ce_loss": 0.20837855339050293, "step": 730 }, { "epoch": 0.24349566377585058, "loss": 0.8385856747627258, "step": 730 }, { "ce_loss": 0.4496913254261017, "epoch": 0.24349566377585058, "step": 730 }, { "distill_loss": 0.11531084030866623, "epoch": 0.24349566377585058, "step": 730 }, { "epoch": 0.24349566377585058, "ref_ce_loss": 0.1728518158197403, "step": 730 }, { "epoch": 0.24683122081387593, "loss": 0.9298, "step": 740 }, { "epoch": 0.24683122081387593, "grad_norm": 5.409254550933838, "step": 740 }, { "epoch": 0.24683122081387593, "learning_rate": 0.00029999945291858974, "step": 740 }, { "epoch": 0.24683122081387593, "loss": 0.9843995571136475, "step": 740 }, { "ce_loss": 0.37632668018341064, "epoch": 0.24683122081387593, "step": 740 }, { "distill_loss": 0.11238445341587067, "epoch": 0.24683122081387593, "step": 740 }, { "epoch": 0.24683122081387593, "ref_ce_loss": 0.20602065324783325, "step": 740 }, { "epoch": 0.24683122081387593, "loss": 1.8616141080856323, "step": 740 }, { "ce_loss": 0.46412500739097595, "epoch": 0.24683122081387593, "step": 740 }, { "distill_loss": 0.1517205834388733, "epoch": 0.24683122081387593, "step": 740 }, { "epoch": 0.24683122081387593, "ref_ce_loss": 0.18234631419181824, "step": 740 }, { "epoch": 0.24683122081387593, "loss": 0.7307447791099548, "step": 740 }, { "ce_loss": 0.33168455958366394, "epoch": 0.24683122081387593, "step": 740 }, { "distill_loss": 0.14137397706508636, "epoch": 0.24683122081387593, "step": 740 }, { "epoch": 0.24683122081387593, "ref_ce_loss": 0.1374725103378296, "step": 740 }, { "epoch": 0.24683122081387593, "loss": 1.3032493591308594, "step": 740 }, { "ce_loss": 0.3302554488182068, "epoch": 0.24683122081387593, "step": 740 }, { "distill_loss": 0.1149841845035553, "epoch": 0.24683122081387593, "step": 740 }, { "epoch": 0.24683122081387593, "ref_ce_loss": 0.16944146156311035, "step": 740 }, { "epoch": 0.2501667778519013, "loss": 1.0939, "step": 750 }, { "epoch": 0.2501667778519013, "grad_norm": 2.4583041667938232, "step": 750 }, { "epoch": 0.2501667778519013, "learning_rate": 0.0002999987690677622, "step": 750 }, { "epoch": 0.2501667778519013, "loss": 1.895957589149475, "step": 750 }, { "ce_loss": 0.4463769793510437, "epoch": 0.2501667778519013, "step": 750 }, { "distill_loss": 0.1424015462398529, "epoch": 0.2501667778519013, "step": 750 }, { "epoch": 0.2501667778519013, "ref_ce_loss": 0.28784894943237305, "step": 750 }, { "epoch": 0.2501667778519013, "loss": 0.755088210105896, "step": 750 }, { "ce_loss": 0.32497096061706543, "epoch": 0.2501667778519013, "step": 750 }, { "distill_loss": 0.11568926274776459, "epoch": 0.2501667778519013, "step": 750 }, { "epoch": 0.2501667778519013, "ref_ce_loss": 0.3128649592399597, "step": 750 }, { "epoch": 0.2501667778519013, "loss": 0.8456001877784729, "step": 750 }, { "ce_loss": 0.38667964935302734, "epoch": 0.2501667778519013, "step": 750 }, { "distill_loss": 0.11649499833583832, "epoch": 0.2501667778519013, "step": 750 }, { "epoch": 0.2501667778519013, "ref_ce_loss": 0.2550145983695984, "step": 750 }, { "epoch": 0.2501667778519013, "loss": 0.7525032758712769, "step": 750 }, { "ce_loss": 0.4029221832752228, "epoch": 0.2501667778519013, "step": 750 }, { "distill_loss": 0.12815462052822113, "epoch": 0.2501667778519013, "step": 750 }, { "epoch": 0.2501667778519013, "ref_ce_loss": 0.22105097770690918, "step": 750 }, { "epoch": 0.25350233488992663, "loss": 1.1464, "step": 760 }, { "epoch": 0.25350233488992663, "grad_norm": 6.5162835121154785, "step": 760 }, { "epoch": 0.25350233488992663, "learning_rate": 0.0002999978116783497, "step": 760 }, { "epoch": 0.25350233488992663, "loss": 1.1063671112060547, "step": 760 }, { "ce_loss": 0.3920098841190338, "epoch": 0.25350233488992663, "step": 760 }, { "distill_loss": 0.48732489347457886, "epoch": 0.25350233488992663, "step": 760 }, { "epoch": 0.25350233488992663, "ref_ce_loss": 0.2269572615623474, "step": 760 }, { "epoch": 0.25350233488992663, "loss": 2.2254278659820557, "step": 760 }, { "ce_loss": 0.4064970314502716, "epoch": 0.25350233488992663, "step": 760 }, { "distill_loss": 0.519298255443573, "epoch": 0.25350233488992663, "step": 760 }, { "epoch": 0.25350233488992663, "ref_ce_loss": 0.2351590245962143, "step": 760 }, { "epoch": 0.25350233488992663, "loss": 1.4916338920593262, "step": 760 }, { "ce_loss": 0.4203934073448181, "epoch": 0.25350233488992663, "step": 760 }, { "distill_loss": 0.5812166929244995, "epoch": 0.25350233488992663, "step": 760 }, { "epoch": 0.25350233488992663, "ref_ce_loss": 0.19478872418403625, "step": 760 }, { "epoch": 0.25350233488992663, "loss": 1.521181583404541, "step": 760 }, { "ce_loss": 0.41617247462272644, "epoch": 0.25350233488992663, "step": 760 }, { "distill_loss": 0.5565536618232727, "epoch": 0.25350233488992663, "step": 760 }, { "epoch": 0.25350233488992663, "ref_ce_loss": 0.20470012724399567, "step": 760 }, { "epoch": 0.256837891927952, "loss": 1.2578, "step": 770 }, { "epoch": 0.256837891927952, "grad_norm": 3.5118868350982666, "step": 770 }, { "epoch": 0.256837891927952, "learning_rate": 0.00029999658075209785, "step": 770 }, { "epoch": 0.256837891927952, "loss": 1.4006614685058594, "step": 770 }, { "ce_loss": 0.40021345019340515, "epoch": 0.256837891927952, "step": 770 }, { "distill_loss": 0.6183995008468628, "epoch": 0.256837891927952, "step": 770 }, { "epoch": 0.256837891927952, "ref_ce_loss": 0.1635797619819641, "step": 770 }, { "epoch": 0.256837891927952, "loss": 2.101757526397705, "step": 770 }, { "ce_loss": 0.3580198287963867, "epoch": 0.256837891927952, "step": 770 }, { "distill_loss": 0.5030682682991028, "epoch": 0.256837891927952, "step": 770 }, { "epoch": 0.256837891927952, "ref_ce_loss": 0.19172348082065582, "step": 770 }, { "epoch": 0.256837891927952, "loss": 1.6819496154785156, "step": 770 }, { "ce_loss": 0.35157284140586853, "epoch": 0.256837891927952, "step": 770 }, { "distill_loss": 0.5677676796913147, "epoch": 0.256837891927952, "step": 770 }, { "epoch": 0.256837891927952, "ref_ce_loss": 0.19775229692459106, "step": 770 }, { "epoch": 0.256837891927952, "loss": 1.4923537969589233, "step": 770 }, { "ce_loss": 0.3099478781223297, "epoch": 0.256837891927952, "step": 770 }, { "distill_loss": 0.4962100386619568, "epoch": 0.256837891927952, "step": 770 }, { "epoch": 0.256837891927952, "ref_ce_loss": 0.25995033979415894, "step": 770 }, { "epoch": 0.2601734489659773, "loss": 1.1482, "step": 780 }, { "epoch": 0.2601734489659773, "grad_norm": 4.092563152313232, "step": 780 }, { "epoch": 0.2601734489659773, "learning_rate": 0.0002999950762912516, "step": 780 }, { "epoch": 0.2601734489659773, "loss": 0.9934489727020264, "step": 780 }, { "ce_loss": 0.4670727849006653, "epoch": 0.2601734489659773, "step": 780 }, { "distill_loss": 0.21604394912719727, "epoch": 0.2601734489659773, "step": 780 }, { "epoch": 0.2601734489659773, "ref_ce_loss": 0.21766111254692078, "step": 780 }, { "epoch": 0.2601734489659773, "loss": 0.7785146832466125, "step": 780 }, { "ce_loss": 0.4251066744327545, "epoch": 0.2601734489659773, "step": 780 }, { "distill_loss": 0.23343425989151, "epoch": 0.2601734489659773, "step": 780 }, { "epoch": 0.2601734489659773, "ref_ce_loss": 0.11992902308702469, "step": 780 }, { "epoch": 0.2601734489659773, "loss": 0.9308755397796631, "step": 780 }, { "ce_loss": 0.43334394693374634, "epoch": 0.2601734489659773, "step": 780 }, { "distill_loss": 0.22830483317375183, "epoch": 0.2601734489659773, "step": 780 }, { "epoch": 0.2601734489659773, "ref_ce_loss": 0.15851473808288574, "step": 780 }, { "epoch": 0.2601734489659773, "loss": 1.02692711353302, "step": 780 }, { "ce_loss": 0.4778960645198822, "epoch": 0.2601734489659773, "step": 780 }, { "distill_loss": 0.27986395359039307, "epoch": 0.2601734489659773, "step": 780 }, { "epoch": 0.2601734489659773, "ref_ce_loss": 0.14882494509220123, "step": 780 }, { "epoch": 0.2635090060040027, "loss": 1.1524, "step": 790 }, { "epoch": 0.2635090060040027, "grad_norm": 5.4341020584106445, "step": 790 }, { "epoch": 0.2635090060040027, "learning_rate": 0.00029999329829855445, "step": 790 }, { "epoch": 0.2635090060040027, "loss": 0.7699970006942749, "step": 790 }, { "ce_loss": 0.3887271285057068, "epoch": 0.2635090060040027, "step": 790 }, { "distill_loss": 0.1653989851474762, "epoch": 0.2635090060040027, "step": 790 }, { "epoch": 0.2635090060040027, "ref_ce_loss": 0.21547864377498627, "step": 790 }, { "epoch": 0.2635090060040027, "loss": 0.9414461851119995, "step": 790 }, { "ce_loss": 0.44151678681373596, "epoch": 0.2635090060040027, "step": 790 }, { "distill_loss": 0.15568041801452637, "epoch": 0.2635090060040027, "step": 790 }, { "epoch": 0.2635090060040027, "ref_ce_loss": 0.1594770848751068, "step": 790 }, { "epoch": 0.2635090060040027, "loss": 0.846574068069458, "step": 790 }, { "ce_loss": 0.36872443556785583, "epoch": 0.2635090060040027, "step": 790 }, { "distill_loss": 0.15134361386299133, "epoch": 0.2635090060040027, "step": 790 }, { "epoch": 0.2635090060040027, "ref_ce_loss": 0.21225082874298096, "step": 790 }, { "epoch": 0.2635090060040027, "loss": 1.0768730640411377, "step": 790 }, { "ce_loss": 0.4060359299182892, "epoch": 0.2635090060040027, "step": 790 }, { "distill_loss": 0.1718452423810959, "epoch": 0.2635090060040027, "step": 790 }, { "epoch": 0.2635090060040027, "ref_ce_loss": 0.1805437058210373, "step": 790 }, { "epoch": 0.266844563042028, "loss": 0.9391, "step": 800 }, { "epoch": 0.266844563042028, "grad_norm": 2.150057554244995, "step": 800 }, { "epoch": 0.266844563042028, "learning_rate": 0.0002999912467772487, "step": 800 }, { "epoch": 0.266844563042028, "loss": 0.8158339262008667, "step": 800 }, { "ce_loss": 0.4157412648200989, "epoch": 0.266844563042028, "step": 800 }, { "distill_loss": 0.14980031549930573, "epoch": 0.266844563042028, "step": 800 }, { "epoch": 0.266844563042028, "ref_ce_loss": 0.15453822910785675, "step": 800 }, { "epoch": 0.266844563042028, "loss": 0.7618894577026367, "step": 800 }, { "ce_loss": 0.34882333874702454, "epoch": 0.266844563042028, "step": 800 }, { "distill_loss": 0.14651793241500854, "epoch": 0.266844563042028, "step": 800 }, { "epoch": 0.266844563042028, "ref_ce_loss": 0.13346315920352936, "step": 800 }, { "epoch": 0.266844563042028, "loss": 1.3546881675720215, "step": 800 }, { "ce_loss": 0.43503326177597046, "epoch": 0.266844563042028, "step": 800 }, { "distill_loss": 0.1588219702243805, "epoch": 0.266844563042028, "step": 800 }, { "epoch": 0.266844563042028, "ref_ce_loss": 0.254705548286438, "step": 800 }, { "epoch": 0.266844563042028, "loss": 1.3324687480926514, "step": 800 }, { "ce_loss": 0.446909099817276, "epoch": 0.266844563042028, "step": 800 }, { "distill_loss": 0.1556074321269989, "epoch": 0.266844563042028, "step": 800 }, { "epoch": 0.266844563042028, "ref_ce_loss": 0.12691357731819153, "step": 800 }, { "epoch": 0.2701801200800534, "loss": 0.9072, "step": 810 }, { "epoch": 0.2701801200800534, "grad_norm": 3.318474054336548, "step": 810 }, { "epoch": 0.2701801200800534, "learning_rate": 0.0002999889217310755, "step": 810 }, { "epoch": 0.2701801200800534, "loss": 0.8637751340866089, "step": 810 }, { "ce_loss": 0.4324488341808319, "epoch": 0.2701801200800534, "step": 810 }, { "distill_loss": 0.19835472106933594, "epoch": 0.2701801200800534, "step": 810 }, { "epoch": 0.2701801200800534, "ref_ce_loss": 0.23256337642669678, "step": 810 }, { "epoch": 0.2701801200800534, "loss": 0.8926365375518799, "step": 810 }, { "ce_loss": 0.3715226352214813, "epoch": 0.2701801200800534, "step": 810 }, { "distill_loss": 0.24980111420154572, "epoch": 0.2701801200800534, "step": 810 }, { "epoch": 0.2701801200800534, "ref_ce_loss": 0.27107831835746765, "step": 810 }, { "epoch": 0.2701801200800534, "loss": 1.0381509065628052, "step": 810 }, { "ce_loss": 0.42502278089523315, "epoch": 0.2701801200800534, "step": 810 }, { "distill_loss": 0.22801654040813446, "epoch": 0.2701801200800534, "step": 810 }, { "epoch": 0.2701801200800534, "ref_ce_loss": 0.18256254494190216, "step": 810 }, { "epoch": 0.2701801200800534, "loss": 1.7744220495224, "step": 810 }, { "ce_loss": 0.4806549549102783, "epoch": 0.2701801200800534, "step": 810 }, { "distill_loss": 0.18088483810424805, "epoch": 0.2701801200800534, "step": 810 }, { "epoch": 0.2701801200800534, "ref_ce_loss": 0.2571941316127777, "step": 810 }, { "epoch": 0.2735156771180787, "loss": 1.1348, "step": 820 }, { "epoch": 0.2735156771180787, "grad_norm": 4.989599227905273, "step": 820 }, { "epoch": 0.2735156771180787, "learning_rate": 0.00029998632316427493, "step": 820 }, { "epoch": 0.2735156771180787, "loss": 0.7250751852989197, "step": 820 }, { "ce_loss": 0.34602952003479004, "epoch": 0.2735156771180787, "step": 820 }, { "distill_loss": 0.1138123869895935, "epoch": 0.2735156771180787, "step": 820 }, { "epoch": 0.2735156771180787, "ref_ce_loss": 0.1507563441991806, "step": 820 }, { "epoch": 0.2735156771180787, "loss": 0.9217355251312256, "step": 820 }, { "ce_loss": 0.37067872285842896, "epoch": 0.2735156771180787, "step": 820 }, { "distill_loss": 0.12390212714672089, "epoch": 0.2735156771180787, "step": 820 }, { "epoch": 0.2735156771180787, "ref_ce_loss": 0.1725834459066391, "step": 820 }, { "epoch": 0.2735156771180787, "loss": 0.6601697206497192, "step": 820 }, { "ce_loss": 0.35830461978912354, "epoch": 0.2735156771180787, "step": 820 }, { "distill_loss": 0.1254177987575531, "epoch": 0.2735156771180787, "step": 820 }, { "epoch": 0.2735156771180787, "ref_ce_loss": 0.17583151161670685, "step": 820 }, { "epoch": 0.2735156771180787, "loss": 0.9171438217163086, "step": 820 }, { "ce_loss": 0.3662046194076538, "epoch": 0.2735156771180787, "step": 820 }, { "distill_loss": 0.10298191010951996, "epoch": 0.2735156771180787, "step": 820 }, { "epoch": 0.2735156771180787, "ref_ce_loss": 0.16088753938674927, "step": 820 }, { "epoch": 0.2768512341561041, "loss": 1.0359, "step": 830 }, { "epoch": 0.2768512341561041, "grad_norm": 6.688352584838867, "step": 830 }, { "epoch": 0.2768512341561041, "learning_rate": 0.0002999834510815857, "step": 830 }, { "epoch": 0.2768512341561041, "loss": 0.6852495670318604, "step": 830 }, { "ce_loss": 0.27872976660728455, "epoch": 0.2768512341561041, "step": 830 }, { "distill_loss": 0.10291218012571335, "epoch": 0.2768512341561041, "step": 830 }, { "epoch": 0.2768512341561041, "ref_ce_loss": 0.1563701629638672, "step": 830 }, { "epoch": 0.2768512341561041, "loss": 1.3477394580841064, "step": 830 }, { "ce_loss": 0.3723379969596863, "epoch": 0.2768512341561041, "step": 830 }, { "distill_loss": 0.12437531352043152, "epoch": 0.2768512341561041, "step": 830 }, { "epoch": 0.2768512341561041, "ref_ce_loss": 0.22796234488487244, "step": 830 }, { "epoch": 0.2768512341561041, "loss": 0.8273101449012756, "step": 830 }, { "ce_loss": 0.37694263458251953, "epoch": 0.2768512341561041, "step": 830 }, { "distill_loss": 0.108419269323349, "epoch": 0.2768512341561041, "step": 830 }, { "epoch": 0.2768512341561041, "ref_ce_loss": 0.2073294222354889, "step": 830 }, { "epoch": 0.2768512341561041, "loss": 1.1366130113601685, "step": 830 }, { "ce_loss": 0.5759130120277405, "epoch": 0.2768512341561041, "step": 830 }, { "distill_loss": 0.12910136580467224, "epoch": 0.2768512341561041, "step": 830 }, { "epoch": 0.2768512341561041, "ref_ce_loss": 0.2354922592639923, "step": 830 }, { "epoch": 0.2801867911941294, "loss": 1.2926, "step": 840 }, { "epoch": 0.2801867911941294, "grad_norm": 48.90994644165039, "step": 840 }, { "epoch": 0.2801867911941294, "learning_rate": 0.00029998030548824525, "step": 840 }, { "epoch": 0.2801867911941294, "loss": 1.7805827856063843, "step": 840 }, { "ce_loss": 0.3729347884654999, "epoch": 0.2801867911941294, "step": 840 }, { "distill_loss": 1.0355364084243774, "epoch": 0.2801867911941294, "step": 840 }, { "epoch": 0.2801867911941294, "ref_ce_loss": 0.2145635336637497, "step": 840 }, { "epoch": 0.2801867911941294, "loss": 1.8465008735656738, "step": 840 }, { "ce_loss": 0.4972056746482849, "epoch": 0.2801867911941294, "step": 840 }, { "distill_loss": 1.098988652229309, "epoch": 0.2801867911941294, "step": 840 }, { "epoch": 0.2801867911941294, "ref_ce_loss": 0.25016623735427856, "step": 840 }, { "epoch": 0.2801867911941294, "loss": 2.252458095550537, "step": 840 }, { "ce_loss": 0.49673929810523987, "epoch": 0.2801867911941294, "step": 840 }, { "distill_loss": 1.036517858505249, "epoch": 0.2801867911941294, "step": 840 }, { "epoch": 0.2801867911941294, "ref_ce_loss": 0.27009356021881104, "step": 840 }, { "epoch": 0.2801867911941294, "loss": 2.5026607513427734, "step": 840 }, { "ce_loss": 0.4665423333644867, "epoch": 0.2801867911941294, "step": 840 }, { "distill_loss": 0.9117804765701294, "epoch": 0.2801867911941294, "step": 840 }, { "epoch": 0.2801867911941294, "ref_ce_loss": 0.27563977241516113, "step": 840 }, { "epoch": 0.2835223482321548, "loss": 1.4018, "step": 850 }, { "epoch": 0.2835223482321548, "grad_norm": 2.755733013153076, "step": 850 }, { "epoch": 0.2835223482321548, "learning_rate": 0.0002999768863899901, "step": 850 }, { "epoch": 0.2835223482321548, "loss": 1.6243078708648682, "step": 850 }, { "ce_loss": 0.3918367624282837, "epoch": 0.2835223482321548, "step": 850 }, { "distill_loss": 0.7303428053855896, "epoch": 0.2835223482321548, "step": 850 }, { "epoch": 0.2835223482321548, "ref_ce_loss": 0.17016659677028656, "step": 850 }, { "epoch": 0.2835223482321548, "loss": 1.8513051271438599, "step": 850 }, { "ce_loss": 0.32263848185539246, "epoch": 0.2835223482321548, "step": 850 }, { "distill_loss": 0.661507248878479, "epoch": 0.2835223482321548, "step": 850 }, { "epoch": 0.2835223482321548, "ref_ce_loss": 0.14734257757663727, "step": 850 }, { "epoch": 0.2835223482321548, "loss": 1.4546396732330322, "step": 850 }, { "ce_loss": 0.3858022689819336, "epoch": 0.2835223482321548, "step": 850 }, { "distill_loss": 0.6699823141098022, "epoch": 0.2835223482321548, "step": 850 }, { "epoch": 0.2835223482321548, "ref_ce_loss": 0.16758663952350616, "step": 850 }, { "epoch": 0.2835223482321548, "loss": 1.7295818328857422, "step": 850 }, { "ce_loss": 0.3436650037765503, "epoch": 0.2835223482321548, "step": 850 }, { "distill_loss": 0.7292658686637878, "epoch": 0.2835223482321548, "step": 850 }, { "epoch": 0.2835223482321548, "ref_ce_loss": 0.155188649892807, "step": 850 }, { "epoch": 0.2868579052701801, "loss": 1.229, "step": 860 }, { "epoch": 0.2868579052701801, "grad_norm": 3.504427194595337, "step": 860 }, { "epoch": 0.2868579052701801, "learning_rate": 0.00029997319379305515, "step": 860 }, { "epoch": 0.2868579052701801, "loss": 0.9320521354675293, "step": 860 }, { "ce_loss": 0.28551921248435974, "epoch": 0.2868579052701801, "step": 860 }, { "distill_loss": 0.2454386055469513, "epoch": 0.2868579052701801, "step": 860 }, { "epoch": 0.2868579052701801, "ref_ce_loss": 0.19818471372127533, "step": 860 }, { "epoch": 0.2868579052701801, "loss": 1.5051312446594238, "step": 860 }, { "ce_loss": 0.42141714692115784, "epoch": 0.2868579052701801, "step": 860 }, { "distill_loss": 0.29946091771125793, "epoch": 0.2868579052701801, "step": 860 }, { "epoch": 0.2868579052701801, "ref_ce_loss": 0.1941777914762497, "step": 860 }, { "epoch": 0.2868579052701801, "loss": 0.8788027763366699, "step": 860 }, { "ce_loss": 0.3769928216934204, "epoch": 0.2868579052701801, "step": 860 }, { "distill_loss": 0.24679483473300934, "epoch": 0.2868579052701801, "step": 860 }, { "epoch": 0.2868579052701801, "ref_ce_loss": 0.17477509379386902, "step": 860 }, { "epoch": 0.2868579052701801, "loss": 1.0972120761871338, "step": 860 }, { "ce_loss": 0.3971550464630127, "epoch": 0.2868579052701801, "step": 860 }, { "distill_loss": 0.2257571965456009, "epoch": 0.2868579052701801, "step": 860 }, { "epoch": 0.2868579052701801, "ref_ce_loss": 0.23073622584342957, "step": 860 }, { "epoch": 0.2901934623082055, "loss": 1.2198, "step": 870 }, { "epoch": 0.2901934623082055, "grad_norm": 2.217575788497925, "step": 870 }, { "epoch": 0.2901934623082055, "learning_rate": 0.00029996922770417434, "step": 870 }, { "epoch": 0.2901934623082055, "loss": 1.7647467851638794, "step": 870 }, { "ce_loss": 0.39066219329833984, "epoch": 0.2901934623082055, "step": 870 }, { "distill_loss": 0.27164989709854126, "epoch": 0.2901934623082055, "step": 870 }, { "epoch": 0.2901934623082055, "ref_ce_loss": 0.16236373782157898, "step": 870 }, { "epoch": 0.2901934623082055, "loss": 1.2479718923568726, "step": 870 }, { "ce_loss": 0.39999279379844666, "epoch": 0.2901934623082055, "step": 870 }, { "distill_loss": 0.3134003281593323, "epoch": 0.2901934623082055, "step": 870 }, { "epoch": 0.2901934623082055, "ref_ce_loss": 0.23171456158161163, "step": 870 }, { "epoch": 0.2901934623082055, "loss": 1.099112868309021, "step": 870 }, { "ce_loss": 0.4728688895702362, "epoch": 0.2901934623082055, "step": 870 }, { "distill_loss": 0.32087886333465576, "epoch": 0.2901934623082055, "step": 870 }, { "epoch": 0.2901934623082055, "ref_ce_loss": 0.16885533928871155, "step": 870 }, { "epoch": 0.2901934623082055, "loss": 1.2090306282043457, "step": 870 }, { "ce_loss": 0.32257935404777527, "epoch": 0.2901934623082055, "step": 870 }, { "distill_loss": 0.2634151577949524, "epoch": 0.2901934623082055, "step": 870 }, { "epoch": 0.2901934623082055, "ref_ce_loss": 0.19036482274532318, "step": 870 }, { "epoch": 0.2935290193462308, "loss": 1.0661, "step": 880 }, { "epoch": 0.2935290193462308, "grad_norm": 2.9561502933502197, "step": 880 }, { "epoch": 0.2935290193462308, "learning_rate": 0.00029996498813058024, "step": 880 }, { "epoch": 0.2935290193462308, "loss": 0.9875502586364746, "step": 880 }, { "ce_loss": 0.4626186788082123, "epoch": 0.2935290193462308, "step": 880 }, { "distill_loss": 0.2516644597053528, "epoch": 0.2935290193462308, "step": 880 }, { "epoch": 0.2935290193462308, "ref_ce_loss": 0.17623616755008698, "step": 880 }, { "epoch": 0.2935290193462308, "loss": 0.6726231575012207, "step": 880 }, { "ce_loss": 0.2600361108779907, "epoch": 0.2935290193462308, "step": 880 }, { "distill_loss": 0.1706208437681198, "epoch": 0.2935290193462308, "step": 880 }, { "epoch": 0.2935290193462308, "ref_ce_loss": 0.15342611074447632, "step": 880 }, { "epoch": 0.2935290193462308, "loss": 0.5651594996452332, "step": 880 }, { "ce_loss": 0.24951525032520294, "epoch": 0.2935290193462308, "step": 880 }, { "distill_loss": 0.13484697043895721, "epoch": 0.2935290193462308, "step": 880 }, { "epoch": 0.2935290193462308, "ref_ce_loss": 0.1341100037097931, "step": 880 }, { "epoch": 0.2935290193462308, "loss": 1.158497929573059, "step": 880 }, { "ce_loss": 0.2759685218334198, "epoch": 0.2935290193462308, "step": 880 }, { "distill_loss": 0.20097249746322632, "epoch": 0.2935290193462308, "step": 880 }, { "epoch": 0.2935290193462308, "ref_ce_loss": 0.18886953592300415, "step": 880 }, { "epoch": 0.2968645763842562, "loss": 1.0493, "step": 890 }, { "epoch": 0.2968645763842562, "grad_norm": 4.124588966369629, "step": 890 }, { "epoch": 0.2968645763842562, "learning_rate": 0.0002999604750800042, "step": 890 }, { "epoch": 0.2968645763842562, "loss": 1.3233633041381836, "step": 890 }, { "ce_loss": 0.43904969096183777, "epoch": 0.2968645763842562, "step": 890 }, { "distill_loss": 0.20250177383422852, "epoch": 0.2968645763842562, "step": 890 }, { "epoch": 0.2968645763842562, "ref_ce_loss": 0.26501935720443726, "step": 890 }, { "epoch": 0.2968645763842562, "loss": 0.9193978905677795, "step": 890 }, { "ce_loss": 0.4037984013557434, "epoch": 0.2968645763842562, "step": 890 }, { "distill_loss": 0.17271189391613007, "epoch": 0.2968645763842562, "step": 890 }, { "epoch": 0.2968645763842562, "ref_ce_loss": 0.21354354918003082, "step": 890 }, { "epoch": 0.2968645763842562, "loss": 0.8298638463020325, "step": 890 }, { "ce_loss": 0.4446713626384735, "epoch": 0.2968645763842562, "step": 890 }, { "distill_loss": 0.18691898882389069, "epoch": 0.2968645763842562, "step": 890 }, { "epoch": 0.2968645763842562, "ref_ce_loss": 0.19824057817459106, "step": 890 }, { "epoch": 0.2968645763842562, "loss": 0.8961043953895569, "step": 890 }, { "ce_loss": 0.3133203983306885, "epoch": 0.2968645763842562, "step": 890 }, { "distill_loss": 0.18369875848293304, "epoch": 0.2968645763842562, "step": 890 }, { "epoch": 0.2968645763842562, "ref_ce_loss": 0.14095714688301086, "step": 890 }, { "epoch": 0.3002001334222815, "loss": 0.9395, "step": 900 }, { "epoch": 0.3002001334222815, "grad_norm": 3.79996657371521, "step": 900 }, { "epoch": 0.3002001334222815, "learning_rate": 0.0002999556885606761, "step": 900 }, { "epoch": 0.3002001334222815, "loss": 0.4957481324672699, "step": 900 }, { "ce_loss": 0.2566486597061157, "epoch": 0.3002001334222815, "step": 900 }, { "distill_loss": 0.11270153522491455, "epoch": 0.3002001334222815, "step": 900 }, { "epoch": 0.3002001334222815, "ref_ce_loss": 0.1263445019721985, "step": 900 }, { "epoch": 0.3002001334222815, "loss": 0.7489277124404907, "step": 900 }, { "ce_loss": 0.32021090388298035, "epoch": 0.3002001334222815, "step": 900 }, { "distill_loss": 0.11752177029848099, "epoch": 0.3002001334222815, "step": 900 }, { "epoch": 0.3002001334222815, "ref_ce_loss": 0.1603088527917862, "step": 900 }, { "epoch": 0.3002001334222815, "loss": 0.8972264528274536, "step": 900 }, { "ce_loss": 0.41744017601013184, "epoch": 0.3002001334222815, "step": 900 }, { "distill_loss": 0.137289896607399, "epoch": 0.3002001334222815, "step": 900 }, { "epoch": 0.3002001334222815, "ref_ce_loss": 0.2510444223880768, "step": 900 }, { "epoch": 0.3002001334222815, "loss": 1.0207258462905884, "step": 900 }, { "ce_loss": 0.4178427755832672, "epoch": 0.3002001334222815, "step": 900 }, { "distill_loss": 0.1229424700140953, "epoch": 0.3002001334222815, "step": 900 }, { "epoch": 0.3002001334222815, "ref_ce_loss": 0.18919920921325684, "step": 900 }, { "epoch": 0.3035356904603069, "loss": 0.9636, "step": 910 }, { "epoch": 0.3035356904603069, "grad_norm": 2.994086742401123, "step": 910 }, { "epoch": 0.3035356904603069, "learning_rate": 0.00029995062858132485, "step": 910 }, { "epoch": 0.3035356904603069, "loss": 0.9765963554382324, "step": 910 }, { "ce_loss": 0.3894851505756378, "epoch": 0.3035356904603069, "step": 910 }, { "distill_loss": 0.1433676779270172, "epoch": 0.3035356904603069, "step": 910 }, { "epoch": 0.3035356904603069, "ref_ce_loss": 0.13310347497463226, "step": 910 }, { "epoch": 0.3035356904603069, "loss": 1.011177659034729, "step": 910 }, { "ce_loss": 0.3391077220439911, "epoch": 0.3035356904603069, "step": 910 }, { "distill_loss": 0.1197926327586174, "epoch": 0.3035356904603069, "step": 910 }, { "epoch": 0.3035356904603069, "ref_ce_loss": 0.19934594631195068, "step": 910 }, { "epoch": 0.3035356904603069, "loss": 0.6123791337013245, "step": 910 }, { "ce_loss": 0.26249223947525024, "epoch": 0.3035356904603069, "step": 910 }, { "distill_loss": 0.1554940640926361, "epoch": 0.3035356904603069, "step": 910 }, { "epoch": 0.3035356904603069, "ref_ce_loss": 0.08239661157131195, "step": 910 }, { "epoch": 0.3035356904603069, "loss": 1.058248519897461, "step": 910 }, { "ce_loss": 0.39104774594306946, "epoch": 0.3035356904603069, "step": 910 }, { "distill_loss": 0.1529640257358551, "epoch": 0.3035356904603069, "step": 910 }, { "epoch": 0.3035356904603069, "ref_ce_loss": 0.1721760630607605, "step": 910 }, { "epoch": 0.3068712474983322, "loss": 0.954, "step": 920 }, { "epoch": 0.3068712474983322, "grad_norm": 3.0830633640289307, "step": 920 }, { "epoch": 0.3068712474983322, "learning_rate": 0.00029994529515117767, "step": 920 }, { "epoch": 0.3068712474983322, "loss": 1.0534474849700928, "step": 920 }, { "ce_loss": 0.4601081311702728, "epoch": 0.3068712474983322, "step": 920 }, { "distill_loss": 0.1674957424402237, "epoch": 0.3068712474983322, "step": 920 }, { "epoch": 0.3068712474983322, "ref_ce_loss": 0.13673503696918488, "step": 920 }, { "epoch": 0.3068712474983322, "loss": 1.3385803699493408, "step": 920 }, { "ce_loss": 0.3786323368549347, "epoch": 0.3068712474983322, "step": 920 }, { "distill_loss": 0.16205266118049622, "epoch": 0.3068712474983322, "step": 920 }, { "epoch": 0.3068712474983322, "ref_ce_loss": 0.16322043538093567, "step": 920 }, { "epoch": 0.3068712474983322, "loss": 0.7872010469436646, "step": 920 }, { "ce_loss": 0.40525513887405396, "epoch": 0.3068712474983322, "step": 920 }, { "distill_loss": 0.15195631980895996, "epoch": 0.3068712474983322, "step": 920 }, { "epoch": 0.3068712474983322, "ref_ce_loss": 0.1313735991716385, "step": 920 }, { "epoch": 0.3068712474983322, "loss": 0.9474194049835205, "step": 920 }, { "ce_loss": 0.36184266209602356, "epoch": 0.3068712474983322, "step": 920 }, { "distill_loss": 0.1604013741016388, "epoch": 0.3068712474983322, "step": 920 }, { "epoch": 0.3068712474983322, "ref_ce_loss": 0.16650882363319397, "step": 920 }, { "epoch": 0.31020680453635757, "loss": 1.0615, "step": 930 }, { "epoch": 0.31020680453635757, "grad_norm": 6.1084465980529785, "step": 930 }, { "epoch": 0.31020680453635757, "learning_rate": 0.0002999396882799608, "step": 930 }, { "epoch": 0.31020680453635757, "loss": 0.8135707974433899, "step": 930 }, { "ce_loss": 0.48061808943748474, "epoch": 0.31020680453635757, "step": 930 }, { "distill_loss": 0.12107715755701065, "epoch": 0.31020680453635757, "step": 930 }, { "epoch": 0.31020680453635757, "ref_ce_loss": 0.21182826161384583, "step": 930 }, { "epoch": 0.31020680453635757, "loss": 1.6873639822006226, "step": 930 }, { "ce_loss": 0.4000921845436096, "epoch": 0.31020680453635757, "step": 930 }, { "distill_loss": 0.11861014366149902, "epoch": 0.31020680453635757, "step": 930 }, { "epoch": 0.31020680453635757, "ref_ce_loss": 0.19486136734485626, "step": 930 }, { "epoch": 0.31020680453635757, "loss": 0.6255282163619995, "step": 930 }, { "ce_loss": 0.29100140929222107, "epoch": 0.31020680453635757, "step": 930 }, { "distill_loss": 0.11383562535047531, "epoch": 0.31020680453635757, "step": 930 }, { "epoch": 0.31020680453635757, "ref_ce_loss": 0.16077548265457153, "step": 930 }, { "epoch": 0.31020680453635757, "loss": 1.0830172300338745, "step": 930 }, { "ce_loss": 0.406831830739975, "epoch": 0.31020680453635757, "step": 930 }, { "distill_loss": 0.12229843437671661, "epoch": 0.31020680453635757, "step": 930 }, { "epoch": 0.31020680453635757, "ref_ce_loss": 0.18925532698631287, "step": 930 }, { "epoch": 0.3135423615743829, "loss": 1.0012, "step": 940 }, { "epoch": 0.3135423615743829, "grad_norm": 2.763657808303833, "step": 940 }, { "epoch": 0.3135423615743829, "learning_rate": 0.00029993380797789884, "step": 940 }, { "epoch": 0.3135423615743829, "loss": 1.2902885675430298, "step": 940 }, { "ce_loss": 0.5113952159881592, "epoch": 0.3135423615743829, "step": 940 }, { "distill_loss": 0.15199342370033264, "epoch": 0.3135423615743829, "step": 940 }, { "epoch": 0.3135423615743829, "ref_ce_loss": 0.18690067529678345, "step": 940 }, { "epoch": 0.3135423615743829, "loss": 0.7129663825035095, "step": 940 }, { "ce_loss": 0.40314140915870667, "epoch": 0.3135423615743829, "step": 940 }, { "distill_loss": 0.1183619499206543, "epoch": 0.3135423615743829, "step": 940 }, { "epoch": 0.3135423615743829, "ref_ce_loss": 0.18934160470962524, "step": 940 }, { "epoch": 0.3135423615743829, "loss": 0.8741893768310547, "step": 940 }, { "ce_loss": 0.3346640467643738, "epoch": 0.3135423615743829, "step": 940 }, { "distill_loss": 0.13830089569091797, "epoch": 0.3135423615743829, "step": 940 }, { "epoch": 0.3135423615743829, "ref_ce_loss": 0.1432468295097351, "step": 940 }, { "epoch": 0.3135423615743829, "loss": 0.8341860771179199, "step": 940 }, { "ce_loss": 0.418517142534256, "epoch": 0.3135423615743829, "step": 940 }, { "distill_loss": 0.1469680368900299, "epoch": 0.3135423615743829, "step": 940 }, { "epoch": 0.3135423615743829, "ref_ce_loss": 0.13575048744678497, "step": 940 }, { "epoch": 0.31687791861240827, "loss": 0.9327, "step": 950 }, { "epoch": 0.31687791861240827, "grad_norm": 3.665161371231079, "step": 950 }, { "epoch": 0.31687791861240827, "learning_rate": 0.0002999276542557152, "step": 950 }, { "epoch": 0.31687791861240827, "loss": 0.7519707083702087, "step": 950 }, { "ce_loss": 0.302935928106308, "epoch": 0.31687791861240827, "step": 950 }, { "distill_loss": 0.13023529946804047, "epoch": 0.31687791861240827, "step": 950 }, { "epoch": 0.31687791861240827, "ref_ce_loss": 0.17454656958580017, "step": 950 }, { "epoch": 0.31687791861240827, "loss": 1.0623005628585815, "step": 950 }, { "ce_loss": 0.309317946434021, "epoch": 0.31687791861240827, "step": 950 }, { "distill_loss": 0.13614854216575623, "epoch": 0.31687791861240827, "step": 950 }, { "epoch": 0.31687791861240827, "ref_ce_loss": 0.20230692625045776, "step": 950 }, { "epoch": 0.31687791861240827, "loss": 1.529290795326233, "step": 950 }, { "ce_loss": 0.281059592962265, "epoch": 0.31687791861240827, "step": 950 }, { "distill_loss": 0.13188424706459045, "epoch": 0.31687791861240827, "step": 950 }, { "epoch": 0.31687791861240827, "ref_ce_loss": 0.19181722402572632, "step": 950 }, { "epoch": 0.31687791861240827, "loss": 1.0552257299423218, "step": 950 }, { "ce_loss": 0.3567151427268982, "epoch": 0.31687791861240827, "step": 950 }, { "distill_loss": 0.14000089466571808, "epoch": 0.31687791861240827, "step": 950 }, { "epoch": 0.31687791861240827, "ref_ce_loss": 0.18195955455303192, "step": 950 }, { "epoch": 0.3202134756504336, "loss": 0.9537, "step": 960 }, { "epoch": 0.3202134756504336, "grad_norm": 2.9725399017333984, "step": 960 }, { "epoch": 0.3202134756504336, "learning_rate": 0.00029992122712463185, "step": 960 }, { "epoch": 0.3202134756504336, "loss": 0.8172087073326111, "step": 960 }, { "ce_loss": 0.4168473482131958, "epoch": 0.3202134756504336, "step": 960 }, { "distill_loss": 0.12376371771097183, "epoch": 0.3202134756504336, "step": 960 }, { "epoch": 0.3202134756504336, "ref_ce_loss": 0.27427318692207336, "step": 960 }, { "epoch": 0.3202134756504336, "loss": 0.7152084708213806, "step": 960 }, { "ce_loss": 0.3916681706905365, "epoch": 0.3202134756504336, "step": 960 }, { "distill_loss": 0.1268494725227356, "epoch": 0.3202134756504336, "step": 960 }, { "epoch": 0.3202134756504336, "ref_ce_loss": 0.19664986431598663, "step": 960 }, { "epoch": 0.3202134756504336, "loss": 0.750626802444458, "step": 960 }, { "ce_loss": 0.358602911233902, "epoch": 0.3202134756504336, "step": 960 }, { "distill_loss": 0.12475446611642838, "epoch": 0.3202134756504336, "step": 960 }, { "epoch": 0.3202134756504336, "ref_ce_loss": 0.16163264214992523, "step": 960 }, { "epoch": 0.3202134756504336, "loss": 0.7146525979042053, "step": 960 }, { "ce_loss": 0.30217331647872925, "epoch": 0.3202134756504336, "step": 960 }, { "distill_loss": 0.13478901982307434, "epoch": 0.3202134756504336, "step": 960 }, { "epoch": 0.3202134756504336, "ref_ce_loss": 0.1618402898311615, "step": 960 }, { "epoch": 0.32354903268845897, "loss": 0.9339, "step": 970 }, { "epoch": 0.32354903268845897, "grad_norm": 4.9782819747924805, "step": 970 }, { "epoch": 0.32354903268845897, "learning_rate": 0.0002999145265963693, "step": 970 }, { "epoch": 0.32354903268845897, "loss": 0.7821298241615295, "step": 970 }, { "ce_loss": 0.4171198606491089, "epoch": 0.32354903268845897, "step": 970 }, { "distill_loss": 0.1375242918729782, "epoch": 0.32354903268845897, "step": 970 }, { "epoch": 0.32354903268845897, "ref_ce_loss": 0.17934392392635345, "step": 970 }, { "epoch": 0.32354903268845897, "loss": 1.0573623180389404, "step": 970 }, { "ce_loss": 0.42946135997772217, "epoch": 0.32354903268845897, "step": 970 }, { "distill_loss": 0.12328819185495377, "epoch": 0.32354903268845897, "step": 970 }, { "epoch": 0.32354903268845897, "ref_ce_loss": 0.16554884612560272, "step": 970 }, { "epoch": 0.32354903268845897, "loss": 1.2828457355499268, "step": 970 }, { "ce_loss": 0.4472983777523041, "epoch": 0.32354903268845897, "step": 970 }, { "distill_loss": 0.13271930813789368, "epoch": 0.32354903268845897, "step": 970 }, { "epoch": 0.32354903268845897, "ref_ce_loss": 0.201155424118042, "step": 970 }, { "epoch": 0.32354903268845897, "loss": 1.5741230249404907, "step": 970 }, { "ce_loss": 0.3056272864341736, "epoch": 0.32354903268845897, "step": 970 }, { "distill_loss": 0.10602132976055145, "epoch": 0.32354903268845897, "step": 970 }, { "epoch": 0.32354903268845897, "ref_ce_loss": 0.2535144090652466, "step": 970 }, { "epoch": 0.3268845897264843, "loss": 1.0272, "step": 980 }, { "epoch": 0.3268845897264843, "grad_norm": 6.052799701690674, "step": 980 }, { "epoch": 0.3268845897264843, "learning_rate": 0.00029990755268314667, "step": 980 }, { "epoch": 0.3268845897264843, "loss": 0.7132030129432678, "step": 980 }, { "ce_loss": 0.30206334590911865, "epoch": 0.3268845897264843, "step": 980 }, { "distill_loss": 0.10779759287834167, "epoch": 0.3268845897264843, "step": 980 }, { "epoch": 0.3268845897264843, "ref_ce_loss": 0.18162314593791962, "step": 980 }, { "epoch": 0.3268845897264843, "loss": 0.8841077089309692, "step": 980 }, { "ce_loss": 0.37281209230422974, "epoch": 0.3268845897264843, "step": 980 }, { "distill_loss": 0.11574500054121017, "epoch": 0.3268845897264843, "step": 980 }, { "epoch": 0.3268845897264843, "ref_ce_loss": 0.2086096853017807, "step": 980 }, { "epoch": 0.3268845897264843, "loss": 0.9464960098266602, "step": 980 }, { "ce_loss": 0.4134019613265991, "epoch": 0.3268845897264843, "step": 980 }, { "distill_loss": 0.1166180819272995, "epoch": 0.3268845897264843, "step": 980 }, { "epoch": 0.3268845897264843, "ref_ce_loss": 0.24008263647556305, "step": 980 }, { "epoch": 0.3268845897264843, "loss": 0.9239301681518555, "step": 980 }, { "ce_loss": 0.3819361627101898, "epoch": 0.3268845897264843, "step": 980 }, { "distill_loss": 0.10396760702133179, "epoch": 0.3268845897264843, "step": 980 }, { "epoch": 0.3268845897264843, "ref_ce_loss": 0.23532161116600037, "step": 980 }, { "epoch": 0.33022014676450967, "loss": 0.8154, "step": 990 }, { "epoch": 0.33022014676450967, "grad_norm": 4.464677333831787, "step": 990 }, { "epoch": 0.33022014676450967, "learning_rate": 0.00029990030539768167, "step": 990 }, { "epoch": 0.33022014676450967, "loss": 1.126884937286377, "step": 990 }, { "ce_loss": 0.2962307333946228, "epoch": 0.33022014676450967, "step": 990 }, { "distill_loss": 0.11264996230602264, "epoch": 0.33022014676450967, "step": 990 }, { "epoch": 0.33022014676450967, "ref_ce_loss": 0.15359817445278168, "step": 990 }, { "epoch": 0.33022014676450967, "loss": 0.7011892795562744, "step": 990 }, { "ce_loss": 0.35758545994758606, "epoch": 0.33022014676450967, "step": 990 }, { "distill_loss": 0.11551101505756378, "epoch": 0.33022014676450967, "step": 990 }, { "epoch": 0.33022014676450967, "ref_ce_loss": 0.12288648635149002, "step": 990 }, { "epoch": 0.33022014676450967, "loss": 0.7315992116928101, "step": 990 }, { "ce_loss": 0.29825419187545776, "epoch": 0.33022014676450967, "step": 990 }, { "distill_loss": 0.09923024475574493, "epoch": 0.33022014676450967, "step": 990 }, { "epoch": 0.33022014676450967, "ref_ce_loss": 0.15791910886764526, "step": 990 }, { "epoch": 0.33022014676450967, "loss": 1.1659493446350098, "step": 990 }, { "ce_loss": 0.4181416928768158, "epoch": 0.33022014676450967, "step": 990 }, { "distill_loss": 0.1343041956424713, "epoch": 0.33022014676450967, "step": 990 }, { "epoch": 0.33022014676450967, "ref_ce_loss": 0.12854261696338654, "step": 990 }, { "epoch": 0.333555703802535, "loss": 0.9506, "step": 1000 }, { "epoch": 0.333555703802535, "grad_norm": 8.558215141296387, "step": 1000 }, { "epoch": 0.333555703802535, "learning_rate": 0.0002998927847531905, "step": 1000 }, { "epoch": 0.333555703802535, "loss": 1.2239561080932617, "step": 1000 }, { "ce_loss": 0.3037545382976532, "epoch": 0.333555703802535, "step": 1000 }, { "distill_loss": 0.6090459823608398, "epoch": 0.333555703802535, "step": 1000 }, { "epoch": 0.333555703802535, "ref_ce_loss": 0.16965021193027496, "step": 1000 }, { "epoch": 0.333555703802535, "loss": 1.0540249347686768, "step": 1000 }, { "ce_loss": 0.3109924793243408, "epoch": 0.333555703802535, "step": 1000 }, { "distill_loss": 0.5928265452384949, "epoch": 0.333555703802535, "step": 1000 }, { "epoch": 0.333555703802535, "ref_ce_loss": 0.14983440935611725, "step": 1000 }, { "epoch": 0.333555703802535, "loss": 1.3430770635604858, "step": 1000 }, { "ce_loss": 0.37822142243385315, "epoch": 0.333555703802535, "step": 1000 }, { "distill_loss": 0.702450156211853, "epoch": 0.333555703802535, "step": 1000 }, { "epoch": 0.333555703802535, "ref_ce_loss": 0.1507684737443924, "step": 1000 }, { "epoch": 0.333555703802535, "loss": 1.3171063661575317, "step": 1000 }, { "ce_loss": 0.3266257643699646, "epoch": 0.333555703802535, "step": 1000 }, { "distill_loss": 0.7057124376296997, "epoch": 0.333555703802535, "step": 1000 }, { "epoch": 0.333555703802535, "ref_ce_loss": 0.1825946867465973, "step": 1000 }, { "epoch": 0.33689126084056037, "loss": 1.1746, "step": 1010 }, { "epoch": 0.33689126084056037, "grad_norm": 2.476720094680786, "step": 1010 }, { "epoch": 0.33689126084056037, "learning_rate": 0.0002998849907633878, "step": 1010 }, { "epoch": 0.33689126084056037, "loss": 0.8267805576324463, "step": 1010 }, { "ce_loss": 0.2990218698978424, "epoch": 0.33689126084056037, "step": 1010 }, { "distill_loss": 0.21271488070487976, "epoch": 0.33689126084056037, "step": 1010 }, { "epoch": 0.33689126084056037, "ref_ce_loss": 0.175185889005661, "step": 1010 }, { "epoch": 0.33689126084056037, "loss": 0.7139723896980286, "step": 1010 }, { "ce_loss": 0.3102499544620514, "epoch": 0.33689126084056037, "step": 1010 }, { "distill_loss": 0.22610852122306824, "epoch": 0.33689126084056037, "step": 1010 }, { "epoch": 0.33689126084056037, "ref_ce_loss": 0.1776013821363449, "step": 1010 }, { "epoch": 0.33689126084056037, "loss": 0.8357424736022949, "step": 1010 }, { "ce_loss": 0.3024822175502777, "epoch": 0.33689126084056037, "step": 1010 }, { "distill_loss": 0.2356206774711609, "epoch": 0.33689126084056037, "step": 1010 }, { "epoch": 0.33689126084056037, "ref_ce_loss": 0.2250850945711136, "step": 1010 }, { "epoch": 0.33689126084056037, "loss": 1.2665164470672607, "step": 1010 }, { "ce_loss": 0.3598532974720001, "epoch": 0.33689126084056037, "step": 1010 }, { "distill_loss": 0.24678495526313782, "epoch": 0.33689126084056037, "step": 1010 }, { "epoch": 0.33689126084056037, "ref_ce_loss": 0.157108873128891, "step": 1010 }, { "epoch": 0.3402268178785857, "loss": 1.0136, "step": 1020 }, { "epoch": 0.3402268178785857, "grad_norm": 3.2634665966033936, "step": 1020 }, { "epoch": 0.3402268178785857, "learning_rate": 0.0002998769234424868, "step": 1020 }, { "epoch": 0.3402268178785857, "loss": 0.7189880609512329, "step": 1020 }, { "ce_loss": 0.3308708369731903, "epoch": 0.3402268178785857, "step": 1020 }, { "distill_loss": 0.19060233235359192, "epoch": 0.3402268178785857, "step": 1020 }, { "epoch": 0.3402268178785857, "ref_ce_loss": 0.13148698210716248, "step": 1020 }, { "epoch": 0.3402268178785857, "loss": 0.709821879863739, "step": 1020 }, { "ce_loss": 0.33700650930404663, "epoch": 0.3402268178785857, "step": 1020 }, { "distill_loss": 0.19593364000320435, "epoch": 0.3402268178785857, "step": 1020 }, { "epoch": 0.3402268178785857, "ref_ce_loss": 0.1766604334115982, "step": 1020 }, { "epoch": 0.3402268178785857, "loss": 0.6329488158226013, "step": 1020 }, { "ce_loss": 0.3196988105773926, "epoch": 0.3402268178785857, "step": 1020 }, { "distill_loss": 0.19230948388576508, "epoch": 0.3402268178785857, "step": 1020 }, { "epoch": 0.3402268178785857, "ref_ce_loss": 0.1209319457411766, "step": 1020 }, { "epoch": 0.3402268178785857, "loss": 1.4003040790557861, "step": 1020 }, { "ce_loss": 0.3359232246875763, "epoch": 0.3402268178785857, "step": 1020 }, { "distill_loss": 0.22875024378299713, "epoch": 0.3402268178785857, "step": 1020 }, { "epoch": 0.3402268178785857, "ref_ce_loss": 0.18941046297550201, "step": 1020 }, { "epoch": 0.34356237491661107, "loss": 0.9281, "step": 1030 }, { "epoch": 0.34356237491661107, "grad_norm": 2.158473253250122, "step": 1030 }, { "epoch": 0.34356237491661107, "learning_rate": 0.00029986858280519897, "step": 1030 }, { "epoch": 0.34356237491661107, "loss": 1.0241984128952026, "step": 1030 }, { "ce_loss": 0.4109732210636139, "epoch": 0.34356237491661107, "step": 1030 }, { "distill_loss": 0.22403854131698608, "epoch": 0.34356237491661107, "step": 1030 }, { "epoch": 0.34356237491661107, "ref_ce_loss": 0.19617129862308502, "step": 1030 }, { "epoch": 0.34356237491661107, "loss": 0.9698517322540283, "step": 1030 }, { "ce_loss": 0.33989307284355164, "epoch": 0.34356237491661107, "step": 1030 }, { "distill_loss": 0.21469691395759583, "epoch": 0.34356237491661107, "step": 1030 }, { "epoch": 0.34356237491661107, "ref_ce_loss": 0.1773064285516739, "step": 1030 }, { "epoch": 0.34356237491661107, "loss": 1.1452717781066895, "step": 1030 }, { "ce_loss": 0.374833881855011, "epoch": 0.34356237491661107, "step": 1030 }, { "distill_loss": 0.21497607231140137, "epoch": 0.34356237491661107, "step": 1030 }, { "epoch": 0.34356237491661107, "ref_ce_loss": 0.20118488371372223, "step": 1030 }, { "epoch": 0.34356237491661107, "loss": 1.1008607149124146, "step": 1030 }, { "ce_loss": 0.367872416973114, "epoch": 0.34356237491661107, "step": 1030 }, { "distill_loss": 0.22034002840518951, "epoch": 0.34356237491661107, "step": 1030 }, { "epoch": 0.34356237491661107, "ref_ce_loss": 0.21571652591228485, "step": 1030 }, { "epoch": 0.3468979319546364, "loss": 1.1223, "step": 1040 }, { "epoch": 0.3468979319546364, "grad_norm": 5.033119201660156, "step": 1040 }, { "epoch": 0.3468979319546364, "learning_rate": 0.0002998599688667345, "step": 1040 }, { "epoch": 0.3468979319546364, "loss": 0.5317191481590271, "step": 1040 }, { "ce_loss": 0.2717324495315552, "epoch": 0.3468979319546364, "step": 1040 }, { "distill_loss": 0.13226278126239777, "epoch": 0.3468979319546364, "step": 1040 }, { "epoch": 0.3468979319546364, "ref_ce_loss": 0.12715670466423035, "step": 1040 }, { "epoch": 0.3468979319546364, "loss": 1.1311434507369995, "step": 1040 }, { "ce_loss": 0.5449992418289185, "epoch": 0.3468979319546364, "step": 1040 }, { "distill_loss": 0.2138533592224121, "epoch": 0.3468979319546364, "step": 1040 }, { "epoch": 0.3468979319546364, "ref_ce_loss": 0.22759371995925903, "step": 1040 }, { "epoch": 0.3468979319546364, "loss": 1.385339617729187, "step": 1040 }, { "ce_loss": 0.36298808455467224, "epoch": 0.3468979319546364, "step": 1040 }, { "distill_loss": 0.1468626856803894, "epoch": 0.3468979319546364, "step": 1040 }, { "epoch": 0.3468979319546364, "ref_ce_loss": 0.20583710074424744, "step": 1040 }, { "epoch": 0.3468979319546364, "loss": 1.0133357048034668, "step": 1040 }, { "ce_loss": 0.3378484845161438, "epoch": 0.3468979319546364, "step": 1040 }, { "distill_loss": 0.1804906278848648, "epoch": 0.3468979319546364, "step": 1040 }, { "epoch": 0.3468979319546364, "ref_ce_loss": 0.11686846613883972, "step": 1040 }, { "epoch": 0.35023348899266177, "loss": 0.9878, "step": 1050 }, { "epoch": 0.35023348899266177, "grad_norm": 2.773841381072998, "step": 1050 }, { "epoch": 0.35023348899266177, "learning_rate": 0.0002998510816428017, "step": 1050 }, { "epoch": 0.35023348899266177, "loss": 0.8605588674545288, "step": 1050 }, { "ce_loss": 0.40844660997390747, "epoch": 0.35023348899266177, "step": 1050 }, { "distill_loss": 0.1829022318124771, "epoch": 0.35023348899266177, "step": 1050 }, { "epoch": 0.35023348899266177, "ref_ce_loss": 0.1620185673236847, "step": 1050 }, { "epoch": 0.35023348899266177, "loss": 0.9875406622886658, "step": 1050 }, { "ce_loss": 0.5963196158409119, "epoch": 0.35023348899266177, "step": 1050 }, { "distill_loss": 0.21429228782653809, "epoch": 0.35023348899266177, "step": 1050 }, { "epoch": 0.35023348899266177, "ref_ce_loss": 0.17668293416500092, "step": 1050 }, { "epoch": 0.35023348899266177, "loss": 0.8259153962135315, "step": 1050 }, { "ce_loss": 0.36064836382865906, "epoch": 0.35023348899266177, "step": 1050 }, { "distill_loss": 0.17781072854995728, "epoch": 0.35023348899266177, "step": 1050 }, { "epoch": 0.35023348899266177, "ref_ce_loss": 0.18637806177139282, "step": 1050 }, { "epoch": 0.35023348899266177, "loss": 0.7744853496551514, "step": 1050 }, { "ce_loss": 0.38626617193222046, "epoch": 0.35023348899266177, "step": 1050 }, { "distill_loss": 0.18170593678951263, "epoch": 0.35023348899266177, "step": 1050 }, { "epoch": 0.35023348899266177, "ref_ce_loss": 0.1515505313873291, "step": 1050 }, { "epoch": 0.3535690460306871, "loss": 1.0142, "step": 1060 }, { "epoch": 0.3535690460306871, "grad_norm": 2.391183376312256, "step": 1060 }, { "epoch": 0.3535690460306871, "learning_rate": 0.00029984192114960746, "step": 1060 }, { "epoch": 0.3535690460306871, "loss": 0.7195358276367188, "step": 1060 }, { "ce_loss": 0.3371320962905884, "epoch": 0.3535690460306871, "step": 1060 }, { "distill_loss": 0.23293828964233398, "epoch": 0.3535690460306871, "step": 1060 }, { "epoch": 0.3535690460306871, "ref_ce_loss": 0.08528699725866318, "step": 1060 }, { "epoch": 0.3535690460306871, "loss": 1.3199244737625122, "step": 1060 }, { "ce_loss": 0.4546003043651581, "epoch": 0.3535690460306871, "step": 1060 }, { "distill_loss": 0.2744484543800354, "epoch": 0.3535690460306871, "step": 1060 }, { "epoch": 0.3535690460306871, "ref_ce_loss": 0.15281927585601807, "step": 1060 }, { "epoch": 0.3535690460306871, "loss": 0.916218101978302, "step": 1060 }, { "ce_loss": 0.3605182468891144, "epoch": 0.3535690460306871, "step": 1060 }, { "distill_loss": 0.2062976062297821, "epoch": 0.3535690460306871, "step": 1060 }, { "epoch": 0.3535690460306871, "ref_ce_loss": 0.19087523221969604, "step": 1060 }, { "epoch": 0.3535690460306871, "loss": 0.6971317529678345, "step": 1060 }, { "ce_loss": 0.27850502729415894, "epoch": 0.3535690460306871, "step": 1060 }, { "distill_loss": 0.20633718371391296, "epoch": 0.3535690460306871, "step": 1060 }, { "epoch": 0.3535690460306871, "ref_ce_loss": 0.21209193766117096, "step": 1060 }, { "epoch": 0.35690460306871247, "loss": 0.9138, "step": 1070 }, { "epoch": 0.35690460306871247, "grad_norm": 3.1012446880340576, "step": 1070 }, { "epoch": 0.35690460306871247, "learning_rate": 0.0002998324874038568, "step": 1070 }, { "epoch": 0.35690460306871247, "loss": 1.016005277633667, "step": 1070 }, { "ce_loss": 0.3927364647388458, "epoch": 0.35690460306871247, "step": 1070 }, { "distill_loss": 0.2100256383419037, "epoch": 0.35690460306871247, "step": 1070 }, { "epoch": 0.35690460306871247, "ref_ce_loss": 0.24908369779586792, "step": 1070 }, { "epoch": 0.35690460306871247, "loss": 0.8831821084022522, "step": 1070 }, { "ce_loss": 0.43767842650413513, "epoch": 0.35690460306871247, "step": 1070 }, { "distill_loss": 0.21271327137947083, "epoch": 0.35690460306871247, "step": 1070 }, { "epoch": 0.35690460306871247, "ref_ce_loss": 0.23278841376304626, "step": 1070 }, { "epoch": 0.35690460306871247, "loss": 0.774807870388031, "step": 1070 }, { "ce_loss": 0.4057586193084717, "epoch": 0.35690460306871247, "step": 1070 }, { "distill_loss": 0.22247371077537537, "epoch": 0.35690460306871247, "step": 1070 }, { "epoch": 0.35690460306871247, "ref_ce_loss": 0.14649169147014618, "step": 1070 }, { "epoch": 0.35690460306871247, "loss": 1.6678129434585571, "step": 1070 }, { "ce_loss": 0.4439554512500763, "epoch": 0.35690460306871247, "step": 1070 }, { "distill_loss": 0.22488150000572205, "epoch": 0.35690460306871247, "step": 1070 }, { "epoch": 0.35690460306871247, "ref_ce_loss": 0.23912842571735382, "step": 1070 }, { "epoch": 0.3602401601067378, "loss": 1.0485, "step": 1080 }, { "epoch": 0.3602401601067378, "grad_norm": 2.263382911682129, "step": 1080 }, { "epoch": 0.3602401601067378, "learning_rate": 0.00029982278042275327, "step": 1080 }, { "epoch": 0.3602401601067378, "loss": 0.9002568125724792, "step": 1080 }, { "ce_loss": 0.2690925598144531, "epoch": 0.3602401601067378, "step": 1080 }, { "distill_loss": 0.2799534499645233, "epoch": 0.3602401601067378, "step": 1080 }, { "epoch": 0.3602401601067378, "ref_ce_loss": 0.18060727417469025, "step": 1080 }, { "epoch": 0.3602401601067378, "loss": 0.8647176027297974, "step": 1080 }, { "ce_loss": 0.301247239112854, "epoch": 0.3602401601067378, "step": 1080 }, { "distill_loss": 0.3146838843822479, "epoch": 0.3602401601067378, "step": 1080 }, { "epoch": 0.3602401601067378, "ref_ce_loss": 0.15903924405574799, "step": 1080 }, { "epoch": 0.3602401601067378, "loss": 0.9686211943626404, "step": 1080 }, { "ce_loss": 0.38894566893577576, "epoch": 0.3602401601067378, "step": 1080 }, { "distill_loss": 0.3025593161582947, "epoch": 0.3602401601067378, "step": 1080 }, { "epoch": 0.3602401601067378, "ref_ce_loss": 0.18121132254600525, "step": 1080 }, { "epoch": 0.3602401601067378, "loss": 0.8153437376022339, "step": 1080 }, { "ce_loss": 0.27934902906417847, "epoch": 0.3602401601067378, "step": 1080 }, { "distill_loss": 0.3312188982963562, "epoch": 0.3602401601067378, "step": 1080 }, { "epoch": 0.3602401601067378, "ref_ce_loss": 0.1401250958442688, "step": 1080 }, { "epoch": 0.36357571714476317, "loss": 1.0589, "step": 1090 }, { "epoch": 0.36357571714476317, "grad_norm": 2.614572048187256, "step": 1090 }, { "epoch": 0.36357571714476317, "learning_rate": 0.0002998128002239985, "step": 1090 }, { "epoch": 0.36357571714476317, "loss": 0.9504167437553406, "step": 1090 }, { "ce_loss": 0.40660786628723145, "epoch": 0.36357571714476317, "step": 1090 }, { "distill_loss": 0.24065075814723969, "epoch": 0.36357571714476317, "step": 1090 }, { "epoch": 0.36357571714476317, "ref_ce_loss": 0.21729962527751923, "step": 1090 }, { "epoch": 0.36357571714476317, "loss": 0.8468886613845825, "step": 1090 }, { "ce_loss": 0.3686285614967346, "epoch": 0.36357571714476317, "step": 1090 }, { "distill_loss": 0.22304001450538635, "epoch": 0.36357571714476317, "step": 1090 }, { "epoch": 0.36357571714476317, "ref_ce_loss": 0.16232386231422424, "step": 1090 }, { "epoch": 0.36357571714476317, "loss": 0.8567818403244019, "step": 1090 }, { "ce_loss": 0.3550497889518738, "epoch": 0.36357571714476317, "step": 1090 }, { "distill_loss": 0.24365133047103882, "epoch": 0.36357571714476317, "step": 1090 }, { "epoch": 0.36357571714476317, "ref_ce_loss": 0.15234960615634918, "step": 1090 }, { "epoch": 0.36357571714476317, "loss": 0.9920378923416138, "step": 1090 }, { "ce_loss": 0.40550684928894043, "epoch": 0.36357571714476317, "step": 1090 }, { "distill_loss": 0.23939445614814758, "epoch": 0.36357571714476317, "step": 1090 }, { "epoch": 0.36357571714476317, "ref_ce_loss": 0.25587886571884155, "step": 1090 }, { "epoch": 0.3669112741827885, "loss": 0.9507, "step": 1100 }, { "epoch": 0.3669112741827885, "grad_norm": 1.8720557689666748, "step": 1100 }, { "epoch": 0.3669112741827885, "learning_rate": 0.00029980254682579244, "step": 1100 }, { "epoch": 0.3669112741827885, "loss": 0.6526878476142883, "step": 1100 }, { "ce_loss": 0.3227846920490265, "epoch": 0.3669112741827885, "step": 1100 }, { "distill_loss": 0.11304277926683426, "epoch": 0.3669112741827885, "step": 1100 }, { "epoch": 0.3669112741827885, "ref_ce_loss": 0.21555647253990173, "step": 1100 }, { "epoch": 0.3669112741827885, "loss": 0.9330704212188721, "step": 1100 }, { "ce_loss": 0.42695215344429016, "epoch": 0.3669112741827885, "step": 1100 }, { "distill_loss": 0.13858823478221893, "epoch": 0.3669112741827885, "step": 1100 }, { "epoch": 0.3669112741827885, "ref_ce_loss": 0.18709157407283783, "step": 1100 }, { "epoch": 0.3669112741827885, "loss": 0.6232830882072449, "step": 1100 }, { "ce_loss": 0.25201037526130676, "epoch": 0.3669112741827885, "step": 1100 }, { "distill_loss": 0.10899011045694351, "epoch": 0.3669112741827885, "step": 1100 }, { "epoch": 0.3669112741827885, "ref_ce_loss": 0.16215498745441437, "step": 1100 }, { "epoch": 0.3669112741827885, "loss": 0.6160975098609924, "step": 1100 }, { "ce_loss": 0.24335621297359467, "epoch": 0.3669112741827885, "step": 1100 }, { "distill_loss": 0.1022597998380661, "epoch": 0.3669112741827885, "step": 1100 }, { "epoch": 0.3669112741827885, "ref_ce_loss": 0.12327880412340164, "step": 1100 }, { "epoch": 0.37024683122081387, "loss": 0.8546, "step": 1110 }, { "epoch": 0.37024683122081387, "grad_norm": 2.0390474796295166, "step": 1110 }, { "epoch": 0.37024683122081387, "learning_rate": 0.00029979202024683324, "step": 1110 }, { "epoch": 0.37024683122081387, "loss": 0.7502092123031616, "step": 1110 }, { "ce_loss": 0.3670903742313385, "epoch": 0.37024683122081387, "step": 1110 }, { "distill_loss": 0.11122335493564606, "epoch": 0.37024683122081387, "step": 1110 }, { "epoch": 0.37024683122081387, "ref_ce_loss": 0.18666565418243408, "step": 1110 }, { "epoch": 0.37024683122081387, "loss": 0.7180501222610474, "step": 1110 }, { "ce_loss": 0.38356834650039673, "epoch": 0.37024683122081387, "step": 1110 }, { "distill_loss": 0.124627023935318, "epoch": 0.37024683122081387, "step": 1110 }, { "epoch": 0.37024683122081387, "ref_ce_loss": 0.2097601294517517, "step": 1110 }, { "epoch": 0.37024683122081387, "loss": 1.266775131225586, "step": 1110 }, { "ce_loss": 0.36160174012184143, "epoch": 0.37024683122081387, "step": 1110 }, { "distill_loss": 0.11643195152282715, "epoch": 0.37024683122081387, "step": 1110 }, { "epoch": 0.37024683122081387, "ref_ce_loss": 0.15776577591896057, "step": 1110 }, { "epoch": 0.37024683122081387, "loss": 0.6577326059341431, "step": 1110 }, { "ce_loss": 0.3060700297355652, "epoch": 0.37024683122081387, "step": 1110 }, { "distill_loss": 0.11561097204685211, "epoch": 0.37024683122081387, "step": 1110 }, { "epoch": 0.37024683122081387, "ref_ce_loss": 0.1511240005493164, "step": 1110 }, { "epoch": 0.3735823882588392, "loss": 1.115, "step": 1120 }, { "epoch": 0.3735823882588392, "grad_norm": 2.075615167617798, "step": 1120 }, { "epoch": 0.3735823882588392, "learning_rate": 0.00029978122050631725, "step": 1120 }, { "epoch": 0.3735823882588392, "loss": 0.7357965111732483, "step": 1120 }, { "ce_loss": 0.33645859360694885, "epoch": 0.3735823882588392, "step": 1120 }, { "distill_loss": 0.10484284907579422, "epoch": 0.3735823882588392, "step": 1120 }, { "epoch": 0.3735823882588392, "ref_ce_loss": 0.14064335823059082, "step": 1120 }, { "epoch": 0.3735823882588392, "loss": 0.5798081755638123, "step": 1120 }, { "ce_loss": 0.2527194023132324, "epoch": 0.3735823882588392, "step": 1120 }, { "distill_loss": 0.08029267936944962, "epoch": 0.3735823882588392, "step": 1120 }, { "epoch": 0.3735823882588392, "ref_ce_loss": 0.17915977537631989, "step": 1120 }, { "epoch": 0.3735823882588392, "loss": 0.9990452527999878, "step": 1120 }, { "ce_loss": 0.391522616147995, "epoch": 0.3735823882588392, "step": 1120 }, { "distill_loss": 0.10954640805721283, "epoch": 0.3735823882588392, "step": 1120 }, { "epoch": 0.3735823882588392, "ref_ce_loss": 0.19175396859645844, "step": 1120 }, { "epoch": 0.3735823882588392, "loss": 0.7248374819755554, "step": 1120 }, { "ce_loss": 0.41094255447387695, "epoch": 0.3735823882588392, "step": 1120 }, { "distill_loss": 0.11109843105077744, "epoch": 0.3735823882588392, "step": 1120 }, { "epoch": 0.3735823882588392, "ref_ce_loss": 0.2024851143360138, "step": 1120 }, { "epoch": 0.37691794529686456, "loss": 0.8209, "step": 1130 }, { "epoch": 0.37691794529686456, "grad_norm": 1.610378623008728, "step": 1130 }, { "epoch": 0.37691794529686456, "learning_rate": 0.00029977014762393894, "step": 1130 }, { "epoch": 0.37691794529686456, "loss": 1.7917901277542114, "step": 1130 }, { "ce_loss": 0.41128435730934143, "epoch": 0.37691794529686456, "step": 1130 }, { "distill_loss": 0.10002844035625458, "epoch": 0.37691794529686456, "step": 1130 }, { "epoch": 0.37691794529686456, "ref_ce_loss": 0.19286653399467468, "step": 1130 }, { "epoch": 0.37691794529686456, "loss": 0.7495785355567932, "step": 1130 }, { "ce_loss": 0.3824373483657837, "epoch": 0.37691794529686456, "step": 1130 }, { "distill_loss": 0.09886406362056732, "epoch": 0.37691794529686456, "step": 1130 }, { "epoch": 0.37691794529686456, "ref_ce_loss": 0.18395927548408508, "step": 1130 }, { "epoch": 0.37691794529686456, "loss": 0.9176914691925049, "step": 1130 }, { "ce_loss": 0.3707192540168762, "epoch": 0.37691794529686456, "step": 1130 }, { "distill_loss": 0.08182831108570099, "epoch": 0.37691794529686456, "step": 1130 }, { "epoch": 0.37691794529686456, "ref_ce_loss": 0.18617984652519226, "step": 1130 }, { "epoch": 0.37691794529686456, "loss": 1.0768955945968628, "step": 1130 }, { "ce_loss": 0.32718828320503235, "epoch": 0.37691794529686456, "step": 1130 }, { "distill_loss": 0.09349283576011658, "epoch": 0.37691794529686456, "step": 1130 }, { "epoch": 0.37691794529686456, "ref_ce_loss": 0.12686288356781006, "step": 1130 }, { "epoch": 0.3802535023348899, "loss": 0.9497, "step": 1140 }, { "epoch": 0.3802535023348899, "grad_norm": 3.2042505741119385, "step": 1140 }, { "epoch": 0.3802535023348899, "learning_rate": 0.0002997588016198908, "step": 1140 }, { "epoch": 0.3802535023348899, "loss": 0.9679973125457764, "step": 1140 }, { "ce_loss": 0.4296916723251343, "epoch": 0.3802535023348899, "step": 1140 }, { "distill_loss": 0.10788857191801071, "epoch": 0.3802535023348899, "step": 1140 }, { "epoch": 0.3802535023348899, "ref_ce_loss": 0.250051885843277, "step": 1140 }, { "epoch": 0.3802535023348899, "loss": 0.7972154021263123, "step": 1140 }, { "ce_loss": 0.41705816984176636, "epoch": 0.3802535023348899, "step": 1140 }, { "distill_loss": 0.12142591923475266, "epoch": 0.3802535023348899, "step": 1140 }, { "epoch": 0.3802535023348899, "ref_ce_loss": 0.18019571900367737, "step": 1140 }, { "epoch": 0.3802535023348899, "loss": 0.8467376232147217, "step": 1140 }, { "ce_loss": 0.36607176065444946, "epoch": 0.3802535023348899, "step": 1140 }, { "distill_loss": 0.10272481292486191, "epoch": 0.3802535023348899, "step": 1140 }, { "epoch": 0.3802535023348899, "ref_ce_loss": 0.1918632835149765, "step": 1140 }, { "epoch": 0.3802535023348899, "loss": 0.722500205039978, "step": 1140 }, { "ce_loss": 0.3441055715084076, "epoch": 0.3802535023348899, "step": 1140 }, { "distill_loss": 0.12058152258396149, "epoch": 0.3802535023348899, "step": 1140 }, { "epoch": 0.3802535023348899, "ref_ce_loss": 0.1723051369190216, "step": 1140 }, { "epoch": 0.38358905937291526, "loss": 0.8633, "step": 1150 }, { "epoch": 0.38358905937291526, "grad_norm": 1.7969938516616821, "step": 1150 }, { "epoch": 0.38358905937291526, "learning_rate": 0.00029974718251486363, "step": 1150 }, { "epoch": 0.38358905937291526, "loss": 1.4030652046203613, "step": 1150 }, { "ce_loss": 0.33470800518989563, "epoch": 0.38358905937291526, "step": 1150 }, { "distill_loss": 0.0937473401427269, "epoch": 0.38358905937291526, "step": 1150 }, { "epoch": 0.38358905937291526, "ref_ce_loss": 0.15464939177036285, "step": 1150 }, { "epoch": 0.38358905937291526, "loss": 0.7301994562149048, "step": 1150 }, { "ce_loss": 0.3315107822418213, "epoch": 0.38358905937291526, "step": 1150 }, { "distill_loss": 0.09801779687404633, "epoch": 0.38358905937291526, "step": 1150 }, { "epoch": 0.38358905937291526, "ref_ce_loss": 0.17848145961761475, "step": 1150 }, { "epoch": 0.38358905937291526, "loss": 0.8190576434135437, "step": 1150 }, { "ce_loss": 0.3555756211280823, "epoch": 0.38358905937291526, "step": 1150 }, { "distill_loss": 0.09977540373802185, "epoch": 0.38358905937291526, "step": 1150 }, { "epoch": 0.38358905937291526, "ref_ce_loss": 0.1742119938135147, "step": 1150 }, { "epoch": 0.38358905937291526, "loss": 0.6100097298622131, "step": 1150 }, { "ce_loss": 0.27171769738197327, "epoch": 0.38358905937291526, "step": 1150 }, { "distill_loss": 0.09016477316617966, "epoch": 0.38358905937291526, "step": 1150 }, { "epoch": 0.38358905937291526, "ref_ce_loss": 0.1305205374956131, "step": 1150 }, { "epoch": 0.3869246164109406, "loss": 0.9321, "step": 1160 }, { "epoch": 0.3869246164109406, "grad_norm": 2.3243284225463867, "step": 1160 }, { "epoch": 0.3869246164109406, "learning_rate": 0.0002997352903300459, "step": 1160 }, { "epoch": 0.3869246164109406, "loss": 0.8214346170425415, "step": 1160 }, { "ce_loss": 0.349997878074646, "epoch": 0.3869246164109406, "step": 1160 }, { "distill_loss": 0.08020985126495361, "epoch": 0.3869246164109406, "step": 1160 }, { "epoch": 0.3869246164109406, "ref_ce_loss": 0.14636772871017456, "step": 1160 }, { "epoch": 0.3869246164109406, "loss": 0.7112948894500732, "step": 1160 }, { "ce_loss": 0.4055274426937103, "epoch": 0.3869246164109406, "step": 1160 }, { "distill_loss": 0.07691830396652222, "epoch": 0.3869246164109406, "step": 1160 }, { "epoch": 0.3869246164109406, "ref_ce_loss": 0.22505919635295868, "step": 1160 }, { "epoch": 0.3869246164109406, "loss": 1.0252524614334106, "step": 1160 }, { "ce_loss": 0.4234050512313843, "epoch": 0.3869246164109406, "step": 1160 }, { "distill_loss": 0.10572229325771332, "epoch": 0.3869246164109406, "step": 1160 }, { "epoch": 0.3869246164109406, "ref_ce_loss": 0.19568488001823425, "step": 1160 }, { "epoch": 0.3869246164109406, "loss": 1.1514942646026611, "step": 1160 }, { "ce_loss": 0.38408973813056946, "epoch": 0.3869246164109406, "step": 1160 }, { "distill_loss": 0.09920583665370941, "epoch": 0.3869246164109406, "step": 1160 }, { "epoch": 0.3869246164109406, "ref_ce_loss": 0.1944963037967682, "step": 1160 }, { "epoch": 0.39026017344896596, "loss": 0.9385, "step": 1170 }, { "epoch": 0.39026017344896596, "grad_norm": 2.040172576904297, "step": 1170 }, { "epoch": 0.39026017344896596, "learning_rate": 0.0002997231250871244, "step": 1170 }, { "epoch": 0.39026017344896596, "loss": 1.2399312257766724, "step": 1170 }, { "ce_loss": 0.3100973069667816, "epoch": 0.39026017344896596, "step": 1170 }, { "distill_loss": 0.11440251022577286, "epoch": 0.39026017344896596, "step": 1170 }, { "epoch": 0.39026017344896596, "ref_ce_loss": 0.14282414317131042, "step": 1170 }, { "epoch": 0.39026017344896596, "loss": 0.8066614866256714, "step": 1170 }, { "ce_loss": 0.29232895374298096, "epoch": 0.39026017344896596, "step": 1170 }, { "distill_loss": 0.09082722663879395, "epoch": 0.39026017344896596, "step": 1170 }, { "epoch": 0.39026017344896596, "ref_ce_loss": 0.15069840848445892, "step": 1170 }, { "epoch": 0.39026017344896596, "loss": 0.6730996370315552, "step": 1170 }, { "ce_loss": 0.30601993203163147, "epoch": 0.39026017344896596, "step": 1170 }, { "distill_loss": 0.11863667517900467, "epoch": 0.39026017344896596, "step": 1170 }, { "epoch": 0.39026017344896596, "ref_ce_loss": 0.14708290994167328, "step": 1170 }, { "epoch": 0.39026017344896596, "loss": 2.0505471229553223, "step": 1170 }, { "ce_loss": 0.3397879898548126, "epoch": 0.39026017344896596, "step": 1170 }, { "distill_loss": 0.11590779572725296, "epoch": 0.39026017344896596, "step": 1170 }, { "epoch": 0.39026017344896596, "ref_ce_loss": 0.18287134170532227, "step": 1170 }, { "epoch": 0.3935957304869913, "loss": 0.9995, "step": 1180 }, { "epoch": 0.3935957304869913, "grad_norm": 3.7084498405456543, "step": 1180 }, { "epoch": 0.3935957304869913, "learning_rate": 0.0002997106868082837, "step": 1180 }, { "epoch": 0.3935957304869913, "loss": 0.9556168913841248, "step": 1180 }, { "ce_loss": 0.3851969540119171, "epoch": 0.3935957304869913, "step": 1180 }, { "distill_loss": 0.22958514094352722, "epoch": 0.3935957304869913, "step": 1180 }, { "epoch": 0.3935957304869913, "ref_ce_loss": 0.20239123702049255, "step": 1180 }, { "epoch": 0.3935957304869913, "loss": 0.8375714421272278, "step": 1180 }, { "ce_loss": 0.30635103583335876, "epoch": 0.3935957304869913, "step": 1180 }, { "distill_loss": 0.19275817275047302, "epoch": 0.3935957304869913, "step": 1180 }, { "epoch": 0.3935957304869913, "ref_ce_loss": 0.21164017915725708, "step": 1180 }, { "epoch": 0.3935957304869913, "loss": 0.780803918838501, "step": 1180 }, { "ce_loss": 0.36288484930992126, "epoch": 0.3935957304869913, "step": 1180 }, { "distill_loss": 0.2200234830379486, "epoch": 0.3935957304869913, "step": 1180 }, { "epoch": 0.3935957304869913, "ref_ce_loss": 0.19749197363853455, "step": 1180 }, { "epoch": 0.3935957304869913, "loss": 0.9087613821029663, "step": 1180 }, { "ce_loss": 0.2523532211780548, "epoch": 0.3935957304869913, "step": 1180 }, { "distill_loss": 0.1913367360830307, "epoch": 0.3935957304869913, "step": 1180 }, { "epoch": 0.3935957304869913, "ref_ce_loss": 0.1895836889743805, "step": 1180 }, { "epoch": 0.39693128752501666, "loss": 0.9122, "step": 1190 }, { "epoch": 0.39693128752501666, "grad_norm": 2.2110822200775146, "step": 1190 }, { "epoch": 0.39693128752501666, "learning_rate": 0.0002996979755162063, "step": 1190 }, { "epoch": 0.39693128752501666, "loss": 1.271198034286499, "step": 1190 }, { "ce_loss": 0.2786172032356262, "epoch": 0.39693128752501666, "step": 1190 }, { "distill_loss": 0.15106816589832306, "epoch": 0.39693128752501666, "step": 1190 }, { "epoch": 0.39693128752501666, "ref_ce_loss": 0.13261818885803223, "step": 1190 }, { "epoch": 0.39693128752501666, "loss": 0.7029586434364319, "step": 1190 }, { "ce_loss": 0.32034623622894287, "epoch": 0.39693128752501666, "step": 1190 }, { "distill_loss": 0.1312541663646698, "epoch": 0.39693128752501666, "step": 1190 }, { "epoch": 0.39693128752501666, "ref_ce_loss": 0.15258146822452545, "step": 1190 }, { "epoch": 0.39693128752501666, "loss": 0.7901096940040588, "step": 1190 }, { "ce_loss": 0.37307074666023254, "epoch": 0.39693128752501666, "step": 1190 }, { "distill_loss": 0.13851478695869446, "epoch": 0.39693128752501666, "step": 1190 }, { "epoch": 0.39693128752501666, "ref_ce_loss": 0.16212794184684753, "step": 1190 }, { "epoch": 0.39693128752501666, "loss": 1.0396982431411743, "step": 1190 }, { "ce_loss": 0.4234752357006073, "epoch": 0.39693128752501666, "step": 1190 }, { "distill_loss": 0.15358714759349823, "epoch": 0.39693128752501666, "step": 1190 }, { "epoch": 0.39693128752501666, "ref_ce_loss": 0.22049111127853394, "step": 1190 }, { "epoch": 0.400266844563042, "loss": 0.8564, "step": 1200 }, { "epoch": 0.400266844563042, "grad_norm": 2.3212337493896484, "step": 1200 }, { "epoch": 0.400266844563042, "learning_rate": 0.00029968499123407267, "step": 1200 }, { "epoch": 0.400266844563042, "loss": 0.5984683632850647, "step": 1200 }, { "ce_loss": 0.3177834451198578, "epoch": 0.400266844563042, "step": 1200 }, { "distill_loss": 0.09032616019248962, "epoch": 0.400266844563042, "step": 1200 }, { "epoch": 0.400266844563042, "ref_ce_loss": 0.12230977416038513, "step": 1200 }, { "epoch": 0.400266844563042, "loss": 1.0842852592468262, "step": 1200 }, { "ce_loss": 0.28705641627311707, "epoch": 0.400266844563042, "step": 1200 }, { "distill_loss": 0.08843199163675308, "epoch": 0.400266844563042, "step": 1200 }, { "epoch": 0.400266844563042, "ref_ce_loss": 0.15939076244831085, "step": 1200 }, { "epoch": 0.400266844563042, "loss": 0.961142897605896, "step": 1200 }, { "ce_loss": 0.33942434191703796, "epoch": 0.400266844563042, "step": 1200 }, { "distill_loss": 0.0857396274805069, "epoch": 0.400266844563042, "step": 1200 }, { "epoch": 0.400266844563042, "ref_ce_loss": 0.23185887932777405, "step": 1200 }, { "epoch": 0.400266844563042, "loss": 0.9665995240211487, "step": 1200 }, { "ce_loss": 0.27184587717056274, "epoch": 0.400266844563042, "step": 1200 }, { "distill_loss": 0.0773671567440033, "epoch": 0.400266844563042, "step": 1200 }, { "epoch": 0.400266844563042, "ref_ce_loss": 0.22190047800540924, "step": 1200 }, { "epoch": 0.40360240160106736, "loss": 0.8723, "step": 1210 }, { "epoch": 0.40360240160106736, "grad_norm": 2.192169427871704, "step": 1210 }, { "epoch": 0.40360240160106736, "learning_rate": 0.00029967173398556086, "step": 1210 }, { "epoch": 0.40360240160106736, "loss": 0.7515622973442078, "step": 1210 }, { "ce_loss": 0.3157145380973816, "epoch": 0.40360240160106736, "step": 1210 }, { "distill_loss": 0.094798743724823, "epoch": 0.40360240160106736, "step": 1210 }, { "epoch": 0.40360240160106736, "ref_ce_loss": 0.23686806857585907, "step": 1210 }, { "epoch": 0.40360240160106736, "loss": 0.7274072170257568, "step": 1210 }, { "ce_loss": 0.3264521658420563, "epoch": 0.40360240160106736, "step": 1210 }, { "distill_loss": 0.09107182919979095, "epoch": 0.40360240160106736, "step": 1210 }, { "epoch": 0.40360240160106736, "ref_ce_loss": 0.1764145791530609, "step": 1210 }, { "epoch": 0.40360240160106736, "loss": 1.4342138767242432, "step": 1210 }, { "ce_loss": 0.3195574879646301, "epoch": 0.40360240160106736, "step": 1210 }, { "distill_loss": 0.07628357410430908, "epoch": 0.40360240160106736, "step": 1210 }, { "epoch": 0.40360240160106736, "ref_ce_loss": 0.15575119853019714, "step": 1210 }, { "epoch": 0.40360240160106736, "loss": 0.5367107391357422, "step": 1210 }, { "ce_loss": 0.2918946444988251, "epoch": 0.40360240160106736, "step": 1210 }, { "distill_loss": 0.08166878670454025, "epoch": 0.40360240160106736, "step": 1210 }, { "epoch": 0.40360240160106736, "ref_ce_loss": 0.16308514773845673, "step": 1210 }, { "epoch": 0.4069379586390927, "loss": 0.9943, "step": 1220 }, { "epoch": 0.4069379586390927, "grad_norm": 3.139317512512207, "step": 1220 }, { "epoch": 0.4069379586390927, "learning_rate": 0.00029965820379484695, "step": 1220 }, { "epoch": 0.4069379586390927, "loss": 1.1364343166351318, "step": 1220 }, { "ce_loss": 0.37319913506507874, "epoch": 0.4069379586390927, "step": 1220 }, { "distill_loss": 0.09696052223443985, "epoch": 0.4069379586390927, "step": 1220 }, { "epoch": 0.4069379586390927, "ref_ce_loss": 0.18106557428836823, "step": 1220 }, { "epoch": 0.4069379586390927, "loss": 0.905819833278656, "step": 1220 }, { "ce_loss": 0.4253186881542206, "epoch": 0.4069379586390927, "step": 1220 }, { "distill_loss": 0.08640832453966141, "epoch": 0.4069379586390927, "step": 1220 }, { "epoch": 0.4069379586390927, "ref_ce_loss": 0.2814835011959076, "step": 1220 }, { "epoch": 0.4069379586390927, "loss": 0.6299371123313904, "step": 1220 }, { "ce_loss": 0.3122254014015198, "epoch": 0.4069379586390927, "step": 1220 }, { "distill_loss": 0.08051759749650955, "epoch": 0.4069379586390927, "step": 1220 }, { "epoch": 0.4069379586390927, "ref_ce_loss": 0.23709669709205627, "step": 1220 }, { "epoch": 0.4069379586390927, "loss": 0.8787524700164795, "step": 1220 }, { "ce_loss": 0.30214032530784607, "epoch": 0.4069379586390927, "step": 1220 }, { "distill_loss": 0.08182747662067413, "epoch": 0.4069379586390927, "step": 1220 }, { "epoch": 0.4069379586390927, "ref_ce_loss": 0.1771172732114792, "step": 1220 }, { "epoch": 0.41027351567711806, "loss": 0.8505, "step": 1230 }, { "epoch": 0.41027351567711806, "grad_norm": 2.6399848461151123, "step": 1230 }, { "epoch": 0.41027351567711806, "learning_rate": 0.00029964440068660467, "step": 1230 }, { "epoch": 0.41027351567711806, "loss": 0.881452739238739, "step": 1230 }, { "ce_loss": 0.4639228284358978, "epoch": 0.41027351567711806, "step": 1230 }, { "distill_loss": 0.16353951394557953, "epoch": 0.41027351567711806, "step": 1230 }, { "epoch": 0.41027351567711806, "ref_ce_loss": 0.18111327290534973, "step": 1230 }, { "epoch": 0.41027351567711806, "loss": 1.0701333284378052, "step": 1230 }, { "ce_loss": 0.5111604928970337, "epoch": 0.41027351567711806, "step": 1230 }, { "distill_loss": 0.14016252756118774, "epoch": 0.41027351567711806, "step": 1230 }, { "epoch": 0.41027351567711806, "ref_ce_loss": 0.2831771671772003, "step": 1230 }, { "epoch": 0.41027351567711806, "loss": 0.7755110263824463, "step": 1230 }, { "ce_loss": 0.3701687753200531, "epoch": 0.41027351567711806, "step": 1230 }, { "distill_loss": 0.13138516247272491, "epoch": 0.41027351567711806, "step": 1230 }, { "epoch": 0.41027351567711806, "ref_ce_loss": 0.2043834626674652, "step": 1230 }, { "epoch": 0.41027351567711806, "loss": 0.90342116355896, "step": 1230 }, { "ce_loss": 0.24989053606987, "epoch": 0.41027351567711806, "step": 1230 }, { "distill_loss": 0.14895430207252502, "epoch": 0.41027351567711806, "step": 1230 }, { "epoch": 0.41027351567711806, "ref_ce_loss": 0.1435110718011856, "step": 1230 }, { "epoch": 0.4136090727151434, "loss": 1.8225, "step": 1240 }, { "epoch": 0.4136090727151434, "grad_norm": 5.425915718078613, "step": 1240 }, { "epoch": 0.4136090727151434, "learning_rate": 0.0002996303246860054, "step": 1240 }, { "epoch": 0.4136090727151434, "loss": 1.3830713033676147, "step": 1240 }, { "ce_loss": 0.3072783350944519, "epoch": 0.4136090727151434, "step": 1240 }, { "distill_loss": 0.7292137145996094, "epoch": 0.4136090727151434, "step": 1240 }, { "epoch": 0.4136090727151434, "ref_ce_loss": 0.1486404836177826, "step": 1240 }, { "epoch": 0.4136090727151434, "loss": 1.6045691967010498, "step": 1240 }, { "ce_loss": 0.4130804240703583, "epoch": 0.4136090727151434, "step": 1240 }, { "distill_loss": 0.8139525651931763, "epoch": 0.4136090727151434, "step": 1240 }, { "epoch": 0.4136090727151434, "ref_ce_loss": 0.19312764704227448, "step": 1240 }, { "epoch": 0.4136090727151434, "loss": 2.623363733291626, "step": 1240 }, { "ce_loss": 0.3971395194530487, "epoch": 0.4136090727151434, "step": 1240 }, { "distill_loss": 0.8659202456474304, "epoch": 0.4136090727151434, "step": 1240 }, { "epoch": 0.4136090727151434, "ref_ce_loss": 0.20216262340545654, "step": 1240 }, { "epoch": 0.4136090727151434, "loss": 1.5429506301879883, "step": 1240 }, { "ce_loss": 0.2974897623062134, "epoch": 0.4136090727151434, "step": 1240 }, { "distill_loss": 0.8454087376594543, "epoch": 0.4136090727151434, "step": 1240 }, { "epoch": 0.4136090727151434, "ref_ce_loss": 0.15195105969905853, "step": 1240 }, { "epoch": 0.41694462975316876, "loss": 1.2865, "step": 1250 }, { "epoch": 0.41694462975316876, "grad_norm": 2.501133918762207, "step": 1250 }, { "epoch": 0.41694462975316876, "learning_rate": 0.0002996159758187183, "step": 1250 }, { "epoch": 0.41694462975316876, "loss": 1.0447216033935547, "step": 1250 }, { "ce_loss": 0.4286389946937561, "epoch": 0.41694462975316876, "step": 1250 }, { "distill_loss": 0.3530900478363037, "epoch": 0.41694462975316876, "step": 1250 }, { "epoch": 0.41694462975316876, "ref_ce_loss": 0.17672403156757355, "step": 1250 }, { "epoch": 0.41694462975316876, "loss": 1.8678219318389893, "step": 1250 }, { "ce_loss": 0.3537362217903137, "epoch": 0.41694462975316876, "step": 1250 }, { "distill_loss": 0.32580870389938354, "epoch": 0.41694462975316876, "step": 1250 }, { "epoch": 0.41694462975316876, "ref_ce_loss": 0.23289765417575836, "step": 1250 }, { "epoch": 0.41694462975316876, "loss": 1.0801249742507935, "step": 1250 }, { "ce_loss": 0.3455648422241211, "epoch": 0.41694462975316876, "step": 1250 }, { "distill_loss": 0.33082038164138794, "epoch": 0.41694462975316876, "step": 1250 }, { "epoch": 0.41694462975316876, "ref_ce_loss": 0.21357986330986023, "step": 1250 }, { "epoch": 0.41694462975316876, "loss": 1.061863899230957, "step": 1250 }, { "ce_loss": 0.3110015094280243, "epoch": 0.41694462975316876, "step": 1250 }, { "distill_loss": 0.29846805334091187, "epoch": 0.41694462975316876, "step": 1250 }, { "epoch": 0.41694462975316876, "ref_ce_loss": 0.20160391926765442, "step": 1250 }, { "epoch": 0.4202801867911941, "loss": 1.0603, "step": 1260 }, { "epoch": 0.4202801867911941, "grad_norm": 4.449580669403076, "step": 1260 }, { "epoch": 0.4202801867911941, "learning_rate": 0.00029960135411090995, "step": 1260 }, { "epoch": 0.4202801867911941, "loss": 0.6296399831771851, "step": 1260 }, { "ce_loss": 0.2865029573440552, "epoch": 0.4202801867911941, "step": 1260 }, { "distill_loss": 0.1852218210697174, "epoch": 0.4202801867911941, "step": 1260 }, { "epoch": 0.4202801867911941, "ref_ce_loss": 0.09856808930635452, "step": 1260 }, { "epoch": 0.4202801867911941, "loss": 0.9737272262573242, "step": 1260 }, { "ce_loss": 0.33188915252685547, "epoch": 0.4202801867911941, "step": 1260 }, { "distill_loss": 0.19780108332633972, "epoch": 0.4202801867911941, "step": 1260 }, { "epoch": 0.4202801867911941, "ref_ce_loss": 0.1794106811285019, "step": 1260 }, { "epoch": 0.4202801867911941, "loss": 0.934063732624054, "step": 1260 }, { "ce_loss": 0.44743072986602783, "epoch": 0.4202801867911941, "step": 1260 }, { "distill_loss": 0.231248140335083, "epoch": 0.4202801867911941, "step": 1260 }, { "epoch": 0.4202801867911941, "ref_ce_loss": 0.25490811467170715, "step": 1260 }, { "epoch": 0.4202801867911941, "loss": 0.9184874892234802, "step": 1260 }, { "ce_loss": 0.36027130484580994, "epoch": 0.4202801867911941, "step": 1260 }, { "distill_loss": 0.24745814502239227, "epoch": 0.4202801867911941, "step": 1260 }, { "epoch": 0.4202801867911941, "ref_ce_loss": 0.2323707938194275, "step": 1260 }, { "epoch": 0.42361574382921946, "loss": 1.0, "step": 1270 }, { "epoch": 0.42361574382921946, "grad_norm": 4.113201141357422, "step": 1270 }, { "epoch": 0.42361574382921946, "learning_rate": 0.00029958645958924466, "step": 1270 }, { "epoch": 0.42361574382921946, "loss": 0.6826831102371216, "step": 1270 }, { "ce_loss": 0.3329589068889618, "epoch": 0.42361574382921946, "step": 1270 }, { "distill_loss": 0.19312255084514618, "epoch": 0.42361574382921946, "step": 1270 }, { "epoch": 0.42361574382921946, "ref_ce_loss": 0.15657509863376617, "step": 1270 }, { "epoch": 0.42361574382921946, "loss": 0.7136074304580688, "step": 1270 }, { "ce_loss": 0.28988170623779297, "epoch": 0.42361574382921946, "step": 1270 }, { "distill_loss": 0.16940924525260925, "epoch": 0.42361574382921946, "step": 1270 }, { "epoch": 0.42361574382921946, "ref_ce_loss": 0.1954113394021988, "step": 1270 }, { "epoch": 0.42361574382921946, "loss": 0.6761580109596252, "step": 1270 }, { "ce_loss": 0.33831357955932617, "epoch": 0.42361574382921946, "step": 1270 }, { "distill_loss": 0.18440185487270355, "epoch": 0.42361574382921946, "step": 1270 }, { "epoch": 0.42361574382921946, "ref_ce_loss": 0.15340223908424377, "step": 1270 }, { "epoch": 0.42361574382921946, "loss": 0.7417470216751099, "step": 1270 }, { "ce_loss": 0.31187084317207336, "epoch": 0.42361574382921946, "step": 1270 }, { "distill_loss": 0.2230009287595749, "epoch": 0.42361574382921946, "step": 1270 }, { "epoch": 0.42361574382921946, "ref_ce_loss": 0.11719028651714325, "step": 1270 }, { "epoch": 0.4269513008672448, "loss": 0.9325, "step": 1280 }, { "epoch": 0.4269513008672448, "grad_norm": 2.9946320056915283, "step": 1280 }, { "epoch": 0.4269513008672448, "learning_rate": 0.0002995712922808841, "step": 1280 }, { "epoch": 0.4269513008672448, "loss": 1.672420859336853, "step": 1280 }, { "ce_loss": 0.3916040062904358, "epoch": 0.4269513008672448, "step": 1280 }, { "distill_loss": 0.21645702421665192, "epoch": 0.4269513008672448, "step": 1280 }, { "epoch": 0.4269513008672448, "ref_ce_loss": 0.22933925688266754, "step": 1280 }, { "epoch": 0.4269513008672448, "loss": 0.93517005443573, "step": 1280 }, { "ce_loss": 0.40770724415779114, "epoch": 0.4269513008672448, "step": 1280 }, { "distill_loss": 0.25952303409576416, "epoch": 0.4269513008672448, "step": 1280 }, { "epoch": 0.4269513008672448, "ref_ce_loss": 0.22096198797225952, "step": 1280 }, { "epoch": 0.4269513008672448, "loss": 0.9371185302734375, "step": 1280 }, { "ce_loss": 0.36753150820732117, "epoch": 0.4269513008672448, "step": 1280 }, { "distill_loss": 0.33493292331695557, "epoch": 0.4269513008672448, "step": 1280 }, { "epoch": 0.4269513008672448, "ref_ce_loss": 0.15782517194747925, "step": 1280 }, { "epoch": 0.4269513008672448, "loss": 0.7511712312698364, "step": 1280 }, { "ce_loss": 0.25156018137931824, "epoch": 0.4269513008672448, "step": 1280 }, { "distill_loss": 0.26513832807540894, "epoch": 0.4269513008672448, "step": 1280 }, { "epoch": 0.4269513008672448, "ref_ce_loss": 0.140127494931221, "step": 1280 }, { "epoch": 0.43028685790527016, "loss": 0.8839, "step": 1290 }, { "epoch": 0.43028685790527016, "grad_norm": 5.134028434753418, "step": 1290 }, { "epoch": 0.43028685790527016, "learning_rate": 0.0002995558522134875, "step": 1290 }, { "epoch": 0.43028685790527016, "loss": 0.7763099670410156, "step": 1290 }, { "ce_loss": 0.3329162895679474, "epoch": 0.43028685790527016, "step": 1290 }, { "distill_loss": 0.2020120471715927, "epoch": 0.43028685790527016, "step": 1290 }, { "epoch": 0.43028685790527016, "ref_ce_loss": 0.15234671533107758, "step": 1290 }, { "epoch": 0.43028685790527016, "loss": 1.1825041770935059, "step": 1290 }, { "ce_loss": 0.346752792596817, "epoch": 0.43028685790527016, "step": 1290 }, { "distill_loss": 0.21200241148471832, "epoch": 0.43028685790527016, "step": 1290 }, { "epoch": 0.43028685790527016, "ref_ce_loss": 0.16237637400627136, "step": 1290 }, { "epoch": 0.43028685790527016, "loss": 0.6205200552940369, "step": 1290 }, { "ce_loss": 0.280308336019516, "epoch": 0.43028685790527016, "step": 1290 }, { "distill_loss": 0.20304308831691742, "epoch": 0.43028685790527016, "step": 1290 }, { "epoch": 0.43028685790527016, "ref_ce_loss": 0.1370389461517334, "step": 1290 }, { "epoch": 0.43028685790527016, "loss": 0.9143213033676147, "step": 1290 }, { "ce_loss": 0.38608551025390625, "epoch": 0.43028685790527016, "step": 1290 }, { "distill_loss": 0.18370597064495087, "epoch": 0.43028685790527016, "step": 1290 }, { "epoch": 0.43028685790527016, "ref_ce_loss": 0.14932240545749664, "step": 1290 }, { "epoch": 0.4336224149432955, "loss": 1.2536, "step": 1300 }, { "epoch": 0.4336224149432955, "grad_norm": 83.33971405029297, "step": 1300 }, { "epoch": 0.4336224149432955, "learning_rate": 0.0002995401394152114, "step": 1300 }, { "epoch": 0.4336224149432955, "loss": 1.5523114204406738, "step": 1300 }, { "ce_loss": 0.39939671754837036, "epoch": 0.4336224149432955, "step": 1300 }, { "distill_loss": 0.5877339243888855, "epoch": 0.4336224149432955, "step": 1300 }, { "epoch": 0.4336224149432955, "ref_ce_loss": 0.16907824575901031, "step": 1300 }, { "epoch": 0.4336224149432955, "loss": 1.2784559726715088, "step": 1300 }, { "ce_loss": 0.32869669795036316, "epoch": 0.4336224149432955, "step": 1300 }, { "distill_loss": 0.5456231832504272, "epoch": 0.4336224149432955, "step": 1300 }, { "epoch": 0.4336224149432955, "ref_ce_loss": 0.16758182644844055, "step": 1300 }, { "epoch": 0.4336224149432955, "loss": 1.1197588443756104, "step": 1300 }, { "ce_loss": 0.3304974436759949, "epoch": 0.4336224149432955, "step": 1300 }, { "distill_loss": 0.5112226605415344, "epoch": 0.4336224149432955, "step": 1300 }, { "epoch": 0.4336224149432955, "ref_ce_loss": 0.16701827943325043, "step": 1300 }, { "epoch": 0.4336224149432955, "loss": 1.211663842201233, "step": 1300 }, { "ce_loss": 0.322611004114151, "epoch": 0.4336224149432955, "step": 1300 }, { "distill_loss": 0.6028107404708862, "epoch": 0.4336224149432955, "step": 1300 }, { "epoch": 0.4336224149432955, "ref_ce_loss": 0.2120797336101532, "step": 1300 }, { "epoch": 0.43695797198132086, "loss": 1.0281, "step": 1310 }, { "epoch": 0.43695797198132086, "grad_norm": 2.6226251125335693, "step": 1310 }, { "epoch": 0.43695797198132086, "learning_rate": 0.00029952415391470977, "step": 1310 }, { "epoch": 0.43695797198132086, "loss": 1.3600938320159912, "step": 1310 }, { "ce_loss": 0.42805051803588867, "epoch": 0.43695797198132086, "step": 1310 }, { "distill_loss": 0.21402305364608765, "epoch": 0.43695797198132086, "step": 1310 }, { "epoch": 0.43695797198132086, "ref_ce_loss": 0.16621027886867523, "step": 1310 }, { "epoch": 0.43695797198132086, "loss": 0.971459686756134, "step": 1310 }, { "ce_loss": 0.32620683312416077, "epoch": 0.43695797198132086, "step": 1310 }, { "distill_loss": 0.18102744221687317, "epoch": 0.43695797198132086, "step": 1310 }, { "epoch": 0.43695797198132086, "ref_ce_loss": 0.19330710172653198, "step": 1310 }, { "epoch": 0.43695797198132086, "loss": 1.3167970180511475, "step": 1310 }, { "ce_loss": 0.4114319980144501, "epoch": 0.43695797198132086, "step": 1310 }, { "distill_loss": 0.22762969136238098, "epoch": 0.43695797198132086, "step": 1310 }, { "epoch": 0.43695797198132086, "ref_ce_loss": 0.16943278908729553, "step": 1310 }, { "epoch": 0.43695797198132086, "loss": 0.8594940900802612, "step": 1310 }, { "ce_loss": 0.36697542667388916, "epoch": 0.43695797198132086, "step": 1310 }, { "distill_loss": 0.2220364212989807, "epoch": 0.43695797198132086, "step": 1310 }, { "epoch": 0.43695797198132086, "ref_ce_loss": 0.15329211950302124, "step": 1310 }, { "epoch": 0.4402935290193462, "loss": 0.8734, "step": 1320 }, { "epoch": 0.4402935290193462, "grad_norm": 2.488173723220825, "step": 1320 }, { "epoch": 0.4402935290193462, "learning_rate": 0.0002995078957411339, "step": 1320 }, { "epoch": 0.4402935290193462, "loss": 0.8020517230033875, "step": 1320 }, { "ce_loss": 0.3077715039253235, "epoch": 0.4402935290193462, "step": 1320 }, { "distill_loss": 0.13831500709056854, "epoch": 0.4402935290193462, "step": 1320 }, { "epoch": 0.4402935290193462, "ref_ce_loss": 0.18166959285736084, "step": 1320 }, { "epoch": 0.4402935290193462, "loss": 1.254194736480713, "step": 1320 }, { "ce_loss": 0.3027229309082031, "epoch": 0.4402935290193462, "step": 1320 }, { "distill_loss": 0.1362319141626358, "epoch": 0.4402935290193462, "step": 1320 }, { "epoch": 0.4402935290193462, "ref_ce_loss": 0.20650614798069, "step": 1320 }, { "epoch": 0.4402935290193462, "loss": 0.5869837999343872, "step": 1320 }, { "ce_loss": 0.2980230748653412, "epoch": 0.4402935290193462, "step": 1320 }, { "distill_loss": 0.1261177659034729, "epoch": 0.4402935290193462, "step": 1320 }, { "epoch": 0.4402935290193462, "ref_ce_loss": 0.1628243774175644, "step": 1320 }, { "epoch": 0.4402935290193462, "loss": 0.789718508720398, "step": 1320 }, { "ce_loss": 0.40123918652534485, "epoch": 0.4402935290193462, "step": 1320 }, { "distill_loss": 0.15148304402828217, "epoch": 0.4402935290193462, "step": 1320 }, { "epoch": 0.4402935290193462, "ref_ce_loss": 0.16462190449237823, "step": 1320 }, { "epoch": 0.44362908605737156, "loss": 0.7744, "step": 1330 }, { "epoch": 0.44362908605737156, "grad_norm": 1.8618627786636353, "step": 1330 }, { "epoch": 0.44362908605737156, "learning_rate": 0.00029949136492413224, "step": 1330 }, { "epoch": 0.44362908605737156, "loss": 0.5880228281021118, "step": 1330 }, { "ce_loss": 0.27567267417907715, "epoch": 0.44362908605737156, "step": 1330 }, { "distill_loss": 0.09483489394187927, "epoch": 0.44362908605737156, "step": 1330 }, { "epoch": 0.44362908605737156, "ref_ce_loss": 0.18227918446063995, "step": 1330 }, { "epoch": 0.44362908605737156, "loss": 0.8252865076065063, "step": 1330 }, { "ce_loss": 0.2724820077419281, "epoch": 0.44362908605737156, "step": 1330 }, { "distill_loss": 0.0967143326997757, "epoch": 0.44362908605737156, "step": 1330 }, { "epoch": 0.44362908605737156, "ref_ce_loss": 0.18324853479862213, "step": 1330 }, { "epoch": 0.44362908605737156, "loss": 0.7001053094863892, "step": 1330 }, { "ce_loss": 0.2155034989118576, "epoch": 0.44362908605737156, "step": 1330 }, { "distill_loss": 0.11413068324327469, "epoch": 0.44362908605737156, "step": 1330 }, { "epoch": 0.44362908605737156, "ref_ce_loss": 0.1161014661192894, "step": 1330 }, { "epoch": 0.44362908605737156, "loss": 0.8354309797286987, "step": 1330 }, { "ce_loss": 0.36032605171203613, "epoch": 0.44362908605737156, "step": 1330 }, { "distill_loss": 0.09894613176584244, "epoch": 0.44362908605737156, "step": 1330 }, { "epoch": 0.44362908605737156, "ref_ce_loss": 0.22084367275238037, "step": 1330 }, { "epoch": 0.4469646430953969, "loss": 0.9264, "step": 1340 }, { "epoch": 0.4469646430953969, "grad_norm": 3.5847744941711426, "step": 1340 }, { "epoch": 0.4469646430953969, "learning_rate": 0.0002994745614938505, "step": 1340 }, { "epoch": 0.4469646430953969, "loss": 2.41884708404541, "step": 1340 }, { "ce_loss": 0.35969698429107666, "epoch": 0.4469646430953969, "step": 1340 }, { "distill_loss": 0.13992217183113098, "epoch": 0.4469646430953969, "step": 1340 }, { "epoch": 0.4469646430953969, "ref_ce_loss": 0.1331671178340912, "step": 1340 }, { "epoch": 0.4469646430953969, "loss": 0.6663955450057983, "step": 1340 }, { "ce_loss": 0.26573446393013, "epoch": 0.4469646430953969, "step": 1340 }, { "distill_loss": 0.12028539180755615, "epoch": 0.4469646430953969, "step": 1340 }, { "epoch": 0.4469646430953969, "ref_ce_loss": 0.14755044877529144, "step": 1340 }, { "epoch": 0.4469646430953969, "loss": 0.7013957500457764, "step": 1340 }, { "ce_loss": 0.3335946798324585, "epoch": 0.4469646430953969, "step": 1340 }, { "distill_loss": 0.10393598675727844, "epoch": 0.4469646430953969, "step": 1340 }, { "epoch": 0.4469646430953969, "ref_ce_loss": 0.211795374751091, "step": 1340 }, { "epoch": 0.4469646430953969, "loss": 0.5281260013580322, "step": 1340 }, { "ce_loss": 0.23023775219917297, "epoch": 0.4469646430953969, "step": 1340 }, { "distill_loss": 0.10416372120380402, "epoch": 0.4469646430953969, "step": 1340 }, { "epoch": 0.4469646430953969, "ref_ce_loss": 0.10854792594909668, "step": 1340 }, { "epoch": 0.45030020013342226, "loss": 0.8841, "step": 1350 }, { "epoch": 0.45030020013342226, "grad_norm": 2.1285672187805176, "step": 1350 }, { "epoch": 0.45030020013342226, "learning_rate": 0.0002994574854809315, "step": 1350 }, { "epoch": 0.45030020013342226, "loss": 0.52470463514328, "step": 1350 }, { "ce_loss": 0.2516459822654724, "epoch": 0.45030020013342226, "step": 1350 }, { "distill_loss": 0.14257560670375824, "epoch": 0.45030020013342226, "step": 1350 }, { "epoch": 0.45030020013342226, "ref_ce_loss": 0.07917644828557968, "step": 1350 }, { "epoch": 0.45030020013342226, "loss": 0.7037546634674072, "step": 1350 }, { "ce_loss": 0.31851497292518616, "epoch": 0.45030020013342226, "step": 1350 }, { "distill_loss": 0.12531189620494843, "epoch": 0.45030020013342226, "step": 1350 }, { "epoch": 0.45030020013342226, "ref_ce_loss": 0.18432098627090454, "step": 1350 }, { "epoch": 0.45030020013342226, "loss": 0.5914328098297119, "step": 1350 }, { "ce_loss": 0.27234095335006714, "epoch": 0.45030020013342226, "step": 1350 }, { "distill_loss": 0.11418743431568146, "epoch": 0.45030020013342226, "step": 1350 }, { "epoch": 0.45030020013342226, "ref_ce_loss": 0.20483791828155518, "step": 1350 }, { "epoch": 0.45030020013342226, "loss": 0.6631147265434265, "step": 1350 }, { "ce_loss": 0.2455311268568039, "epoch": 0.45030020013342226, "step": 1350 }, { "distill_loss": 0.13238249719142914, "epoch": 0.45030020013342226, "step": 1350 }, { "epoch": 0.45030020013342226, "ref_ce_loss": 0.18045808374881744, "step": 1350 }, { "epoch": 0.4536357571714476, "loss": 0.7771, "step": 1360 }, { "epoch": 0.4536357571714476, "grad_norm": 2.4119577407836914, "step": 1360 }, { "epoch": 0.4536357571714476, "learning_rate": 0.0002994401369165151, "step": 1360 }, { "epoch": 0.4536357571714476, "loss": 0.6945372819900513, "step": 1360 }, { "ce_loss": 0.32869598269462585, "epoch": 0.4536357571714476, "step": 1360 }, { "distill_loss": 0.1271277368068695, "epoch": 0.4536357571714476, "step": 1360 }, { "epoch": 0.4536357571714476, "ref_ce_loss": 0.23853451013565063, "step": 1360 }, { "epoch": 0.4536357571714476, "loss": 0.8358821868896484, "step": 1360 }, { "ce_loss": 0.26175400614738464, "epoch": 0.4536357571714476, "step": 1360 }, { "distill_loss": 0.1243264302611351, "epoch": 0.4536357571714476, "step": 1360 }, { "epoch": 0.4536357571714476, "ref_ce_loss": 0.2343246340751648, "step": 1360 }, { "epoch": 0.4536357571714476, "loss": 0.7749500274658203, "step": 1360 }, { "ce_loss": 0.330771267414093, "epoch": 0.4536357571714476, "step": 1360 }, { "distill_loss": 0.14768579602241516, "epoch": 0.4536357571714476, "step": 1360 }, { "epoch": 0.4536357571714476, "ref_ce_loss": 0.15305696427822113, "step": 1360 }, { "epoch": 0.4536357571714476, "loss": 0.9154521226882935, "step": 1360 }, { "ce_loss": 0.3262263238430023, "epoch": 0.4536357571714476, "step": 1360 }, { "distill_loss": 0.1562902182340622, "epoch": 0.4536357571714476, "step": 1360 }, { "epoch": 0.4536357571714476, "ref_ce_loss": 0.158981591463089, "step": 1360 }, { "epoch": 0.45697131420947296, "loss": 0.8381, "step": 1370 }, { "epoch": 0.45697131420947296, "grad_norm": 2.2991511821746826, "step": 1370 }, { "epoch": 0.45697131420947296, "learning_rate": 0.00029942251583223834, "step": 1370 }, { "epoch": 0.45697131420947296, "loss": 0.802588939666748, "step": 1370 }, { "ce_loss": 0.3817768692970276, "epoch": 0.45697131420947296, "step": 1370 }, { "distill_loss": 0.15474967658519745, "epoch": 0.45697131420947296, "step": 1370 }, { "epoch": 0.45697131420947296, "ref_ce_loss": 0.21179994940757751, "step": 1370 }, { "epoch": 0.45697131420947296, "loss": 0.5642900466918945, "step": 1370 }, { "ce_loss": 0.22733645141124725, "epoch": 0.45697131420947296, "step": 1370 }, { "distill_loss": 0.1353457272052765, "epoch": 0.45697131420947296, "step": 1370 }, { "epoch": 0.45697131420947296, "ref_ce_loss": 0.12604719400405884, "step": 1370 }, { "epoch": 0.45697131420947296, "loss": 0.9796323776245117, "step": 1370 }, { "ce_loss": 0.3570672273635864, "epoch": 0.45697131420947296, "step": 1370 }, { "distill_loss": 0.1766320914030075, "epoch": 0.45697131420947296, "step": 1370 }, { "epoch": 0.45697131420947296, "ref_ce_loss": 0.12003152817487717, "step": 1370 }, { "epoch": 0.45697131420947296, "loss": 0.8954633474349976, "step": 1370 }, { "ce_loss": 0.34348300099372864, "epoch": 0.45697131420947296, "step": 1370 }, { "distill_loss": 0.17705875635147095, "epoch": 0.45697131420947296, "step": 1370 }, { "epoch": 0.45697131420947296, "ref_ce_loss": 0.26571720838546753, "step": 1370 }, { "epoch": 0.4603068712474983, "loss": 0.8116, "step": 1380 }, { "epoch": 0.4603068712474983, "grad_norm": 2.4742720127105713, "step": 1380 }, { "epoch": 0.4603068712474983, "learning_rate": 0.00029940462226023506, "step": 1380 }, { "epoch": 0.4603068712474983, "loss": 0.640617311000824, "step": 1380 }, { "ce_loss": 0.24786534905433655, "epoch": 0.4603068712474983, "step": 1380 }, { "distill_loss": 0.15409091114997864, "epoch": 0.4603068712474983, "step": 1380 }, { "epoch": 0.4603068712474983, "ref_ce_loss": 0.12548106908798218, "step": 1380 }, { "epoch": 0.4603068712474983, "loss": 1.1944732666015625, "step": 1380 }, { "ce_loss": 0.33839792013168335, "epoch": 0.4603068712474983, "step": 1380 }, { "distill_loss": 0.14373010396957397, "epoch": 0.4603068712474983, "step": 1380 }, { "epoch": 0.4603068712474983, "ref_ce_loss": 0.20675034821033478, "step": 1380 }, { "epoch": 0.4603068712474983, "loss": 0.8365032076835632, "step": 1380 }, { "ce_loss": 0.34791073203086853, "epoch": 0.4603068712474983, "step": 1380 }, { "distill_loss": 0.16508665680885315, "epoch": 0.4603068712474983, "step": 1380 }, { "epoch": 0.4603068712474983, "ref_ce_loss": 0.2095586210489273, "step": 1380 }, { "epoch": 0.4603068712474983, "loss": 0.9003428816795349, "step": 1380 }, { "ce_loss": 0.3127005398273468, "epoch": 0.4603068712474983, "step": 1380 }, { "distill_loss": 0.15318584442138672, "epoch": 0.4603068712474983, "step": 1380 }, { "epoch": 0.4603068712474983, "ref_ce_loss": 0.15895813703536987, "step": 1380 }, { "epoch": 0.46364242828552366, "loss": 0.8893, "step": 1390 }, { "epoch": 0.46364242828552366, "grad_norm": 2.6648497581481934, "step": 1390 }, { "epoch": 0.46364242828552366, "learning_rate": 0.0002993864562331361, "step": 1390 }, { "epoch": 0.46364242828552366, "loss": 0.525078535079956, "step": 1390 }, { "ce_loss": 0.25243738293647766, "epoch": 0.46364242828552366, "step": 1390 }, { "distill_loss": 0.1383885145187378, "epoch": 0.46364242828552366, "step": 1390 }, { "epoch": 0.46364242828552366, "ref_ce_loss": 0.13377568125724792, "step": 1390 }, { "epoch": 0.46364242828552366, "loss": 1.248044729232788, "step": 1390 }, { "ce_loss": 0.4142797887325287, "epoch": 0.46364242828552366, "step": 1390 }, { "distill_loss": 0.21165764331817627, "epoch": 0.46364242828552366, "step": 1390 }, { "epoch": 0.46364242828552366, "ref_ce_loss": 0.22711358964443207, "step": 1390 }, { "epoch": 0.46364242828552366, "loss": 1.7012591361999512, "step": 1390 }, { "ce_loss": 0.25872302055358887, "epoch": 0.46364242828552366, "step": 1390 }, { "distill_loss": 0.18654128909111023, "epoch": 0.46364242828552366, "step": 1390 }, { "epoch": 0.46364242828552366, "ref_ce_loss": 0.1603059321641922, "step": 1390 }, { "epoch": 0.46364242828552366, "loss": 1.113049030303955, "step": 1390 }, { "ce_loss": 0.33455508947372437, "epoch": 0.46364242828552366, "step": 1390 }, { "distill_loss": 0.14230862259864807, "epoch": 0.46364242828552366, "step": 1390 }, { "epoch": 0.46364242828552366, "ref_ce_loss": 0.14293868839740753, "step": 1390 }, { "epoch": 0.466977985323549, "loss": 0.9262, "step": 1400 }, { "epoch": 0.466977985323549, "grad_norm": 4.262118339538574, "step": 1400 }, { "epoch": 0.466977985323549, "learning_rate": 0.0002993680177840691, "step": 1400 }, { "epoch": 0.466977985323549, "loss": 0.8223698139190674, "step": 1400 }, { "ce_loss": 0.3486742377281189, "epoch": 0.466977985323549, "step": 1400 }, { "distill_loss": 0.16176161170005798, "epoch": 0.466977985323549, "step": 1400 }, { "epoch": 0.466977985323549, "ref_ce_loss": 0.18248051404953003, "step": 1400 }, { "epoch": 0.466977985323549, "loss": 0.9195911884307861, "step": 1400 }, { "ce_loss": 0.34413450956344604, "epoch": 0.466977985323549, "step": 1400 }, { "distill_loss": 0.14552196860313416, "epoch": 0.466977985323549, "step": 1400 }, { "epoch": 0.466977985323549, "ref_ce_loss": 0.22156870365142822, "step": 1400 }, { "epoch": 0.466977985323549, "loss": 0.8011733293533325, "step": 1400 }, { "ce_loss": 0.26975682377815247, "epoch": 0.466977985323549, "step": 1400 }, { "distill_loss": 0.14448806643486023, "epoch": 0.466977985323549, "step": 1400 }, { "epoch": 0.466977985323549, "ref_ce_loss": 0.1277724802494049, "step": 1400 }, { "epoch": 0.466977985323549, "loss": 0.7647350430488586, "step": 1400 }, { "ce_loss": 0.3322110176086426, "epoch": 0.466977985323549, "step": 1400 }, { "distill_loss": 0.15847086906433105, "epoch": 0.466977985323549, "step": 1400 }, { "epoch": 0.466977985323549, "ref_ce_loss": 0.18618299067020416, "step": 1400 }, { "epoch": 0.4703135423615744, "loss": 0.8987, "step": 1410 }, { "epoch": 0.4703135423615744, "grad_norm": 4.312910079956055, "step": 1410 }, { "epoch": 0.4703135423615744, "learning_rate": 0.00029934930694665854, "step": 1410 }, { "epoch": 0.4703135423615744, "loss": 0.7264111638069153, "step": 1410 }, { "ce_loss": 0.33520573377609253, "epoch": 0.4703135423615744, "step": 1410 }, { "distill_loss": 0.2195453941822052, "epoch": 0.4703135423615744, "step": 1410 }, { "epoch": 0.4703135423615744, "ref_ce_loss": 0.17120620608329773, "step": 1410 }, { "epoch": 0.4703135423615744, "loss": 0.6752222180366516, "step": 1410 }, { "ce_loss": 0.3040088713169098, "epoch": 0.4703135423615744, "step": 1410 }, { "distill_loss": 0.15849845111370087, "epoch": 0.4703135423615744, "step": 1410 }, { "epoch": 0.4703135423615744, "ref_ce_loss": 0.21257616579532623, "step": 1410 }, { "epoch": 0.4703135423615744, "loss": 0.7416223287582397, "step": 1410 }, { "ce_loss": 0.32344698905944824, "epoch": 0.4703135423615744, "step": 1410 }, { "distill_loss": 0.20644345879554749, "epoch": 0.4703135423615744, "step": 1410 }, { "epoch": 0.4703135423615744, "ref_ce_loss": 0.14413928985595703, "step": 1410 }, { "epoch": 0.4703135423615744, "loss": 1.558759331703186, "step": 1410 }, { "ce_loss": 0.34111708402633667, "epoch": 0.4703135423615744, "step": 1410 }, { "distill_loss": 0.1806751787662506, "epoch": 0.4703135423615744, "step": 1410 }, { "epoch": 0.4703135423615744, "ref_ce_loss": 0.2201414853334427, "step": 1410 }, { "epoch": 0.47364909939959976, "loss": 0.8553, "step": 1420 }, { "epoch": 0.47364909939959976, "grad_norm": 3.414496898651123, "step": 1420 }, { "epoch": 0.47364909939959976, "learning_rate": 0.0002993303237550256, "step": 1420 }, { "epoch": 0.47364909939959976, "loss": 1.673762559890747, "step": 1420 }, { "ce_loss": 0.3085290789604187, "epoch": 0.47364909939959976, "step": 1420 }, { "distill_loss": 0.16244950890541077, "epoch": 0.47364909939959976, "step": 1420 }, { "epoch": 0.47364909939959976, "ref_ce_loss": 0.24353231489658356, "step": 1420 }, { "epoch": 0.47364909939959976, "loss": 0.9255890846252441, "step": 1420 }, { "ce_loss": 0.40204712748527527, "epoch": 0.47364909939959976, "step": 1420 }, { "distill_loss": 0.15355859696865082, "epoch": 0.47364909939959976, "step": 1420 }, { "epoch": 0.47364909939959976, "ref_ce_loss": 0.13155770301818848, "step": 1420 }, { "epoch": 0.47364909939959976, "loss": 0.9814118146896362, "step": 1420 }, { "ce_loss": 0.4489750266075134, "epoch": 0.47364909939959976, "step": 1420 }, { "distill_loss": 0.17946653068065643, "epoch": 0.47364909939959976, "step": 1420 }, { "epoch": 0.47364909939959976, "ref_ce_loss": 0.2264609932899475, "step": 1420 }, { "epoch": 0.47364909939959976, "loss": 1.0083775520324707, "step": 1420 }, { "ce_loss": 0.3463815450668335, "epoch": 0.47364909939959976, "step": 1420 }, { "distill_loss": 0.16612856090068817, "epoch": 0.47364909939959976, "step": 1420 }, { "epoch": 0.47364909939959976, "ref_ce_loss": 0.2110508233308792, "step": 1420 }, { "epoch": 0.4769846564376251, "loss": 0.9051, "step": 1430 }, { "epoch": 0.4769846564376251, "grad_norm": 2.172429084777832, "step": 1430 }, { "epoch": 0.4769846564376251, "learning_rate": 0.00029931106824378814, "step": 1430 }, { "epoch": 0.4769846564376251, "loss": 0.6023657321929932, "step": 1430 }, { "ce_loss": 0.33847174048423767, "epoch": 0.4769846564376251, "step": 1430 }, { "distill_loss": 0.125957190990448, "epoch": 0.4769846564376251, "step": 1430 }, { "epoch": 0.4769846564376251, "ref_ce_loss": 0.136063352227211, "step": 1430 }, { "epoch": 0.4769846564376251, "loss": 0.745324969291687, "step": 1430 }, { "ce_loss": 0.3038365840911865, "epoch": 0.4769846564376251, "step": 1430 }, { "distill_loss": 0.11901705712080002, "epoch": 0.4769846564376251, "step": 1430 }, { "epoch": 0.4769846564376251, "ref_ce_loss": 0.16039136052131653, "step": 1430 }, { "epoch": 0.4769846564376251, "loss": 0.685943067073822, "step": 1430 }, { "ce_loss": 0.2750985026359558, "epoch": 0.4769846564376251, "step": 1430 }, { "distill_loss": 0.11924824863672256, "epoch": 0.4769846564376251, "step": 1430 }, { "epoch": 0.4769846564376251, "ref_ce_loss": 0.18697425723075867, "step": 1430 }, { "epoch": 0.4769846564376251, "loss": 1.4483582973480225, "step": 1430 }, { "ce_loss": 0.3882851302623749, "epoch": 0.4769846564376251, "step": 1430 }, { "distill_loss": 0.12825100123882294, "epoch": 0.4769846564376251, "step": 1430 }, { "epoch": 0.4769846564376251, "ref_ce_loss": 0.2077254205942154, "step": 1430 }, { "epoch": 0.48032021347565046, "loss": 0.7953, "step": 1440 }, { "epoch": 0.48032021347565046, "grad_norm": 2.2697198390960693, "step": 1440 }, { "epoch": 0.48032021347565046, "learning_rate": 0.00029929154044806063, "step": 1440 }, { "epoch": 0.48032021347565046, "loss": 0.8644044995307922, "step": 1440 }, { "ce_loss": 0.291441947221756, "epoch": 0.48032021347565046, "step": 1440 }, { "distill_loss": 0.11410816758871078, "epoch": 0.48032021347565046, "step": 1440 }, { "epoch": 0.48032021347565046, "ref_ce_loss": 0.12129537761211395, "step": 1440 }, { "epoch": 0.48032021347565046, "loss": 1.0806946754455566, "step": 1440 }, { "ce_loss": 0.29708412289619446, "epoch": 0.48032021347565046, "step": 1440 }, { "distill_loss": 0.1344713270664215, "epoch": 0.48032021347565046, "step": 1440 }, { "epoch": 0.48032021347565046, "ref_ce_loss": 0.1404844969511032, "step": 1440 }, { "epoch": 0.48032021347565046, "loss": 0.6967662572860718, "step": 1440 }, { "ce_loss": 0.31594088673591614, "epoch": 0.48032021347565046, "step": 1440 }, { "distill_loss": 0.10365678369998932, "epoch": 0.48032021347565046, "step": 1440 }, { "epoch": 0.48032021347565046, "ref_ce_loss": 0.19828706979751587, "step": 1440 }, { "epoch": 0.48032021347565046, "loss": 0.8506115674972534, "step": 1440 }, { "ce_loss": 0.26143166422843933, "epoch": 0.48032021347565046, "step": 1440 }, { "distill_loss": 0.11792758107185364, "epoch": 0.48032021347565046, "step": 1440 }, { "epoch": 0.48032021347565046, "ref_ce_loss": 0.1731175184249878, "step": 1440 }, { "epoch": 0.4836557705136758, "loss": 0.8059, "step": 1450 }, { "epoch": 0.4836557705136758, "grad_norm": 2.886183738708496, "step": 1450 }, { "epoch": 0.4836557705136758, "learning_rate": 0.00029927174040345403, "step": 1450 }, { "epoch": 0.4836557705136758, "loss": 0.8864375352859497, "step": 1450 }, { "ce_loss": 0.3058259189128876, "epoch": 0.4836557705136758, "step": 1450 }, { "distill_loss": 0.11227105557918549, "epoch": 0.4836557705136758, "step": 1450 }, { "epoch": 0.4836557705136758, "ref_ce_loss": 0.21765603125095367, "step": 1450 }, { "epoch": 0.4836557705136758, "loss": 0.499306857585907, "step": 1450 }, { "ce_loss": 0.2557595372200012, "epoch": 0.4836557705136758, "step": 1450 }, { "distill_loss": 0.10255733132362366, "epoch": 0.4836557705136758, "step": 1450 }, { "epoch": 0.4836557705136758, "ref_ce_loss": 0.14077462255954742, "step": 1450 }, { "epoch": 0.4836557705136758, "loss": 1.0722293853759766, "step": 1450 }, { "ce_loss": 0.47057244181632996, "epoch": 0.4836557705136758, "step": 1450 }, { "distill_loss": 0.14682574570178986, "epoch": 0.4836557705136758, "step": 1450 }, { "epoch": 0.4836557705136758, "ref_ce_loss": 0.19197282195091248, "step": 1450 }, { "epoch": 0.4836557705136758, "loss": 0.6186968088150024, "step": 1450 }, { "ce_loss": 0.29221686720848083, "epoch": 0.4836557705136758, "step": 1450 }, { "distill_loss": 0.1100810170173645, "epoch": 0.4836557705136758, "step": 1450 }, { "epoch": 0.4836557705136758, "ref_ce_loss": 0.15563970804214478, "step": 1450 }, { "epoch": 0.48699132755170116, "loss": 0.9554, "step": 1460 }, { "epoch": 0.48699132755170116, "grad_norm": 2.7962470054626465, "step": 1460 }, { "epoch": 0.48699132755170116, "learning_rate": 0.00029925166814607585, "step": 1460 }, { "epoch": 0.48699132755170116, "loss": 1.372177243232727, "step": 1460 }, { "ce_loss": 0.3510309159755707, "epoch": 0.48699132755170116, "step": 1460 }, { "distill_loss": 0.1559959203004837, "epoch": 0.48699132755170116, "step": 1460 }, { "epoch": 0.48699132755170116, "ref_ce_loss": 0.14250187575817108, "step": 1460 }, { "epoch": 0.48699132755170116, "loss": 0.8158239722251892, "step": 1460 }, { "ce_loss": 0.32633304595947266, "epoch": 0.48699132755170116, "step": 1460 }, { "distill_loss": 0.1910652071237564, "epoch": 0.48699132755170116, "step": 1460 }, { "epoch": 0.48699132755170116, "ref_ce_loss": 0.16944916546344757, "step": 1460 }, { "epoch": 0.48699132755170116, "loss": 1.0479116439819336, "step": 1460 }, { "ce_loss": 0.39617040753364563, "epoch": 0.48699132755170116, "step": 1460 }, { "distill_loss": 0.19950151443481445, "epoch": 0.48699132755170116, "step": 1460 }, { "epoch": 0.48699132755170116, "ref_ce_loss": 0.16884933412075043, "step": 1460 }, { "epoch": 0.48699132755170116, "loss": 0.8634979724884033, "step": 1460 }, { "ce_loss": 0.3990035355091095, "epoch": 0.48699132755170116, "step": 1460 }, { "distill_loss": 0.1979626566171646, "epoch": 0.48699132755170116, "step": 1460 }, { "epoch": 0.48699132755170116, "ref_ce_loss": 0.18388716876506805, "step": 1460 }, { "epoch": 0.4903268845897265, "loss": 0.8563, "step": 1470 }, { "epoch": 0.4903268845897265, "grad_norm": 1.9522337913513184, "step": 1470 }, { "epoch": 0.4903268845897265, "learning_rate": 0.00029923132371252993, "step": 1470 }, { "epoch": 0.4903268845897265, "loss": 0.5769478678703308, "step": 1470 }, { "ce_loss": 0.2803075611591339, "epoch": 0.4903268845897265, "step": 1470 }, { "distill_loss": 0.1305488646030426, "epoch": 0.4903268845897265, "step": 1470 }, { "epoch": 0.4903268845897265, "ref_ce_loss": 0.12108226865530014, "step": 1470 }, { "epoch": 0.4903268845897265, "loss": 0.784702479839325, "step": 1470 }, { "ce_loss": 0.33124253153800964, "epoch": 0.4903268845897265, "step": 1470 }, { "distill_loss": 0.1394960582256317, "epoch": 0.4903268845897265, "step": 1470 }, { "epoch": 0.4903268845897265, "ref_ce_loss": 0.20602154731750488, "step": 1470 }, { "epoch": 0.4903268845897265, "loss": 0.5377941131591797, "step": 1470 }, { "ce_loss": 0.2358846515417099, "epoch": 0.4903268845897265, "step": 1470 }, { "distill_loss": 0.1433420330286026, "epoch": 0.4903268845897265, "step": 1470 }, { "epoch": 0.4903268845897265, "ref_ce_loss": 0.14286667108535767, "step": 1470 }, { "epoch": 0.4903268845897265, "loss": 0.8376799821853638, "step": 1470 }, { "ce_loss": 0.2516549825668335, "epoch": 0.4903268845897265, "step": 1470 }, { "distill_loss": 0.11286590993404388, "epoch": 0.4903268845897265, "step": 1470 }, { "epoch": 0.4903268845897265, "ref_ce_loss": 0.09511788934469223, "step": 1470 }, { "epoch": 0.49366244162775186, "loss": 0.8752, "step": 1480 }, { "epoch": 0.49366244162775186, "grad_norm": 3.132399797439575, "step": 1480 }, { "epoch": 0.49366244162775186, "learning_rate": 0.0002992107071399165, "step": 1480 }, { "epoch": 0.49366244162775186, "loss": 1.5298254489898682, "step": 1480 }, { "ce_loss": 0.39849844574928284, "epoch": 0.49366244162775186, "step": 1480 }, { "distill_loss": 0.16513606905937195, "epoch": 0.49366244162775186, "step": 1480 }, { "epoch": 0.49366244162775186, "ref_ce_loss": 0.2410481870174408, "step": 1480 }, { "epoch": 0.49366244162775186, "loss": 1.0180957317352295, "step": 1480 }, { "ce_loss": 0.33581236004829407, "epoch": 0.49366244162775186, "step": 1480 }, { "distill_loss": 0.15931996703147888, "epoch": 0.49366244162775186, "step": 1480 }, { "epoch": 0.49366244162775186, "ref_ce_loss": 0.21301725506782532, "step": 1480 }, { "epoch": 0.49366244162775186, "loss": 0.7502650022506714, "step": 1480 }, { "ce_loss": 0.382405549287796, "epoch": 0.49366244162775186, "step": 1480 }, { "distill_loss": 0.12572389841079712, "epoch": 0.49366244162775186, "step": 1480 }, { "epoch": 0.49366244162775186, "ref_ce_loss": 0.17360630631446838, "step": 1480 }, { "epoch": 0.49366244162775186, "loss": 1.0398483276367188, "step": 1480 }, { "ce_loss": 0.30432286858558655, "epoch": 0.49366244162775186, "step": 1480 }, { "distill_loss": 0.14817538857460022, "epoch": 0.49366244162775186, "step": 1480 }, { "epoch": 0.49366244162775186, "ref_ce_loss": 0.17467889189720154, "step": 1480 }, { "epoch": 0.4969979986657772, "loss": 0.8252, "step": 1490 }, { "epoch": 0.4969979986657772, "grad_norm": 4.466595649719238, "step": 1490 }, { "epoch": 0.4969979986657772, "learning_rate": 0.0002991898184658321, "step": 1490 }, { "epoch": 0.4969979986657772, "loss": 0.729331910610199, "step": 1490 }, { "ce_loss": 0.22167912125587463, "epoch": 0.4969979986657772, "step": 1490 }, { "distill_loss": 0.1331212818622589, "epoch": 0.4969979986657772, "step": 1490 }, { "epoch": 0.4969979986657772, "ref_ce_loss": 0.10691002011299133, "step": 1490 }, { "epoch": 0.4969979986657772, "loss": 0.8190153241157532, "step": 1490 }, { "ce_loss": 0.3913503587245941, "epoch": 0.4969979986657772, "step": 1490 }, { "distill_loss": 0.11496517062187195, "epoch": 0.4969979986657772, "step": 1490 }, { "epoch": 0.4969979986657772, "ref_ce_loss": 0.19716744124889374, "step": 1490 }, { "epoch": 0.4969979986657772, "loss": 0.6939219832420349, "step": 1490 }, { "ce_loss": 0.3533763289451599, "epoch": 0.4969979986657772, "step": 1490 }, { "distill_loss": 0.12615175545215607, "epoch": 0.4969979986657772, "step": 1490 }, { "epoch": 0.4969979986657772, "ref_ce_loss": 0.21436937153339386, "step": 1490 }, { "epoch": 0.4969979986657772, "loss": 0.7554702758789062, "step": 1490 }, { "ce_loss": 0.2819823920726776, "epoch": 0.4969979986657772, "step": 1490 }, { "distill_loss": 0.13555271923542023, "epoch": 0.4969979986657772, "step": 1490 }, { "epoch": 0.4969979986657772, "ref_ce_loss": 0.224581778049469, "step": 1490 }, { "epoch": 0.5003335557038026, "loss": 0.8087, "step": 1500 }, { "epoch": 0.5003335557038026, "grad_norm": 4.537039279937744, "step": 1500 }, { "epoch": 0.5003335557038026, "learning_rate": 0.0002991686577283694, "step": 1500 }, { "epoch": 0.5003335557038026, "loss": 0.8498672246932983, "step": 1500 }, { "ce_loss": 0.33424246311187744, "epoch": 0.5003335557038026, "step": 1500 }, { "distill_loss": 0.11146187782287598, "epoch": 0.5003335557038026, "step": 1500 }, { "epoch": 0.5003335557038026, "ref_ce_loss": 0.133957639336586, "step": 1500 }, { "epoch": 0.5003335557038026, "loss": 0.7086281180381775, "step": 1500 }, { "ce_loss": 0.29514920711517334, "epoch": 0.5003335557038026, "step": 1500 }, { "distill_loss": 0.09464933723211288, "epoch": 0.5003335557038026, "step": 1500 }, { "epoch": 0.5003335557038026, "ref_ce_loss": 0.21752260625362396, "step": 1500 }, { "epoch": 0.5003335557038026, "loss": 1.3933093547821045, "step": 1500 }, { "ce_loss": 0.5271459817886353, "epoch": 0.5003335557038026, "step": 1500 }, { "distill_loss": 0.14879895746707916, "epoch": 0.5003335557038026, "step": 1500 }, { "epoch": 0.5003335557038026, "ref_ce_loss": 0.24883712828159332, "step": 1500 }, { "epoch": 0.5003335557038026, "loss": 0.5399357080459595, "step": 1500 }, { "ce_loss": 0.23881879448890686, "epoch": 0.5003335557038026, "step": 1500 }, { "distill_loss": 0.09969379752874374, "epoch": 0.5003335557038026, "step": 1500 }, { "epoch": 0.5003335557038026, "ref_ce_loss": 0.12885983288288116, "step": 1500 }, { "epoch": 0.5036691127418279, "loss": 0.8127, "step": 1510 }, { "epoch": 0.5036691127418279, "grad_norm": 1.9899190664291382, "step": 1510 }, { "epoch": 0.5036691127418279, "learning_rate": 0.0002991472249661172, "step": 1510 }, { "epoch": 0.5036691127418279, "loss": 0.611236572265625, "step": 1510 }, { "ce_loss": 0.2055187225341797, "epoch": 0.5036691127418279, "step": 1510 }, { "distill_loss": 0.1261042356491089, "epoch": 0.5036691127418279, "step": 1510 }, { "epoch": 0.5036691127418279, "ref_ce_loss": 0.17034177482128143, "step": 1510 }, { "epoch": 0.5036691127418279, "loss": 1.0687274932861328, "step": 1510 }, { "ce_loss": 0.2887864410877228, "epoch": 0.5036691127418279, "step": 1510 }, { "distill_loss": 0.13521718978881836, "epoch": 0.5036691127418279, "step": 1510 }, { "epoch": 0.5036691127418279, "ref_ce_loss": 0.17292891442775726, "step": 1510 }, { "epoch": 0.5036691127418279, "loss": 0.822563111782074, "step": 1510 }, { "ce_loss": 0.3748740255832672, "epoch": 0.5036691127418279, "step": 1510 }, { "distill_loss": 0.1336393654346466, "epoch": 0.5036691127418279, "step": 1510 }, { "epoch": 0.5036691127418279, "ref_ce_loss": 0.2078617364168167, "step": 1510 }, { "epoch": 0.5036691127418279, "loss": 0.7008830904960632, "step": 1510 }, { "ce_loss": 0.31130093336105347, "epoch": 0.5036691127418279, "step": 1510 }, { "distill_loss": 0.1265094131231308, "epoch": 0.5036691127418279, "step": 1510 }, { "epoch": 0.5036691127418279, "ref_ce_loss": 0.18166042864322662, "step": 1510 }, { "epoch": 0.5070046697798533, "loss": 0.7516, "step": 1520 }, { "epoch": 0.5070046697798533, "grad_norm": 3.6005780696868896, "step": 1520 }, { "epoch": 0.5070046697798533, "learning_rate": 0.00029912552021816045, "step": 1520 }, { "epoch": 0.5070046697798533, "loss": 0.7088308930397034, "step": 1520 }, { "ce_loss": 0.252478688955307, "epoch": 0.5070046697798533, "step": 1520 }, { "distill_loss": 0.08968272060155869, "epoch": 0.5070046697798533, "step": 1520 }, { "epoch": 0.5070046697798533, "ref_ce_loss": 0.1475578248500824, "step": 1520 }, { "epoch": 0.5070046697798533, "loss": 0.718932032585144, "step": 1520 }, { "ce_loss": 0.35802438855171204, "epoch": 0.5070046697798533, "step": 1520 }, { "distill_loss": 0.10070653259754181, "epoch": 0.5070046697798533, "step": 1520 }, { "epoch": 0.5070046697798533, "ref_ce_loss": 0.12593254446983337, "step": 1520 }, { "epoch": 0.5070046697798533, "loss": 0.6470715403556824, "step": 1520 }, { "ce_loss": 0.2930530607700348, "epoch": 0.5070046697798533, "step": 1520 }, { "distill_loss": 0.11143629252910614, "epoch": 0.5070046697798533, "step": 1520 }, { "epoch": 0.5070046697798533, "ref_ce_loss": 0.17003856599330902, "step": 1520 }, { "epoch": 0.5070046697798533, "loss": 0.6389113664627075, "step": 1520 }, { "ce_loss": 0.37416961789131165, "epoch": 0.5070046697798533, "step": 1520 }, { "distill_loss": 0.11495236307382584, "epoch": 0.5070046697798533, "step": 1520 }, { "epoch": 0.5070046697798533, "ref_ce_loss": 0.1497492641210556, "step": 1520 }, { "epoch": 0.5103402268178786, "loss": 0.7928, "step": 1530 }, { "epoch": 0.5103402268178786, "grad_norm": 2.498560667037964, "step": 1530 }, { "epoch": 0.5103402268178786, "learning_rate": 0.00029910354352408, "step": 1530 }, { "epoch": 0.5103402268178786, "loss": 0.6787996888160706, "step": 1530 }, { "ce_loss": 0.26317986845970154, "epoch": 0.5103402268178786, "step": 1530 }, { "distill_loss": 0.11007644236087799, "epoch": 0.5103402268178786, "step": 1530 }, { "epoch": 0.5103402268178786, "ref_ce_loss": 0.22199080884456635, "step": 1530 }, { "epoch": 0.5103402268178786, "loss": 0.5043588876724243, "step": 1530 }, { "ce_loss": 0.2592538893222809, "epoch": 0.5103402268178786, "step": 1530 }, { "distill_loss": 0.12781588733196259, "epoch": 0.5103402268178786, "step": 1530 }, { "epoch": 0.5103402268178786, "ref_ce_loss": 0.11714787781238556, "step": 1530 }, { "epoch": 0.5103402268178786, "loss": 0.6850191950798035, "step": 1530 }, { "ce_loss": 0.3347513973712921, "epoch": 0.5103402268178786, "step": 1530 }, { "distill_loss": 0.11932611465454102, "epoch": 0.5103402268178786, "step": 1530 }, { "epoch": 0.5103402268178786, "ref_ce_loss": 0.23088660836219788, "step": 1530 }, { "epoch": 0.5103402268178786, "loss": 0.5668002367019653, "step": 1530 }, { "ce_loss": 0.20559754967689514, "epoch": 0.5103402268178786, "step": 1530 }, { "distill_loss": 0.11886685341596603, "epoch": 0.5103402268178786, "step": 1530 }, { "epoch": 0.5103402268178786, "ref_ce_loss": 0.16831466555595398, "step": 1530 }, { "epoch": 0.513675783855904, "loss": 0.8024, "step": 1540 }, { "epoch": 0.513675783855904, "grad_norm": 3.3080246448516846, "step": 1540 }, { "epoch": 0.513675783855904, "learning_rate": 0.0002990812949239528, "step": 1540 }, { "epoch": 0.513675783855904, "loss": 0.6707828044891357, "step": 1540 }, { "ce_loss": 0.2728685736656189, "epoch": 0.513675783855904, "step": 1540 }, { "distill_loss": 0.12154404073953629, "epoch": 0.513675783855904, "step": 1540 }, { "epoch": 0.513675783855904, "ref_ce_loss": 0.18021827936172485, "step": 1540 }, { "epoch": 0.513675783855904, "loss": 0.7965375185012817, "step": 1540 }, { "ce_loss": 0.34719210863113403, "epoch": 0.513675783855904, "step": 1540 }, { "distill_loss": 0.09814593195915222, "epoch": 0.513675783855904, "step": 1540 }, { "epoch": 0.513675783855904, "ref_ce_loss": 0.25575193762779236, "step": 1540 }, { "epoch": 0.513675783855904, "loss": 0.8639137148857117, "step": 1540 }, { "ce_loss": 0.3147978186607361, "epoch": 0.513675783855904, "step": 1540 }, { "distill_loss": 0.11294306814670563, "epoch": 0.513675783855904, "step": 1540 }, { "epoch": 0.513675783855904, "ref_ce_loss": 0.2474347949028015, "step": 1540 }, { "epoch": 0.513675783855904, "loss": 0.4631844758987427, "step": 1540 }, { "ce_loss": 0.2097543329000473, "epoch": 0.513675783855904, "step": 1540 }, { "distill_loss": 0.09885016828775406, "epoch": 0.513675783855904, "step": 1540 }, { "epoch": 0.513675783855904, "ref_ce_loss": 0.1541988104581833, "step": 1540 }, { "epoch": 0.5170113408939293, "loss": 0.7923, "step": 1550 }, { "epoch": 0.5170113408939293, "grad_norm": 1.7913470268249512, "step": 1550 }, { "epoch": 0.5170113408939293, "learning_rate": 0.0002990587744583514, "step": 1550 }, { "epoch": 0.5170113408939293, "loss": 0.7131497859954834, "step": 1550 }, { "ce_loss": 0.35181015729904175, "epoch": 0.5170113408939293, "step": 1550 }, { "distill_loss": 0.11702708154916763, "epoch": 0.5170113408939293, "step": 1550 }, { "epoch": 0.5170113408939293, "ref_ce_loss": 0.18052375316619873, "step": 1550 }, { "epoch": 0.5170113408939293, "loss": 0.707023024559021, "step": 1550 }, { "ce_loss": 0.27232182025909424, "epoch": 0.5170113408939293, "step": 1550 }, { "distill_loss": 0.10445040464401245, "epoch": 0.5170113408939293, "step": 1550 }, { "epoch": 0.5170113408939293, "ref_ce_loss": 0.13911911845207214, "step": 1550 }, { "epoch": 0.5170113408939293, "loss": 0.6784090995788574, "step": 1550 }, { "ce_loss": 0.2939833402633667, "epoch": 0.5170113408939293, "step": 1550 }, { "distill_loss": 0.08660943061113358, "epoch": 0.5170113408939293, "step": 1550 }, { "epoch": 0.5170113408939293, "ref_ce_loss": 0.18671560287475586, "step": 1550 }, { "epoch": 0.5170113408939293, "loss": 1.2959554195404053, "step": 1550 }, { "ce_loss": 0.3662611246109009, "epoch": 0.5170113408939293, "step": 1550 }, { "distill_loss": 0.11476387083530426, "epoch": 0.5170113408939293, "step": 1550 }, { "epoch": 0.5170113408939293, "ref_ce_loss": 0.1677011251449585, "step": 1550 }, { "epoch": 0.5203468979319547, "loss": 0.8326, "step": 1560 }, { "epoch": 0.5203468979319547, "grad_norm": 2.679382562637329, "step": 1560 }, { "epoch": 0.5203468979319547, "learning_rate": 0.0002990359821683443, "step": 1560 }, { "epoch": 0.5203468979319547, "loss": 0.5702127814292908, "step": 1560 }, { "ce_loss": 0.21940156817436218, "epoch": 0.5203468979319547, "step": 1560 }, { "distill_loss": 0.1002032607793808, "epoch": 0.5203468979319547, "step": 1560 }, { "epoch": 0.5203468979319547, "ref_ce_loss": 0.16673019528388977, "step": 1560 }, { "epoch": 0.5203468979319547, "loss": 0.8199391961097717, "step": 1560 }, { "ce_loss": 0.2875426709651947, "epoch": 0.5203468979319547, "step": 1560 }, { "distill_loss": 0.1570519655942917, "epoch": 0.5203468979319547, "step": 1560 }, { "epoch": 0.5203468979319547, "ref_ce_loss": 0.1424614042043686, "step": 1560 }, { "epoch": 0.5203468979319547, "loss": 0.8121156692504883, "step": 1560 }, { "ce_loss": 0.3326272964477539, "epoch": 0.5203468979319547, "step": 1560 }, { "distill_loss": 0.13725823163986206, "epoch": 0.5203468979319547, "step": 1560 }, { "epoch": 0.5203468979319547, "ref_ce_loss": 0.2615748643875122, "step": 1560 }, { "epoch": 0.5203468979319547, "loss": 0.8154400587081909, "step": 1560 }, { "ce_loss": 0.2914092242717743, "epoch": 0.5203468979319547, "step": 1560 }, { "distill_loss": 0.14719057083129883, "epoch": 0.5203468979319547, "step": 1560 }, { "epoch": 0.5203468979319547, "ref_ce_loss": 0.21865858137607574, "step": 1560 }, { "epoch": 0.52368245496998, "loss": 0.7554, "step": 1570 }, { "epoch": 0.52368245496998, "grad_norm": 3.8052194118499756, "step": 1570 }, { "epoch": 0.52368245496998, "learning_rate": 0.0002990129180954956, "step": 1570 }, { "epoch": 0.52368245496998, "loss": 0.7150813341140747, "step": 1570 }, { "ce_loss": 0.18721269071102142, "epoch": 0.52368245496998, "step": 1570 }, { "distill_loss": 0.10450568795204163, "epoch": 0.52368245496998, "step": 1570 }, { "epoch": 0.52368245496998, "ref_ce_loss": 0.10055077075958252, "step": 1570 }, { "epoch": 0.52368245496998, "loss": 0.7558162212371826, "step": 1570 }, { "ce_loss": 0.25569626688957214, "epoch": 0.52368245496998, "step": 1570 }, { "distill_loss": 0.12277135252952576, "epoch": 0.52368245496998, "step": 1570 }, { "epoch": 0.52368245496998, "ref_ce_loss": 0.12362133711576462, "step": 1570 }, { "epoch": 0.52368245496998, "loss": 1.325268030166626, "step": 1570 }, { "ce_loss": 0.3507300913333893, "epoch": 0.52368245496998, "step": 1570 }, { "distill_loss": 0.1343396157026291, "epoch": 0.52368245496998, "step": 1570 }, { "epoch": 0.52368245496998, "ref_ce_loss": 0.2221112996339798, "step": 1570 }, { "epoch": 0.52368245496998, "loss": 0.6346928477287292, "step": 1570 }, { "ce_loss": 0.2960171401500702, "epoch": 0.52368245496998, "step": 1570 }, { "distill_loss": 0.1436004638671875, "epoch": 0.52368245496998, "step": 1570 }, { "epoch": 0.52368245496998, "ref_ce_loss": 0.11697656661272049, "step": 1570 }, { "epoch": 0.5270180120080054, "loss": 0.7996, "step": 1580 }, { "epoch": 0.5270180120080054, "grad_norm": 1.8267759084701538, "step": 1580 }, { "epoch": 0.5270180120080054, "learning_rate": 0.0002989895822818651, "step": 1580 }, { "epoch": 0.5270180120080054, "loss": 0.6526615619659424, "step": 1580 }, { "ce_loss": 0.15909507870674133, "epoch": 0.5270180120080054, "step": 1580 }, { "distill_loss": 0.12408677488565445, "epoch": 0.5270180120080054, "step": 1580 }, { "epoch": 0.5270180120080054, "ref_ce_loss": 0.19116757810115814, "step": 1580 }, { "epoch": 0.5270180120080054, "loss": 0.5174135565757751, "step": 1580 }, { "ce_loss": 0.16595187783241272, "epoch": 0.5270180120080054, "step": 1580 }, { "distill_loss": 0.13370013236999512, "epoch": 0.5270180120080054, "step": 1580 }, { "epoch": 0.5270180120080054, "ref_ce_loss": 0.1358727216720581, "step": 1580 }, { "epoch": 0.5270180120080054, "loss": 0.6547532677650452, "step": 1580 }, { "ce_loss": 0.30841338634490967, "epoch": 0.5270180120080054, "step": 1580 }, { "distill_loss": 0.1410057544708252, "epoch": 0.5270180120080054, "step": 1580 }, { "epoch": 0.5270180120080054, "ref_ce_loss": 0.15567293763160706, "step": 1580 }, { "epoch": 0.5270180120080054, "loss": 0.6471787691116333, "step": 1580 }, { "ce_loss": 0.27511271834373474, "epoch": 0.5270180120080054, "step": 1580 }, { "distill_loss": 0.136945903301239, "epoch": 0.5270180120080054, "step": 1580 }, { "epoch": 0.5270180120080054, "ref_ce_loss": 0.23433144390583038, "step": 1580 }, { "epoch": 0.5303535690460307, "loss": 0.7405, "step": 1590 }, { "epoch": 0.5303535690460307, "grad_norm": 1.9112555980682373, "step": 1590 }, { "epoch": 0.5303535690460307, "learning_rate": 0.00029896597477000803, "step": 1590 }, { "epoch": 0.5303535690460307, "loss": 0.8480120301246643, "step": 1590 }, { "ce_loss": 0.2612304091453552, "epoch": 0.5303535690460307, "step": 1590 }, { "distill_loss": 0.12277919799089432, "epoch": 0.5303535690460307, "step": 1590 }, { "epoch": 0.5303535690460307, "ref_ce_loss": 0.14804160594940186, "step": 1590 }, { "epoch": 0.5303535690460307, "loss": 0.6312728524208069, "step": 1590 }, { "ce_loss": 0.299432635307312, "epoch": 0.5303535690460307, "step": 1590 }, { "distill_loss": 0.13859985768795013, "epoch": 0.5303535690460307, "step": 1590 }, { "epoch": 0.5303535690460307, "ref_ce_loss": 0.19271425902843475, "step": 1590 }, { "epoch": 0.5303535690460307, "loss": 0.5959420800209045, "step": 1590 }, { "ce_loss": 0.27959901094436646, "epoch": 0.5303535690460307, "step": 1590 }, { "distill_loss": 0.12362391501665115, "epoch": 0.5303535690460307, "step": 1590 }, { "epoch": 0.5303535690460307, "ref_ce_loss": 0.1926553100347519, "step": 1590 }, { "epoch": 0.5303535690460307, "loss": 0.6874289512634277, "step": 1590 }, { "ce_loss": 0.3542376160621643, "epoch": 0.5303535690460307, "step": 1590 }, { "distill_loss": 0.1559729278087616, "epoch": 0.5303535690460307, "step": 1590 }, { "epoch": 0.5303535690460307, "ref_ce_loss": 0.1770813763141632, "step": 1590 }, { "epoch": 0.533689126084056, "loss": 0.7949, "step": 1600 }, { "epoch": 0.533689126084056, "grad_norm": 1.7462742328643799, "step": 1600 }, { "epoch": 0.533689126084056, "learning_rate": 0.00029894209560297536, "step": 1600 }, { "epoch": 0.533689126084056, "loss": 0.7205913066864014, "step": 1600 }, { "ce_loss": 0.3525589406490326, "epoch": 0.533689126084056, "step": 1600 }, { "distill_loss": 0.16837641596794128, "epoch": 0.533689126084056, "step": 1600 }, { "epoch": 0.533689126084056, "ref_ce_loss": 0.19949723780155182, "step": 1600 }, { "epoch": 0.533689126084056, "loss": 1.0897554159164429, "step": 1600 }, { "ce_loss": 0.24649381637573242, "epoch": 0.533689126084056, "step": 1600 }, { "distill_loss": 0.14514781534671783, "epoch": 0.533689126084056, "step": 1600 }, { "epoch": 0.533689126084056, "ref_ce_loss": 0.20254717767238617, "step": 1600 }, { "epoch": 0.533689126084056, "loss": 0.6646853089332581, "step": 1600 }, { "ce_loss": 0.2682981491088867, "epoch": 0.533689126084056, "step": 1600 }, { "distill_loss": 0.1883355677127838, "epoch": 0.533689126084056, "step": 1600 }, { "epoch": 0.533689126084056, "ref_ce_loss": 0.11741480231285095, "step": 1600 }, { "epoch": 0.533689126084056, "loss": 0.6180636882781982, "step": 1600 }, { "ce_loss": 0.2548883855342865, "epoch": 0.533689126084056, "step": 1600 }, { "distill_loss": 0.16024237871170044, "epoch": 0.533689126084056, "step": 1600 }, { "epoch": 0.533689126084056, "ref_ce_loss": 0.10766757279634476, "step": 1600 }, { "epoch": 0.5370246831220814, "loss": 0.8356, "step": 1610 }, { "epoch": 0.5370246831220814, "grad_norm": 2.262484312057495, "step": 1610 }, { "epoch": 0.5370246831220814, "learning_rate": 0.00029891794482431313, "step": 1610 }, { "epoch": 0.5370246831220814, "loss": 1.4120798110961914, "step": 1610 }, { "ce_loss": 0.3346101939678192, "epoch": 0.5370246831220814, "step": 1610 }, { "distill_loss": 0.14435593783855438, "epoch": 0.5370246831220814, "step": 1610 }, { "epoch": 0.5370246831220814, "ref_ce_loss": 0.17607466876506805, "step": 1610 }, { "epoch": 0.5370246831220814, "loss": 0.8668168187141418, "step": 1610 }, { "ce_loss": 0.2469944804906845, "epoch": 0.5370246831220814, "step": 1610 }, { "distill_loss": 0.11261950433254242, "epoch": 0.5370246831220814, "step": 1610 }, { "epoch": 0.5370246831220814, "ref_ce_loss": 0.20903988182544708, "step": 1610 }, { "epoch": 0.5370246831220814, "loss": 0.6093182563781738, "step": 1610 }, { "ce_loss": 0.27436932921409607, "epoch": 0.5370246831220814, "step": 1610 }, { "distill_loss": 0.1148870661854744, "epoch": 0.5370246831220814, "step": 1610 }, { "epoch": 0.5370246831220814, "ref_ce_loss": 0.21992698311805725, "step": 1610 }, { "epoch": 0.5370246831220814, "loss": 0.9816043376922607, "step": 1610 }, { "ce_loss": 0.3318880796432495, "epoch": 0.5370246831220814, "step": 1610 }, { "distill_loss": 0.1468740850687027, "epoch": 0.5370246831220814, "step": 1610 }, { "epoch": 0.5370246831220814, "ref_ce_loss": 0.17123477160930634, "step": 1610 }, { "epoch": 0.5403602401601068, "loss": 0.907, "step": 1620 }, { "epoch": 0.5403602401601068, "grad_norm": 4.384459972381592, "step": 1620 }, { "epoch": 0.5403602401601068, "learning_rate": 0.0002988935224780629, "step": 1620 }, { "epoch": 0.5403602401601068, "loss": 0.7424207329750061, "step": 1620 }, { "ce_loss": 0.3590454161167145, "epoch": 0.5403602401601068, "step": 1620 }, { "distill_loss": 0.17425327003002167, "epoch": 0.5403602401601068, "step": 1620 }, { "epoch": 0.5403602401601068, "ref_ce_loss": 0.20896899700164795, "step": 1620 }, { "epoch": 0.5403602401601068, "loss": 0.6844779849052429, "step": 1620 }, { "ce_loss": 0.3099417984485626, "epoch": 0.5403602401601068, "step": 1620 }, { "distill_loss": 0.14133571088314056, "epoch": 0.5403602401601068, "step": 1620 }, { "epoch": 0.5403602401601068, "ref_ce_loss": 0.15446357429027557, "step": 1620 }, { "epoch": 0.5403602401601068, "loss": 0.7723664045333862, "step": 1620 }, { "ce_loss": 0.3412700593471527, "epoch": 0.5403602401601068, "step": 1620 }, { "distill_loss": 0.14123709499835968, "epoch": 0.5403602401601068, "step": 1620 }, { "epoch": 0.5403602401601068, "ref_ce_loss": 0.18426023423671722, "step": 1620 }, { "epoch": 0.5403602401601068, "loss": 1.0038042068481445, "step": 1620 }, { "ce_loss": 0.2636459171772003, "epoch": 0.5403602401601068, "step": 1620 }, { "distill_loss": 0.14897647500038147, "epoch": 0.5403602401601068, "step": 1620 }, { "epoch": 0.5403602401601068, "ref_ce_loss": 0.16534221172332764, "step": 1620 }, { "epoch": 0.5436957971981321, "loss": 0.767, "step": 1630 }, { "epoch": 0.5436957971981321, "grad_norm": 1.8759695291519165, "step": 1630 }, { "epoch": 0.5436957971981321, "learning_rate": 0.00029886882860876134, "step": 1630 }, { "epoch": 0.5436957971981321, "loss": 0.6272376775741577, "step": 1630 }, { "ce_loss": 0.21282872557640076, "epoch": 0.5436957971981321, "step": 1630 }, { "distill_loss": 0.11778245866298676, "epoch": 0.5436957971981321, "step": 1630 }, { "epoch": 0.5436957971981321, "ref_ce_loss": 0.20675376057624817, "step": 1630 }, { "epoch": 0.5436957971981321, "loss": 0.5564039945602417, "step": 1630 }, { "ce_loss": 0.19067807495594025, "epoch": 0.5436957971981321, "step": 1630 }, { "distill_loss": 0.12105537950992584, "epoch": 0.5436957971981321, "step": 1630 }, { "epoch": 0.5436957971981321, "ref_ce_loss": 0.1097465455532074, "step": 1630 }, { "epoch": 0.5436957971981321, "loss": 0.9002183675765991, "step": 1630 }, { "ce_loss": 0.386663019657135, "epoch": 0.5436957971981321, "step": 1630 }, { "distill_loss": 0.1761159598827362, "epoch": 0.5436957971981321, "step": 1630 }, { "epoch": 0.5436957971981321, "ref_ce_loss": 0.21742606163024902, "step": 1630 }, { "epoch": 0.5436957971981321, "loss": 0.7056699395179749, "step": 1630 }, { "ce_loss": 0.23579426109790802, "epoch": 0.5436957971981321, "step": 1630 }, { "distill_loss": 0.13872799277305603, "epoch": 0.5436957971981321, "step": 1630 }, { "epoch": 0.5436957971981321, "ref_ce_loss": 0.12659761309623718, "step": 1630 }, { "epoch": 0.5470313542361575, "loss": 0.8049, "step": 1640 }, { "epoch": 0.5470313542361575, "grad_norm": 1.8977726697921753, "step": 1640 }, { "epoch": 0.5470313542361575, "learning_rate": 0.0002988438632614404, "step": 1640 }, { "epoch": 0.5470313542361575, "loss": 0.73987877368927, "step": 1640 }, { "ce_loss": 0.337587833404541, "epoch": 0.5470313542361575, "step": 1640 }, { "distill_loss": 0.11915989220142365, "epoch": 0.5470313542361575, "step": 1640 }, { "epoch": 0.5470313542361575, "ref_ce_loss": 0.16175560653209686, "step": 1640 }, { "epoch": 0.5470313542361575, "loss": 1.201418161392212, "step": 1640 }, { "ce_loss": 0.3867391049861908, "epoch": 0.5470313542361575, "step": 1640 }, { "distill_loss": 0.1336623579263687, "epoch": 0.5470313542361575, "step": 1640 }, { "epoch": 0.5470313542361575, "ref_ce_loss": 0.20569270849227905, "step": 1640 }, { "epoch": 0.5470313542361575, "loss": 1.7483572959899902, "step": 1640 }, { "ce_loss": 0.2928985059261322, "epoch": 0.5470313542361575, "step": 1640 }, { "distill_loss": 0.1285347044467926, "epoch": 0.5470313542361575, "step": 1640 }, { "epoch": 0.5470313542361575, "ref_ce_loss": 0.19875165820121765, "step": 1640 }, { "epoch": 0.5470313542361575, "loss": 0.7967950701713562, "step": 1640 }, { "ce_loss": 0.39166033267974854, "epoch": 0.5470313542361575, "step": 1640 }, { "distill_loss": 0.1525496244430542, "epoch": 0.5470313542361575, "step": 1640 }, { "epoch": 0.5470313542361575, "ref_ce_loss": 0.25225165486335754, "step": 1640 }, { "epoch": 0.5503669112741828, "loss": 0.8833, "step": 1650 }, { "epoch": 0.5503669112741828, "grad_norm": 4.775393486022949, "step": 1650 }, { "epoch": 0.5503669112741828, "learning_rate": 0.00029881862648162695, "step": 1650 }, { "epoch": 0.5503669112741828, "loss": 0.8248935341835022, "step": 1650 }, { "ce_loss": 0.34251272678375244, "epoch": 0.5503669112741828, "step": 1650 }, { "distill_loss": 0.15160521864891052, "epoch": 0.5503669112741828, "step": 1650 }, { "epoch": 0.5503669112741828, "ref_ce_loss": 0.120022252202034, "step": 1650 }, { "epoch": 0.5503669112741828, "loss": 0.7766433358192444, "step": 1650 }, { "ce_loss": 0.2661416530609131, "epoch": 0.5503669112741828, "step": 1650 }, { "distill_loss": 0.11151763796806335, "epoch": 0.5503669112741828, "step": 1650 }, { "epoch": 0.5503669112741828, "ref_ce_loss": 0.18171195685863495, "step": 1650 }, { "epoch": 0.5503669112741828, "loss": 0.6041874885559082, "step": 1650 }, { "ce_loss": 0.25433608889579773, "epoch": 0.5503669112741828, "step": 1650 }, { "distill_loss": 0.12476801872253418, "epoch": 0.5503669112741828, "step": 1650 }, { "epoch": 0.5503669112741828, "ref_ce_loss": 0.1334761679172516, "step": 1650 }, { "epoch": 0.5503669112741828, "loss": 0.5187742710113525, "step": 1650 }, { "ce_loss": 0.26460695266723633, "epoch": 0.5503669112741828, "step": 1650 }, { "distill_loss": 0.13951659202575684, "epoch": 0.5503669112741828, "step": 1650 }, { "epoch": 0.5503669112741828, "ref_ce_loss": 0.10023458302021027, "step": 1650 }, { "epoch": 0.5537024683122082, "loss": 0.7497, "step": 1660 }, { "epoch": 0.5537024683122082, "grad_norm": 2.310690402984619, "step": 1660 }, { "epoch": 0.5537024683122082, "learning_rate": 0.0002987931183153429, "step": 1660 }, { "epoch": 0.5537024683122082, "loss": 0.5635334253311157, "step": 1660 }, { "ce_loss": 0.2891744375228882, "epoch": 0.5537024683122082, "step": 1660 }, { "distill_loss": 0.14947661757469177, "epoch": 0.5537024683122082, "step": 1660 }, { "epoch": 0.5537024683122082, "ref_ce_loss": 0.12474282830953598, "step": 1660 }, { "epoch": 0.5537024683122082, "loss": 0.8831844329833984, "step": 1660 }, { "ce_loss": 0.41249531507492065, "epoch": 0.5537024683122082, "step": 1660 }, { "distill_loss": 0.16750311851501465, "epoch": 0.5537024683122082, "step": 1660 }, { "epoch": 0.5537024683122082, "ref_ce_loss": 0.19170571863651276, "step": 1660 }, { "epoch": 0.5537024683122082, "loss": 0.8272402882575989, "step": 1660 }, { "ce_loss": 0.3753838539123535, "epoch": 0.5537024683122082, "step": 1660 }, { "distill_loss": 0.15682774782180786, "epoch": 0.5537024683122082, "step": 1660 }, { "epoch": 0.5537024683122082, "ref_ce_loss": 0.128701314330101, "step": 1660 }, { "epoch": 0.5537024683122082, "loss": 0.8349854946136475, "step": 1660 }, { "ce_loss": 0.24851606786251068, "epoch": 0.5537024683122082, "step": 1660 }, { "distill_loss": 0.10116152465343475, "epoch": 0.5537024683122082, "step": 1660 }, { "epoch": 0.5537024683122082, "ref_ce_loss": 0.22549472749233246, "step": 1660 }, { "epoch": 0.5570380253502335, "loss": 0.8069, "step": 1670 }, { "epoch": 0.5570380253502335, "grad_norm": 3.4684464931488037, "step": 1670 }, { "epoch": 0.5570380253502335, "learning_rate": 0.00029876733880910525, "step": 1670 }, { "epoch": 0.5570380253502335, "loss": 0.5241039991378784, "step": 1670 }, { "ce_loss": 0.2068227082490921, "epoch": 0.5570380253502335, "step": 1670 }, { "distill_loss": 0.10971537977457047, "epoch": 0.5570380253502335, "step": 1670 }, { "epoch": 0.5570380253502335, "ref_ce_loss": 0.14565956592559814, "step": 1670 }, { "epoch": 0.5570380253502335, "loss": 0.5024033784866333, "step": 1670 }, { "ce_loss": 0.23021751642227173, "epoch": 0.5570380253502335, "step": 1670 }, { "distill_loss": 0.10708209127187729, "epoch": 0.5570380253502335, "step": 1670 }, { "epoch": 0.5570380253502335, "ref_ce_loss": 0.1649056375026703, "step": 1670 }, { "epoch": 0.5570380253502335, "loss": 0.9969327449798584, "step": 1670 }, { "ce_loss": 0.33395951986312866, "epoch": 0.5570380253502335, "step": 1670 }, { "distill_loss": 0.1525161862373352, "epoch": 0.5570380253502335, "step": 1670 }, { "epoch": 0.5570380253502335, "ref_ce_loss": 0.16089755296707153, "step": 1670 }, { "epoch": 0.5570380253502335, "loss": 0.8883379101753235, "step": 1670 }, { "ce_loss": 0.3117881715297699, "epoch": 0.5570380253502335, "step": 1670 }, { "distill_loss": 0.11904864758253098, "epoch": 0.5570380253502335, "step": 1670 }, { "epoch": 0.5570380253502335, "ref_ce_loss": 0.21285606920719147, "step": 1670 }, { "epoch": 0.5603735823882589, "loss": 0.7748, "step": 1680 }, { "epoch": 0.5603735823882589, "grad_norm": 2.624232292175293, "step": 1680 }, { "epoch": 0.5603735823882589, "learning_rate": 0.00029874128800992547, "step": 1680 }, { "epoch": 0.5603735823882589, "loss": 0.6730021834373474, "step": 1680 }, { "ce_loss": 0.2944241762161255, "epoch": 0.5603735823882589, "step": 1680 }, { "distill_loss": 0.15666545927524567, "epoch": 0.5603735823882589, "step": 1680 }, { "epoch": 0.5603735823882589, "ref_ce_loss": 0.15864133834838867, "step": 1680 }, { "epoch": 0.5603735823882589, "loss": 0.8667658567428589, "step": 1680 }, { "ce_loss": 0.4391118288040161, "epoch": 0.5603735823882589, "step": 1680 }, { "distill_loss": 0.127987802028656, "epoch": 0.5603735823882589, "step": 1680 }, { "epoch": 0.5603735823882589, "ref_ce_loss": 0.21471063792705536, "step": 1680 }, { "epoch": 0.5603735823882589, "loss": 0.6558547616004944, "step": 1680 }, { "ce_loss": 0.3125191628932953, "epoch": 0.5603735823882589, "step": 1680 }, { "distill_loss": 0.11761464178562164, "epoch": 0.5603735823882589, "step": 1680 }, { "epoch": 0.5603735823882589, "ref_ce_loss": 0.17394804954528809, "step": 1680 }, { "epoch": 0.5603735823882589, "loss": 0.8694894313812256, "step": 1680 }, { "ce_loss": 0.38176044821739197, "epoch": 0.5603735823882589, "step": 1680 }, { "distill_loss": 0.13654875755310059, "epoch": 0.5603735823882589, "step": 1680 }, { "epoch": 0.5603735823882589, "ref_ce_loss": 0.20033635199069977, "step": 1680 }, { "epoch": 0.5637091394262842, "loss": 0.8365, "step": 1690 }, { "epoch": 0.5637091394262842, "grad_norm": 3.2551002502441406, "step": 1690 }, { "epoch": 0.5637091394262842, "learning_rate": 0.00029871496596531, "step": 1690 }, { "epoch": 0.5637091394262842, "loss": 1.111056923866272, "step": 1690 }, { "ce_loss": 0.23771966993808746, "epoch": 0.5637091394262842, "step": 1690 }, { "distill_loss": 0.11728285998106003, "epoch": 0.5637091394262842, "step": 1690 }, { "epoch": 0.5637091394262842, "ref_ce_loss": 0.10274969041347504, "step": 1690 }, { "epoch": 0.5637091394262842, "loss": 0.8305438756942749, "step": 1690 }, { "ce_loss": 0.33233582973480225, "epoch": 0.5637091394262842, "step": 1690 }, { "distill_loss": 0.1357385665178299, "epoch": 0.5637091394262842, "step": 1690 }, { "epoch": 0.5637091394262842, "ref_ce_loss": 0.118039071559906, "step": 1690 }, { "epoch": 0.5637091394262842, "loss": 0.844835638999939, "step": 1690 }, { "ce_loss": 0.25645163655281067, "epoch": 0.5637091394262842, "step": 1690 }, { "distill_loss": 0.10402382165193558, "epoch": 0.5637091394262842, "step": 1690 }, { "epoch": 0.5637091394262842, "ref_ce_loss": 0.1545470505952835, "step": 1690 }, { "epoch": 0.5637091394262842, "loss": 0.615341067314148, "step": 1690 }, { "ce_loss": 0.24444253742694855, "epoch": 0.5637091394262842, "step": 1690 }, { "distill_loss": 0.09630996733903885, "epoch": 0.5637091394262842, "step": 1690 }, { "epoch": 0.5637091394262842, "ref_ce_loss": 0.12920692563056946, "step": 1690 }, { "epoch": 0.5670446964643095, "loss": 0.7819, "step": 1700 }, { "epoch": 0.5670446964643095, "grad_norm": 2.2229459285736084, "step": 1700 }, { "epoch": 0.5670446964643095, "learning_rate": 0.00029868837272325994, "step": 1700 }, { "epoch": 0.5670446964643095, "loss": 0.6957921981811523, "step": 1700 }, { "ce_loss": 0.1739863008260727, "epoch": 0.5670446964643095, "step": 1700 }, { "distill_loss": 0.10056845843791962, "epoch": 0.5670446964643095, "step": 1700 }, { "epoch": 0.5670446964643095, "ref_ce_loss": 0.12460286170244217, "step": 1700 }, { "epoch": 0.5670446964643095, "loss": 0.725322425365448, "step": 1700 }, { "ce_loss": 0.3335936963558197, "epoch": 0.5670446964643095, "step": 1700 }, { "distill_loss": 0.12524354457855225, "epoch": 0.5670446964643095, "step": 1700 }, { "epoch": 0.5670446964643095, "ref_ce_loss": 0.15567870438098907, "step": 1700 }, { "epoch": 0.5670446964643095, "loss": 0.6163183450698853, "step": 1700 }, { "ce_loss": 0.24315491318702698, "epoch": 0.5670446964643095, "step": 1700 }, { "distill_loss": 0.1256161630153656, "epoch": 0.5670446964643095, "step": 1700 }, { "epoch": 0.5670446964643095, "ref_ce_loss": 0.1663728654384613, "step": 1700 }, { "epoch": 0.5670446964643095, "loss": 1.0663681030273438, "step": 1700 }, { "ce_loss": 0.2814297378063202, "epoch": 0.5670446964643095, "step": 1700 }, { "distill_loss": 0.13603630661964417, "epoch": 0.5670446964643095, "step": 1700 }, { "epoch": 0.5670446964643095, "ref_ce_loss": 0.1546018123626709, "step": 1700 }, { "epoch": 0.5703802535023349, "loss": 0.7102, "step": 1710 }, { "epoch": 0.5703802535023349, "grad_norm": 2.1799120903015137, "step": 1710 }, { "epoch": 0.5703802535023349, "learning_rate": 0.0002986615083322708, "step": 1710 }, { "epoch": 0.5703802535023349, "loss": 0.7568995952606201, "step": 1710 }, { "ce_loss": 0.3033449053764343, "epoch": 0.5703802535023349, "step": 1710 }, { "distill_loss": 0.10636651515960693, "epoch": 0.5703802535023349, "step": 1710 }, { "epoch": 0.5703802535023349, "ref_ce_loss": 0.1264645904302597, "step": 1710 }, { "epoch": 0.5703802535023349, "loss": 0.6913808584213257, "step": 1710 }, { "ce_loss": 0.2926919460296631, "epoch": 0.5703802535023349, "step": 1710 }, { "distill_loss": 0.14209294319152832, "epoch": 0.5703802535023349, "step": 1710 }, { "epoch": 0.5703802535023349, "ref_ce_loss": 0.16757725179195404, "step": 1710 }, { "epoch": 0.5703802535023349, "loss": 0.5459708571434021, "step": 1710 }, { "ce_loss": 0.2636614441871643, "epoch": 0.5703802535023349, "step": 1710 }, { "distill_loss": 0.14287355542182922, "epoch": 0.5703802535023349, "step": 1710 }, { "epoch": 0.5703802535023349, "ref_ce_loss": 0.13888999819755554, "step": 1710 }, { "epoch": 0.5703802535023349, "loss": 1.1922392845153809, "step": 1710 }, { "ce_loss": 0.22226674854755402, "epoch": 0.5703802535023349, "step": 1710 }, { "distill_loss": 0.10548727214336395, "epoch": 0.5703802535023349, "step": 1710 }, { "epoch": 0.5703802535023349, "ref_ce_loss": 0.12790684401988983, "step": 1710 }, { "epoch": 0.5737158105403602, "loss": 0.8126, "step": 1720 }, { "epoch": 0.5737158105403602, "grad_norm": 3.163691759109497, "step": 1720 }, { "epoch": 0.5737158105403602, "learning_rate": 0.0002986343728413326, "step": 1720 }, { "epoch": 0.5737158105403602, "loss": 0.9181846380233765, "step": 1720 }, { "ce_loss": 0.3268508017063141, "epoch": 0.5737158105403602, "step": 1720 }, { "distill_loss": 0.13751642405986786, "epoch": 0.5737158105403602, "step": 1720 }, { "epoch": 0.5737158105403602, "ref_ce_loss": 0.16825208067893982, "step": 1720 }, { "epoch": 0.5737158105403602, "loss": 0.8629528284072876, "step": 1720 }, { "ce_loss": 0.27521952986717224, "epoch": 0.5737158105403602, "step": 1720 }, { "distill_loss": 0.14840242266654968, "epoch": 0.5737158105403602, "step": 1720 }, { "epoch": 0.5737158105403602, "ref_ce_loss": 0.11532442271709442, "step": 1720 }, { "epoch": 0.5737158105403602, "loss": 0.5458701848983765, "step": 1720 }, { "ce_loss": 0.24639998376369476, "epoch": 0.5737158105403602, "step": 1720 }, { "distill_loss": 0.13376955687999725, "epoch": 0.5737158105403602, "step": 1720 }, { "epoch": 0.5737158105403602, "ref_ce_loss": 0.16517247259616852, "step": 1720 }, { "epoch": 0.5737158105403602, "loss": 0.6869808435440063, "step": 1720 }, { "ce_loss": 0.254602313041687, "epoch": 0.5737158105403602, "step": 1720 }, { "distill_loss": 0.13055302202701569, "epoch": 0.5737158105403602, "step": 1720 }, { "epoch": 0.5737158105403602, "ref_ce_loss": 0.152826726436615, "step": 1720 }, { "epoch": 0.5770513675783856, "loss": 0.8095, "step": 1730 }, { "epoch": 0.5770513675783856, "grad_norm": 5.802720546722412, "step": 1730 }, { "epoch": 0.5770513675783856, "learning_rate": 0.0002986069662999298, "step": 1730 }, { "epoch": 0.5770513675783856, "loss": 0.8158540725708008, "step": 1730 }, { "ce_loss": 0.2759875953197479, "epoch": 0.5770513675783856, "step": 1730 }, { "distill_loss": 0.151884526014328, "epoch": 0.5770513675783856, "step": 1730 }, { "epoch": 0.5770513675783856, "ref_ce_loss": 0.1573365181684494, "step": 1730 }, { "epoch": 0.5770513675783856, "loss": 0.932378888130188, "step": 1730 }, { "ce_loss": 0.3446933329105377, "epoch": 0.5770513675783856, "step": 1730 }, { "distill_loss": 0.1564919650554657, "epoch": 0.5770513675783856, "step": 1730 }, { "epoch": 0.5770513675783856, "ref_ce_loss": 0.16395673155784607, "step": 1730 }, { "epoch": 0.5770513675783856, "loss": 0.7885000705718994, "step": 1730 }, { "ce_loss": 0.26280203461647034, "epoch": 0.5770513675783856, "step": 1730 }, { "distill_loss": 0.134730726480484, "epoch": 0.5770513675783856, "step": 1730 }, { "epoch": 0.5770513675783856, "ref_ce_loss": 0.12668967247009277, "step": 1730 }, { "epoch": 0.5770513675783856, "loss": 0.7057376503944397, "step": 1730 }, { "ce_loss": 0.29862749576568604, "epoch": 0.5770513675783856, "step": 1730 }, { "distill_loss": 0.1523112654685974, "epoch": 0.5770513675783856, "step": 1730 }, { "epoch": 0.5770513675783856, "ref_ce_loss": 0.18865908682346344, "step": 1730 }, { "epoch": 0.580386924616411, "loss": 0.8146, "step": 1740 }, { "epoch": 0.580386924616411, "grad_norm": 2.1819822788238525, "step": 1740 }, { "epoch": 0.580386924616411, "learning_rate": 0.0002985792887580412, "step": 1740 }, { "epoch": 0.580386924616411, "loss": 0.4795108139514923, "step": 1740 }, { "ce_loss": 0.1699635088443756, "epoch": 0.580386924616411, "step": 1740 }, { "distill_loss": 0.12625162303447723, "epoch": 0.580386924616411, "step": 1740 }, { "epoch": 0.580386924616411, "ref_ce_loss": 0.1395426243543625, "step": 1740 }, { "epoch": 0.580386924616411, "loss": 1.1031620502471924, "step": 1740 }, { "ce_loss": 0.2764243185520172, "epoch": 0.580386924616411, "step": 1740 }, { "distill_loss": 0.16979770362377167, "epoch": 0.580386924616411, "step": 1740 }, { "epoch": 0.580386924616411, "ref_ce_loss": 0.22301478683948517, "step": 1740 }, { "epoch": 0.580386924616411, "loss": 0.7303873896598816, "step": 1740 }, { "ce_loss": 0.3002098500728607, "epoch": 0.580386924616411, "step": 1740 }, { "distill_loss": 0.15270881354808807, "epoch": 0.580386924616411, "step": 1740 }, { "epoch": 0.580386924616411, "ref_ce_loss": 0.17616495490074158, "step": 1740 }, { "epoch": 0.580386924616411, "loss": 0.7694774866104126, "step": 1740 }, { "ce_loss": 0.30607903003692627, "epoch": 0.580386924616411, "step": 1740 }, { "distill_loss": 0.13875719904899597, "epoch": 0.580386924616411, "step": 1740 }, { "epoch": 0.580386924616411, "ref_ce_loss": 0.20177951455116272, "step": 1740 }, { "epoch": 0.5837224816544363, "loss": 0.8148, "step": 1750 }, { "epoch": 0.5837224816544363, "grad_norm": 2.0567026138305664, "step": 1750 }, { "epoch": 0.5837224816544363, "learning_rate": 0.00029855134026613963, "step": 1750 }, { "epoch": 0.5837224816544363, "loss": 0.6456379294395447, "step": 1750 }, { "ce_loss": 0.2544163763523102, "epoch": 0.5837224816544363, "step": 1750 }, { "distill_loss": 0.15112867951393127, "epoch": 0.5837224816544363, "step": 1750 }, { "epoch": 0.5837224816544363, "ref_ce_loss": 0.13789910078048706, "step": 1750 }, { "epoch": 0.5837224816544363, "loss": 0.6710749864578247, "step": 1750 }, { "ce_loss": 0.22394444048404694, "epoch": 0.5837224816544363, "step": 1750 }, { "distill_loss": 0.13627192378044128, "epoch": 0.5837224816544363, "step": 1750 }, { "epoch": 0.5837224816544363, "ref_ce_loss": 0.21288058161735535, "step": 1750 }, { "epoch": 0.5837224816544363, "loss": 0.8853602409362793, "step": 1750 }, { "ce_loss": 0.33082449436187744, "epoch": 0.5837224816544363, "step": 1750 }, { "distill_loss": 0.17533904314041138, "epoch": 0.5837224816544363, "step": 1750 }, { "epoch": 0.5837224816544363, "ref_ce_loss": 0.23648829758167267, "step": 1750 }, { "epoch": 0.5837224816544363, "loss": 0.6773439645767212, "step": 1750 }, { "ce_loss": 0.258245587348938, "epoch": 0.5837224816544363, "step": 1750 }, { "distill_loss": 0.1750703752040863, "epoch": 0.5837224816544363, "step": 1750 }, { "epoch": 0.5837224816544363, "ref_ce_loss": 0.16324280202388763, "step": 1750 }, { "epoch": 0.5870580386924616, "loss": 0.7823, "step": 1760 }, { "epoch": 0.5870580386924616, "grad_norm": 3.1371705532073975, "step": 1760 }, { "epoch": 0.5870580386924616, "learning_rate": 0.0002985231208751921, "step": 1760 }, { "epoch": 0.5870580386924616, "loss": 1.2690577507019043, "step": 1760 }, { "ce_loss": 0.36192673444747925, "epoch": 0.5870580386924616, "step": 1760 }, { "distill_loss": 0.1863507330417633, "epoch": 0.5870580386924616, "step": 1760 }, { "epoch": 0.5870580386924616, "ref_ce_loss": 0.1751195639371872, "step": 1760 }, { "epoch": 0.5870580386924616, "loss": 0.8799837231636047, "step": 1760 }, { "ce_loss": 0.349637895822525, "epoch": 0.5870580386924616, "step": 1760 }, { "distill_loss": 0.1971992552280426, "epoch": 0.5870580386924616, "step": 1760 }, { "epoch": 0.5870580386924616, "ref_ce_loss": 0.08908438682556152, "step": 1760 }, { "epoch": 0.5870580386924616, "loss": 0.8331767916679382, "step": 1760 }, { "ce_loss": 0.32722359895706177, "epoch": 0.5870580386924616, "step": 1760 }, { "distill_loss": 0.17739370465278625, "epoch": 0.5870580386924616, "step": 1760 }, { "epoch": 0.5870580386924616, "ref_ce_loss": 0.17605829238891602, "step": 1760 }, { "epoch": 0.5870580386924616, "loss": 0.6821324825286865, "step": 1760 }, { "ce_loss": 0.28926607966423035, "epoch": 0.5870580386924616, "step": 1760 }, { "distill_loss": 0.20537421107292175, "epoch": 0.5870580386924616, "step": 1760 }, { "epoch": 0.5870580386924616, "ref_ce_loss": 0.12078510224819183, "step": 1760 }, { "epoch": 0.590393595730487, "loss": 0.8654, "step": 1770 }, { "epoch": 0.590393595730487, "grad_norm": 4.211384296417236, "step": 1770 }, { "epoch": 0.590393595730487, "learning_rate": 0.00029849463063665965, "step": 1770 }, { "epoch": 0.590393595730487, "loss": 0.622290849685669, "step": 1770 }, { "ce_loss": 0.2384355068206787, "epoch": 0.590393595730487, "step": 1770 }, { "distill_loss": 0.14726680517196655, "epoch": 0.590393595730487, "step": 1770 }, { "epoch": 0.590393595730487, "ref_ce_loss": 0.18940134346485138, "step": 1770 }, { "epoch": 0.590393595730487, "loss": 0.5565788149833679, "step": 1770 }, { "ce_loss": 0.2837248146533966, "epoch": 0.590393595730487, "step": 1770 }, { "distill_loss": 0.1558532416820526, "epoch": 0.590393595730487, "step": 1770 }, { "epoch": 0.590393595730487, "ref_ce_loss": 0.11656032502651215, "step": 1770 }, { "epoch": 0.590393595730487, "loss": 1.1380963325500488, "step": 1770 }, { "ce_loss": 0.41929349303245544, "epoch": 0.590393595730487, "step": 1770 }, { "distill_loss": 0.2181171178817749, "epoch": 0.590393595730487, "step": 1770 }, { "epoch": 0.590393595730487, "ref_ce_loss": 0.17369946837425232, "step": 1770 }, { "epoch": 0.590393595730487, "loss": 0.6213510632514954, "step": 1770 }, { "ce_loss": 0.24765095114707947, "epoch": 0.590393595730487, "step": 1770 }, { "distill_loss": 0.193634033203125, "epoch": 0.590393595730487, "step": 1770 }, { "epoch": 0.590393595730487, "ref_ce_loss": 0.12431634962558746, "step": 1770 }, { "epoch": 0.5937291527685123, "loss": 0.8174, "step": 1780 }, { "epoch": 0.5937291527685123, "grad_norm": 2.910914659500122, "step": 1780 }, { "epoch": 0.5937291527685123, "learning_rate": 0.00029846586960249736, "step": 1780 }, { "epoch": 0.5937291527685123, "loss": 1.329070806503296, "step": 1780 }, { "ce_loss": 0.19489771127700806, "epoch": 0.5937291527685123, "step": 1780 }, { "distill_loss": 0.1673344075679779, "epoch": 0.5937291527685123, "step": 1780 }, { "epoch": 0.5937291527685123, "ref_ce_loss": 0.1715712547302246, "step": 1780 }, { "epoch": 0.5937291527685123, "loss": 0.7473481297492981, "step": 1780 }, { "ce_loss": 0.2816266715526581, "epoch": 0.5937291527685123, "step": 1780 }, { "distill_loss": 0.20028121769428253, "epoch": 0.5937291527685123, "step": 1780 }, { "epoch": 0.5937291527685123, "ref_ce_loss": 0.13073772192001343, "step": 1780 }, { "epoch": 0.5937291527685123, "loss": 1.6454558372497559, "step": 1780 }, { "ce_loss": 0.4495353102684021, "epoch": 0.5937291527685123, "step": 1780 }, { "distill_loss": 0.18965190649032593, "epoch": 0.5937291527685123, "step": 1780 }, { "epoch": 0.5937291527685123, "ref_ce_loss": 0.19479434192180634, "step": 1780 }, { "epoch": 0.5937291527685123, "loss": 0.7187461256980896, "step": 1780 }, { "ce_loss": 0.2073751837015152, "epoch": 0.5937291527685123, "step": 1780 }, { "distill_loss": 0.18652737140655518, "epoch": 0.5937291527685123, "step": 1780 }, { "epoch": 0.5937291527685123, "ref_ce_loss": 0.12575000524520874, "step": 1780 }, { "epoch": 0.5970647098065377, "loss": 0.8508, "step": 1790 }, { "epoch": 0.5970647098065377, "grad_norm": 2.6389963626861572, "step": 1790 }, { "epoch": 0.5970647098065377, "learning_rate": 0.0002984368378251539, "step": 1790 }, { "epoch": 0.5970647098065377, "loss": 0.9331235885620117, "step": 1790 }, { "ce_loss": 0.3901205360889435, "epoch": 0.5970647098065377, "step": 1790 }, { "distill_loss": 0.26576167345046997, "epoch": 0.5970647098065377, "step": 1790 }, { "epoch": 0.5970647098065377, "ref_ce_loss": 0.15986163914203644, "step": 1790 }, { "epoch": 0.5970647098065377, "loss": 0.6947316527366638, "step": 1790 }, { "ce_loss": 0.21353313326835632, "epoch": 0.5970647098065377, "step": 1790 }, { "distill_loss": 0.2191469818353653, "epoch": 0.5970647098065377, "step": 1790 }, { "epoch": 0.5970647098065377, "ref_ce_loss": 0.15555934607982635, "step": 1790 }, { "epoch": 0.5970647098065377, "loss": 0.8559675216674805, "step": 1790 }, { "ce_loss": 0.3997640013694763, "epoch": 0.5970647098065377, "step": 1790 }, { "distill_loss": 0.20023198425769806, "epoch": 0.5970647098065377, "step": 1790 }, { "epoch": 0.5970647098065377, "ref_ce_loss": 0.2548025846481323, "step": 1790 }, { "epoch": 0.5970647098065377, "loss": 0.5783416032791138, "step": 1790 }, { "ce_loss": 0.23126862943172455, "epoch": 0.5970647098065377, "step": 1790 }, { "distill_loss": 0.19612915813922882, "epoch": 0.5970647098065377, "step": 1790 }, { "epoch": 0.5970647098065377, "ref_ce_loss": 0.1506781429052353, "step": 1790 }, { "epoch": 0.600400266844563, "loss": 0.8459, "step": 1800 }, { "epoch": 0.600400266844563, "grad_norm": 2.6187963485717773, "step": 1800 }, { "epoch": 0.600400266844563, "learning_rate": 0.0002984075353575718, "step": 1800 }, { "epoch": 0.600400266844563, "loss": 0.7321758270263672, "step": 1800 }, { "ce_loss": 0.2416120320558548, "epoch": 0.600400266844563, "step": 1800 }, { "distill_loss": 0.2531052231788635, "epoch": 0.600400266844563, "step": 1800 }, { "epoch": 0.600400266844563, "ref_ce_loss": 0.1594499945640564, "step": 1800 }, { "epoch": 0.600400266844563, "loss": 1.1592686176300049, "step": 1800 }, { "ce_loss": 0.24422048032283783, "epoch": 0.600400266844563, "step": 1800 }, { "distill_loss": 0.29225999116897583, "epoch": 0.600400266844563, "step": 1800 }, { "epoch": 0.600400266844563, "ref_ce_loss": 0.17981497943401337, "step": 1800 }, { "epoch": 0.600400266844563, "loss": 0.8788959980010986, "step": 1800 }, { "ce_loss": 0.331389844417572, "epoch": 0.600400266844563, "step": 1800 }, { "distill_loss": 0.30712389945983887, "epoch": 0.600400266844563, "step": 1800 }, { "epoch": 0.600400266844563, "ref_ce_loss": 0.13194140791893005, "step": 1800 }, { "epoch": 0.600400266844563, "loss": 0.7174859046936035, "step": 1800 }, { "ce_loss": 0.26181110739707947, "epoch": 0.600400266844563, "step": 1800 }, { "distill_loss": 0.25586360692977905, "epoch": 0.600400266844563, "step": 1800 }, { "epoch": 0.600400266844563, "ref_ce_loss": 0.1297745257616043, "step": 1800 }, { "epoch": 0.6037358238825884, "loss": 0.9145, "step": 1810 }, { "epoch": 0.6037358238825884, "grad_norm": 2.9618608951568604, "step": 1810 }, { "epoch": 0.6037358238825884, "learning_rate": 0.00029837796225318713, "step": 1810 }, { "epoch": 0.6037358238825884, "loss": 0.7965373992919922, "step": 1810 }, { "ce_loss": 0.2630294859409332, "epoch": 0.6037358238825884, "step": 1810 }, { "distill_loss": 0.25887030363082886, "epoch": 0.6037358238825884, "step": 1810 }, { "epoch": 0.6037358238825884, "ref_ce_loss": 0.12762102484703064, "step": 1810 }, { "epoch": 0.6037358238825884, "loss": 1.3940396308898926, "step": 1810 }, { "ce_loss": 0.4177006781101227, "epoch": 0.6037358238825884, "step": 1810 }, { "distill_loss": 0.2584906816482544, "epoch": 0.6037358238825884, "step": 1810 }, { "epoch": 0.6037358238825884, "ref_ce_loss": 0.22328118979930878, "step": 1810 }, { "epoch": 0.6037358238825884, "loss": 0.7931407690048218, "step": 1810 }, { "ce_loss": 0.3327951729297638, "epoch": 0.6037358238825884, "step": 1810 }, { "distill_loss": 0.24718835949897766, "epoch": 0.6037358238825884, "step": 1810 }, { "epoch": 0.6037358238825884, "ref_ce_loss": 0.21300888061523438, "step": 1810 }, { "epoch": 0.6037358238825884, "loss": 1.0378825664520264, "step": 1810 }, { "ce_loss": 0.34166228771209717, "epoch": 0.6037358238825884, "step": 1810 }, { "distill_loss": 0.2605835795402527, "epoch": 0.6037358238825884, "step": 1810 }, { "epoch": 0.6037358238825884, "ref_ce_loss": 0.14847081899642944, "step": 1810 }, { "epoch": 0.6070713809206137, "loss": 0.8354, "step": 1820 }, { "epoch": 0.6070713809206137, "grad_norm": 2.209144353866577, "step": 1820 }, { "epoch": 0.6070713809206137, "learning_rate": 0.00029834811856592974, "step": 1820 }, { "epoch": 0.6070713809206137, "loss": 1.0870418548583984, "step": 1820 }, { "ce_loss": 0.3245229125022888, "epoch": 0.6070713809206137, "step": 1820 }, { "distill_loss": 0.20548105239868164, "epoch": 0.6070713809206137, "step": 1820 }, { "epoch": 0.6070713809206137, "ref_ce_loss": 0.15288129448890686, "step": 1820 }, { "epoch": 0.6070713809206137, "loss": 1.2108877897262573, "step": 1820 }, { "ce_loss": 0.2942400276660919, "epoch": 0.6070713809206137, "step": 1820 }, { "distill_loss": 0.18730758130550385, "epoch": 0.6070713809206137, "step": 1820 }, { "epoch": 0.6070713809206137, "ref_ce_loss": 0.12936684489250183, "step": 1820 }, { "epoch": 0.6070713809206137, "loss": 0.8238207101821899, "step": 1820 }, { "ce_loss": 0.26211613416671753, "epoch": 0.6070713809206137, "step": 1820 }, { "distill_loss": 0.15576207637786865, "epoch": 0.6070713809206137, "step": 1820 }, { "epoch": 0.6070713809206137, "ref_ce_loss": 0.1929386556148529, "step": 1820 }, { "epoch": 0.6070713809206137, "loss": 1.1510733366012573, "step": 1820 }, { "ce_loss": 0.31120988726615906, "epoch": 0.6070713809206137, "step": 1820 }, { "distill_loss": 0.15629993379116058, "epoch": 0.6070713809206137, "step": 1820 }, { "epoch": 0.6070713809206137, "ref_ce_loss": 0.20821838080883026, "step": 1820 }, { "epoch": 0.6104069379586391, "loss": 0.8131, "step": 1830 }, { "epoch": 0.6104069379586391, "grad_norm": 2.3518996238708496, "step": 1830 }, { "epoch": 0.6104069379586391, "learning_rate": 0.0002983180043502226, "step": 1830 }, { "epoch": 0.6104069379586391, "loss": 0.9673494100570679, "step": 1830 }, { "ce_loss": 0.2245025634765625, "epoch": 0.6104069379586391, "step": 1830 }, { "distill_loss": 0.15006595849990845, "epoch": 0.6104069379586391, "step": 1830 }, { "epoch": 0.6104069379586391, "ref_ce_loss": 0.14450837671756744, "step": 1830 }, { "epoch": 0.6104069379586391, "loss": 1.431778907775879, "step": 1830 }, { "ce_loss": 0.3772394061088562, "epoch": 0.6104069379586391, "step": 1830 }, { "distill_loss": 0.18321493268013, "epoch": 0.6104069379586391, "step": 1830 }, { "epoch": 0.6104069379586391, "ref_ce_loss": 0.2277681529521942, "step": 1830 }, { "epoch": 0.6104069379586391, "loss": 0.620803713798523, "step": 1830 }, { "ce_loss": 0.30052056908607483, "epoch": 0.6104069379586391, "step": 1830 }, { "distill_loss": 0.18863649666309357, "epoch": 0.6104069379586391, "step": 1830 }, { "epoch": 0.6104069379586391, "ref_ce_loss": 0.13118411600589752, "step": 1830 }, { "epoch": 0.6104069379586391, "loss": 0.9219392538070679, "step": 1830 }, { "ce_loss": 0.3350750207901001, "epoch": 0.6104069379586391, "step": 1830 }, { "distill_loss": 0.19591687619686127, "epoch": 0.6104069379586391, "step": 1830 }, { "epoch": 0.6104069379586391, "ref_ce_loss": 0.18633542954921722, "step": 1830 }, { "epoch": 0.6137424949966644, "loss": 0.8355, "step": 1840 }, { "epoch": 0.6137424949966644, "grad_norm": 2.3891096115112305, "step": 1840 }, { "epoch": 0.6137424949966644, "learning_rate": 0.0002982876196609822, "step": 1840 }, { "epoch": 0.6137424949966644, "loss": 0.719983696937561, "step": 1840 }, { "ce_loss": 0.25666162371635437, "epoch": 0.6137424949966644, "step": 1840 }, { "distill_loss": 0.12700149416923523, "epoch": 0.6137424949966644, "step": 1840 }, { "epoch": 0.6137424949966644, "ref_ce_loss": 0.12065713107585907, "step": 1840 }, { "epoch": 0.6137424949966644, "loss": 0.9144517183303833, "step": 1840 }, { "ce_loss": 0.3790566921234131, "epoch": 0.6137424949966644, "step": 1840 }, { "distill_loss": 0.1404389590024948, "epoch": 0.6137424949966644, "step": 1840 }, { "epoch": 0.6137424949966644, "ref_ce_loss": 0.19421830773353577, "step": 1840 }, { "epoch": 0.6137424949966644, "loss": 0.6303397417068481, "step": 1840 }, { "ce_loss": 0.35228684544563293, "epoch": 0.6137424949966644, "step": 1840 }, { "distill_loss": 0.12840238213539124, "epoch": 0.6137424949966644, "step": 1840 }, { "epoch": 0.6137424949966644, "ref_ce_loss": 0.14961020648479462, "step": 1840 }, { "epoch": 0.6137424949966644, "loss": 0.6458494663238525, "step": 1840 }, { "ce_loss": 0.3226067125797272, "epoch": 0.6137424949966644, "step": 1840 }, { "distill_loss": 0.14511506259441376, "epoch": 0.6137424949966644, "step": 1840 }, { "epoch": 0.6137424949966644, "ref_ce_loss": 0.1088472381234169, "step": 1840 }, { "epoch": 0.6170780520346898, "loss": 0.827, "step": 1850 }, { "epoch": 0.6170780520346898, "grad_norm": 2.2725181579589844, "step": 1850 }, { "epoch": 0.6170780520346898, "learning_rate": 0.00029825696455361824, "step": 1850 }, { "epoch": 0.6170780520346898, "loss": 0.9817647933959961, "step": 1850 }, { "ce_loss": 0.30564960837364197, "epoch": 0.6170780520346898, "step": 1850 }, { "distill_loss": 0.15042169392108917, "epoch": 0.6170780520346898, "step": 1850 }, { "epoch": 0.6170780520346898, "ref_ce_loss": 0.18882113695144653, "step": 1850 }, { "epoch": 0.6170780520346898, "loss": 0.8808572292327881, "step": 1850 }, { "ce_loss": 0.4024255871772766, "epoch": 0.6170780520346898, "step": 1850 }, { "distill_loss": 0.14410771429538727, "epoch": 0.6170780520346898, "step": 1850 }, { "epoch": 0.6170780520346898, "ref_ce_loss": 0.18836361169815063, "step": 1850 }, { "epoch": 0.6170780520346898, "loss": 0.585471510887146, "step": 1850 }, { "ce_loss": 0.30692368745803833, "epoch": 0.6170780520346898, "step": 1850 }, { "distill_loss": 0.1264827847480774, "epoch": 0.6170780520346898, "step": 1850 }, { "epoch": 0.6170780520346898, "ref_ce_loss": 0.14431552588939667, "step": 1850 }, { "epoch": 0.6170780520346898, "loss": 0.8242551684379578, "step": 1850 }, { "ce_loss": 0.34889402985572815, "epoch": 0.6170780520346898, "step": 1850 }, { "distill_loss": 0.14129848778247833, "epoch": 0.6170780520346898, "step": 1850 }, { "epoch": 0.6170780520346898, "ref_ce_loss": 0.1428593099117279, "step": 1850 }, { "epoch": 0.6204136090727151, "loss": 0.7225, "step": 1860 }, { "epoch": 0.6204136090727151, "grad_norm": 2.643566846847534, "step": 1860 }, { "epoch": 0.6204136090727151, "learning_rate": 0.0002982260390840335, "step": 1860 }, { "epoch": 0.6204136090727151, "loss": 0.7219395637512207, "step": 1860 }, { "ce_loss": 0.30792996287345886, "epoch": 0.6204136090727151, "step": 1860 }, { "distill_loss": 0.12749861180782318, "epoch": 0.6204136090727151, "step": 1860 }, { "epoch": 0.6204136090727151, "ref_ce_loss": 0.1780974119901657, "step": 1860 }, { "epoch": 0.6204136090727151, "loss": 0.5392085909843445, "step": 1860 }, { "ce_loss": 0.21018683910369873, "epoch": 0.6204136090727151, "step": 1860 }, { "distill_loss": 0.11033669859170914, "epoch": 0.6204136090727151, "step": 1860 }, { "epoch": 0.6204136090727151, "ref_ce_loss": 0.14142484962940216, "step": 1860 }, { "epoch": 0.6204136090727151, "loss": 1.3683431148529053, "step": 1860 }, { "ce_loss": 0.28953924775123596, "epoch": 0.6204136090727151, "step": 1860 }, { "distill_loss": 0.12735016644001007, "epoch": 0.6204136090727151, "step": 1860 }, { "epoch": 0.6204136090727151, "ref_ce_loss": 0.16517490148544312, "step": 1860 }, { "epoch": 0.6204136090727151, "loss": 0.9593379497528076, "step": 1860 }, { "ce_loss": 0.27006015181541443, "epoch": 0.6204136090727151, "step": 1860 }, { "distill_loss": 0.10672731697559357, "epoch": 0.6204136090727151, "step": 1860 }, { "epoch": 0.6204136090727151, "ref_ce_loss": 0.14056552946567535, "step": 1860 }, { "epoch": 0.6237491661107405, "loss": 0.8208, "step": 1870 }, { "epoch": 0.6237491661107405, "grad_norm": 2.525805711746216, "step": 1870 }, { "epoch": 0.6237491661107405, "learning_rate": 0.00029819484330862394, "step": 1870 }, { "epoch": 0.6237491661107405, "loss": 0.5869095921516418, "step": 1870 }, { "ce_loss": 0.27092117071151733, "epoch": 0.6237491661107405, "step": 1870 }, { "distill_loss": 0.11845798045396805, "epoch": 0.6237491661107405, "step": 1870 }, { "epoch": 0.6237491661107405, "ref_ce_loss": 0.11767851561307907, "step": 1870 }, { "epoch": 0.6237491661107405, "loss": 0.4167972505092621, "step": 1870 }, { "ce_loss": 0.13532961905002594, "epoch": 0.6237491661107405, "step": 1870 }, { "distill_loss": 0.09994292259216309, "epoch": 0.6237491661107405, "step": 1870 }, { "epoch": 0.6237491661107405, "ref_ce_loss": 0.1033366397023201, "step": 1870 }, { "epoch": 0.6237491661107405, "loss": 0.718509316444397, "step": 1870 }, { "ce_loss": 0.3200066089630127, "epoch": 0.6237491661107405, "step": 1870 }, { "distill_loss": 0.15193799138069153, "epoch": 0.6237491661107405, "step": 1870 }, { "epoch": 0.6237491661107405, "ref_ce_loss": 0.15644268691539764, "step": 1870 }, { "epoch": 0.6237491661107405, "loss": 1.9445092678070068, "step": 1870 }, { "ce_loss": 0.27009880542755127, "epoch": 0.6237491661107405, "step": 1870 }, { "distill_loss": 0.11126744747161865, "epoch": 0.6237491661107405, "step": 1870 }, { "epoch": 0.6237491661107405, "ref_ce_loss": 0.21282550692558289, "step": 1870 }, { "epoch": 0.6270847231487658, "loss": 0.7884, "step": 1880 }, { "epoch": 0.6270847231487658, "grad_norm": 3.6953253746032715, "step": 1880 }, { "epoch": 0.6270847231487658, "learning_rate": 0.0002981633772842782, "step": 1880 }, { "epoch": 0.6270847231487658, "loss": 0.6420892477035522, "step": 1880 }, { "ce_loss": 0.36057618260383606, "epoch": 0.6270847231487658, "step": 1880 }, { "distill_loss": 0.1327221393585205, "epoch": 0.6270847231487658, "step": 1880 }, { "epoch": 0.6270847231487658, "ref_ce_loss": 0.14876747131347656, "step": 1880 }, { "epoch": 0.6270847231487658, "loss": 0.6036034822463989, "step": 1880 }, { "ce_loss": 0.22983922064304352, "epoch": 0.6270847231487658, "step": 1880 }, { "distill_loss": 0.11601034551858902, "epoch": 0.6270847231487658, "step": 1880 }, { "epoch": 0.6270847231487658, "ref_ce_loss": 0.18308429419994354, "step": 1880 }, { "epoch": 0.6270847231487658, "loss": 0.4442928433418274, "step": 1880 }, { "ce_loss": 0.17181488871574402, "epoch": 0.6270847231487658, "step": 1880 }, { "distill_loss": 0.09428546577692032, "epoch": 0.6270847231487658, "step": 1880 }, { "epoch": 0.6270847231487658, "ref_ce_loss": 0.10472335666418076, "step": 1880 }, { "epoch": 0.6270847231487658, "loss": 0.7077046632766724, "step": 1880 }, { "ce_loss": 0.2727658152580261, "epoch": 0.6270847231487658, "step": 1880 }, { "distill_loss": 0.12032405287027359, "epoch": 0.6270847231487658, "step": 1880 }, { "epoch": 0.6270847231487658, "ref_ce_loss": 0.19070985913276672, "step": 1880 }, { "epoch": 0.6304202801867912, "loss": 0.7724, "step": 1890 }, { "epoch": 0.6304202801867912, "grad_norm": 3.7794764041900635, "step": 1890 }, { "epoch": 0.6304202801867912, "learning_rate": 0.00029813164106837805, "step": 1890 }, { "epoch": 0.6304202801867912, "loss": 0.8937748074531555, "step": 1890 }, { "ce_loss": 0.47305163741111755, "epoch": 0.6304202801867912, "step": 1890 }, { "distill_loss": 0.16846346855163574, "epoch": 0.6304202801867912, "step": 1890 }, { "epoch": 0.6304202801867912, "ref_ce_loss": 0.15080559253692627, "step": 1890 }, { "epoch": 0.6304202801867912, "loss": 0.5890235304832458, "step": 1890 }, { "ce_loss": 0.23578114807605743, "epoch": 0.6304202801867912, "step": 1890 }, { "distill_loss": 0.15570217370986938, "epoch": 0.6304202801867912, "step": 1890 }, { "epoch": 0.6304202801867912, "ref_ce_loss": 0.12087680399417877, "step": 1890 }, { "epoch": 0.6304202801867912, "loss": 0.9535101652145386, "step": 1890 }, { "ce_loss": 0.24146515130996704, "epoch": 0.6304202801867912, "step": 1890 }, { "distill_loss": 0.1533411592245102, "epoch": 0.6304202801867912, "step": 1890 }, { "epoch": 0.6304202801867912, "ref_ce_loss": 0.11719386279582977, "step": 1890 }, { "epoch": 0.6304202801867912, "loss": 0.7761451005935669, "step": 1890 }, { "ce_loss": 0.22938808798789978, "epoch": 0.6304202801867912, "step": 1890 }, { "distill_loss": 0.12226255238056183, "epoch": 0.6304202801867912, "step": 1890 }, { "epoch": 0.6304202801867912, "ref_ce_loss": 0.13468782603740692, "step": 1890 }, { "epoch": 0.6337558372248165, "loss": 0.7423, "step": 1900 }, { "epoch": 0.6337558372248165, "grad_norm": 2.9295172691345215, "step": 1900 }, { "epoch": 0.6337558372248165, "learning_rate": 0.0002980996347187977, "step": 1900 }, { "epoch": 0.6337558372248165, "loss": 0.6180705428123474, "step": 1900 }, { "ce_loss": 0.2857638895511627, "epoch": 0.6337558372248165, "step": 1900 }, { "distill_loss": 0.1499849557876587, "epoch": 0.6337558372248165, "step": 1900 }, { "epoch": 0.6337558372248165, "ref_ce_loss": 0.13046589493751526, "step": 1900 }, { "epoch": 0.6337558372248165, "loss": 0.6940881609916687, "step": 1900 }, { "ce_loss": 0.27692747116088867, "epoch": 0.6337558372248165, "step": 1900 }, { "distill_loss": 0.1486843228340149, "epoch": 0.6337558372248165, "step": 1900 }, { "epoch": 0.6337558372248165, "ref_ce_loss": 0.14957310259342194, "step": 1900 }, { "epoch": 0.6337558372248165, "loss": 0.5118992328643799, "step": 1900 }, { "ce_loss": 0.22834275662899017, "epoch": 0.6337558372248165, "step": 1900 }, { "distill_loss": 0.1341458261013031, "epoch": 0.6337558372248165, "step": 1900 }, { "epoch": 0.6337558372248165, "ref_ce_loss": 0.14600332081317902, "step": 1900 }, { "epoch": 0.6337558372248165, "loss": 1.128795862197876, "step": 1900 }, { "ce_loss": 0.4237273335456848, "epoch": 0.6337558372248165, "step": 1900 }, { "distill_loss": 0.19581244885921478, "epoch": 0.6337558372248165, "step": 1900 }, { "epoch": 0.6337558372248165, "ref_ce_loss": 0.2021128535270691, "step": 1900 }, { "epoch": 0.6370913942628419, "loss": 0.8588, "step": 1910 }, { "epoch": 0.6370913942628419, "grad_norm": 3.859954833984375, "step": 1910 }, { "epoch": 0.6370913942628419, "learning_rate": 0.00029806735829390415, "step": 1910 }, { "epoch": 0.6370913942628419, "loss": 1.5192310810089111, "step": 1910 }, { "ce_loss": 0.3459034264087677, "epoch": 0.6370913942628419, "step": 1910 }, { "distill_loss": 0.17199254035949707, "epoch": 0.6370913942628419, "step": 1910 }, { "epoch": 0.6370913942628419, "ref_ce_loss": 0.229523167014122, "step": 1910 }, { "epoch": 0.6370913942628419, "loss": 1.2944340705871582, "step": 1910 }, { "ce_loss": 0.2584053575992584, "epoch": 0.6370913942628419, "step": 1910 }, { "distill_loss": 0.17071333527565002, "epoch": 0.6370913942628419, "step": 1910 }, { "epoch": 0.6370913942628419, "ref_ce_loss": 0.1695045530796051, "step": 1910 }, { "epoch": 0.6370913942628419, "loss": 0.6828786134719849, "step": 1910 }, { "ce_loss": 0.31047752499580383, "epoch": 0.6370913942628419, "step": 1910 }, { "distill_loss": 0.1672109067440033, "epoch": 0.6370913942628419, "step": 1910 }, { "epoch": 0.6370913942628419, "ref_ce_loss": 0.14789322018623352, "step": 1910 }, { "epoch": 0.6370913942628419, "loss": 0.5672518610954285, "step": 1910 }, { "ce_loss": 0.24659834802150726, "epoch": 0.6370913942628419, "step": 1910 }, { "distill_loss": 0.13652631640434265, "epoch": 0.6370913942628419, "step": 1910 }, { "epoch": 0.6370913942628419, "ref_ce_loss": 0.18395711481571198, "step": 1910 }, { "epoch": 0.6404269513008672, "loss": 0.8946, "step": 1920 }, { "epoch": 0.6404269513008672, "grad_norm": 2.7981321811676025, "step": 1920 }, { "epoch": 0.6404269513008672, "learning_rate": 0.00029803481185255694, "step": 1920 }, { "epoch": 0.6404269513008672, "loss": 0.6429210901260376, "step": 1920 }, { "ce_loss": 0.28088638186454773, "epoch": 0.6404269513008672, "step": 1920 }, { "distill_loss": 0.16700200736522675, "epoch": 0.6404269513008672, "step": 1920 }, { "epoch": 0.6404269513008672, "ref_ce_loss": 0.14392881095409393, "step": 1920 }, { "epoch": 0.6404269513008672, "loss": 1.208841323852539, "step": 1920 }, { "ce_loss": 0.3831193745136261, "epoch": 0.6404269513008672, "step": 1920 }, { "distill_loss": 0.1752890944480896, "epoch": 0.6404269513008672, "step": 1920 }, { "epoch": 0.6404269513008672, "ref_ce_loss": 0.2569045424461365, "step": 1920 }, { "epoch": 0.6404269513008672, "loss": 1.2274980545043945, "step": 1920 }, { "ce_loss": 0.4168337285518646, "epoch": 0.6404269513008672, "step": 1920 }, { "distill_loss": 0.19316238164901733, "epoch": 0.6404269513008672, "step": 1920 }, { "epoch": 0.6404269513008672, "ref_ce_loss": 0.16584856808185577, "step": 1920 }, { "epoch": 0.6404269513008672, "loss": 0.8652777671813965, "step": 1920 }, { "ce_loss": 0.36839842796325684, "epoch": 0.6404269513008672, "step": 1920 }, { "distill_loss": 0.1646052449941635, "epoch": 0.6404269513008672, "step": 1920 }, { "epoch": 0.6404269513008672, "ref_ce_loss": 0.2286727875471115, "step": 1920 }, { "epoch": 0.6437625083388926, "loss": 0.8195, "step": 1930 }, { "epoch": 0.6437625083388926, "grad_norm": 3.6174919605255127, "step": 1930 }, { "epoch": 0.6437625083388926, "learning_rate": 0.00029800199545410787, "step": 1930 }, { "epoch": 0.6437625083388926, "loss": 0.5298856496810913, "step": 1930 }, { "ce_loss": 0.2486499845981598, "epoch": 0.6437625083388926, "step": 1930 }, { "distill_loss": 0.16173726320266724, "epoch": 0.6437625083388926, "step": 1930 }, { "epoch": 0.6437625083388926, "ref_ce_loss": 0.1191975474357605, "step": 1930 }, { "epoch": 0.6437625083388926, "loss": 0.5920172333717346, "step": 1930 }, { "ce_loss": 0.17179836332798004, "epoch": 0.6437625083388926, "step": 1930 }, { "distill_loss": 0.213221475481987, "epoch": 0.6437625083388926, "step": 1930 }, { "epoch": 0.6437625083388926, "ref_ce_loss": 0.14253775775432587, "step": 1930 }, { "epoch": 0.6437625083388926, "loss": 0.9216732978820801, "step": 1930 }, { "ce_loss": 0.2662244737148285, "epoch": 0.6437625083388926, "step": 1930 }, { "distill_loss": 0.20773352682590485, "epoch": 0.6437625083388926, "step": 1930 }, { "epoch": 0.6437625083388926, "ref_ce_loss": 0.1405613273382187, "step": 1930 }, { "epoch": 0.6437625083388926, "loss": 0.5762615203857422, "step": 1930 }, { "ce_loss": 0.21583402156829834, "epoch": 0.6437625083388926, "step": 1930 }, { "distill_loss": 0.15604069828987122, "epoch": 0.6437625083388926, "step": 1930 }, { "epoch": 0.6437625083388926, "ref_ce_loss": 0.1498086303472519, "step": 1930 }, { "epoch": 0.6470980653769179, "loss": 0.8172, "step": 1940 }, { "epoch": 0.6470980653769179, "grad_norm": 4.659943580627441, "step": 1940 }, { "epoch": 0.6470980653769179, "learning_rate": 0.0002979689091584011, "step": 1940 }, { "epoch": 0.6470980653769179, "loss": 0.5301949977874756, "step": 1940 }, { "ce_loss": 0.1708390712738037, "epoch": 0.6470980653769179, "step": 1940 }, { "distill_loss": 0.12711238861083984, "epoch": 0.6470980653769179, "step": 1940 }, { "epoch": 0.6470980653769179, "ref_ce_loss": 0.13796593248844147, "step": 1940 }, { "epoch": 0.6470980653769179, "loss": 1.0410027503967285, "step": 1940 }, { "ce_loss": 0.37468576431274414, "epoch": 0.6470980653769179, "step": 1940 }, { "distill_loss": 0.17693452537059784, "epoch": 0.6470980653769179, "step": 1940 }, { "epoch": 0.6470980653769179, "ref_ce_loss": 0.14826247096061707, "step": 1940 }, { "epoch": 0.6470980653769179, "loss": 0.7805930376052856, "step": 1940 }, { "ce_loss": 0.29428359866142273, "epoch": 0.6470980653769179, "step": 1940 }, { "distill_loss": 0.15377718210220337, "epoch": 0.6470980653769179, "step": 1940 }, { "epoch": 0.6470980653769179, "ref_ce_loss": 0.18468515574932098, "step": 1940 }, { "epoch": 0.6470980653769179, "loss": 0.5379800200462341, "step": 1940 }, { "ce_loss": 0.25719043612480164, "epoch": 0.6470980653769179, "step": 1940 }, { "distill_loss": 0.14073030650615692, "epoch": 0.6470980653769179, "step": 1940 }, { "epoch": 0.6470980653769179, "ref_ce_loss": 0.13994979858398438, "step": 1940 }, { "epoch": 0.6504336224149433, "loss": 0.7606, "step": 1950 }, { "epoch": 0.6504336224149433, "grad_norm": 2.321260929107666, "step": 1950 }, { "epoch": 0.6504336224149433, "learning_rate": 0.000297935553025773, "step": 1950 }, { "epoch": 0.6504336224149433, "loss": 0.5624473094940186, "step": 1950 }, { "ce_loss": 0.2591063976287842, "epoch": 0.6504336224149433, "step": 1950 }, { "distill_loss": 0.13360954821109772, "epoch": 0.6504336224149433, "step": 1950 }, { "epoch": 0.6504336224149433, "ref_ce_loss": 0.16947798430919647, "step": 1950 }, { "epoch": 0.6504336224149433, "loss": 0.6944127082824707, "step": 1950 }, { "ce_loss": 0.3012009263038635, "epoch": 0.6504336224149433, "step": 1950 }, { "distill_loss": 0.16501295566558838, "epoch": 0.6504336224149433, "step": 1950 }, { "epoch": 0.6504336224149433, "ref_ce_loss": 0.1470092236995697, "step": 1950 }, { "epoch": 0.6504336224149433, "loss": 0.7297142744064331, "step": 1950 }, { "ce_loss": 0.3316505253314972, "epoch": 0.6504336224149433, "step": 1950 }, { "distill_loss": 0.1685563027858734, "epoch": 0.6504336224149433, "step": 1950 }, { "epoch": 0.6504336224149433, "ref_ce_loss": 0.14613375067710876, "step": 1950 }, { "epoch": 0.6504336224149433, "loss": 0.5265622735023499, "step": 1950 }, { "ce_loss": 0.1956728845834732, "epoch": 0.6504336224149433, "step": 1950 }, { "distill_loss": 0.15365520119667053, "epoch": 0.6504336224149433, "step": 1950 }, { "epoch": 0.6504336224149433, "ref_ce_loss": 0.17705023288726807, "step": 1950 }, { "epoch": 0.6537691794529686, "loss": 0.8083, "step": 1960 }, { "epoch": 0.6537691794529686, "grad_norm": 2.691359758377075, "step": 1960 }, { "epoch": 0.6537691794529686, "learning_rate": 0.00029790192711705196, "step": 1960 }, { "epoch": 0.6537691794529686, "loss": 0.8530191779136658, "step": 1960 }, { "ce_loss": 0.3192751109600067, "epoch": 0.6537691794529686, "step": 1960 }, { "distill_loss": 0.18763959407806396, "epoch": 0.6537691794529686, "step": 1960 }, { "epoch": 0.6537691794529686, "ref_ce_loss": 0.13116642832756042, "step": 1960 }, { "epoch": 0.6537691794529686, "loss": 0.8203089237213135, "step": 1960 }, { "ce_loss": 0.3802758455276489, "epoch": 0.6537691794529686, "step": 1960 }, { "distill_loss": 0.16498608887195587, "epoch": 0.6537691794529686, "step": 1960 }, { "epoch": 0.6537691794529686, "ref_ce_loss": 0.2160835862159729, "step": 1960 }, { "epoch": 0.6537691794529686, "loss": 0.995650053024292, "step": 1960 }, { "ce_loss": 0.3241329789161682, "epoch": 0.6537691794529686, "step": 1960 }, { "distill_loss": 0.14752744138240814, "epoch": 0.6537691794529686, "step": 1960 }, { "epoch": 0.6537691794529686, "ref_ce_loss": 0.17125451564788818, "step": 1960 }, { "epoch": 0.6537691794529686, "loss": 0.631495475769043, "step": 1960 }, { "ce_loss": 0.15118324756622314, "epoch": 0.6537691794529686, "step": 1960 }, { "distill_loss": 0.12421630322933197, "epoch": 0.6537691794529686, "step": 1960 }, { "epoch": 0.6537691794529686, "ref_ce_loss": 0.11524390429258347, "step": 1960 }, { "epoch": 0.657104736490994, "loss": 0.7711, "step": 1970 }, { "epoch": 0.657104736490994, "grad_norm": 2.697709798812866, "step": 1970 }, { "epoch": 0.657104736490994, "learning_rate": 0.00029786803149355843, "step": 1970 }, { "epoch": 0.657104736490994, "loss": 0.44634926319122314, "step": 1970 }, { "ce_loss": 0.18240369856357574, "epoch": 0.657104736490994, "step": 1970 }, { "distill_loss": 0.15074263513088226, "epoch": 0.657104736490994, "step": 1970 }, { "epoch": 0.657104736490994, "ref_ce_loss": 0.1105746328830719, "step": 1970 }, { "epoch": 0.657104736490994, "loss": 0.5571528077125549, "step": 1970 }, { "ce_loss": 0.2294694185256958, "epoch": 0.657104736490994, "step": 1970 }, { "distill_loss": 0.1737332046031952, "epoch": 0.657104736490994, "step": 1970 }, { "epoch": 0.657104736490994, "ref_ce_loss": 0.14419282972812653, "step": 1970 }, { "epoch": 0.657104736490994, "loss": 1.3893582820892334, "step": 1970 }, { "ce_loss": 0.3939763307571411, "epoch": 0.657104736490994, "step": 1970 }, { "distill_loss": 0.1607140153646469, "epoch": 0.657104736490994, "step": 1970 }, { "epoch": 0.657104736490994, "ref_ce_loss": 0.17446056008338928, "step": 1970 }, { "epoch": 0.657104736490994, "loss": 0.8046218156814575, "step": 1970 }, { "ce_loss": 0.28628915548324585, "epoch": 0.657104736490994, "step": 1970 }, { "distill_loss": 0.16188013553619385, "epoch": 0.657104736490994, "step": 1970 }, { "epoch": 0.657104736490994, "ref_ce_loss": 0.16178050637245178, "step": 1970 }, { "epoch": 0.6604402935290193, "loss": 0.7564, "step": 1980 }, { "epoch": 0.6604402935290193, "grad_norm": 2.1782381534576416, "step": 1980 }, { "epoch": 0.6604402935290193, "learning_rate": 0.00029783386621710467, "step": 1980 }, { "epoch": 0.6604402935290193, "loss": 0.6052896976470947, "step": 1980 }, { "ce_loss": 0.21867820620536804, "epoch": 0.6604402935290193, "step": 1980 }, { "distill_loss": 0.12602409720420837, "epoch": 0.6604402935290193, "step": 1980 }, { "epoch": 0.6604402935290193, "ref_ce_loss": 0.1373637467622757, "step": 1980 }, { "epoch": 0.6604402935290193, "loss": 1.0979907512664795, "step": 1980 }, { "ce_loss": 0.4194616973400116, "epoch": 0.6604402935290193, "step": 1980 }, { "distill_loss": 0.19360217452049255, "epoch": 0.6604402935290193, "step": 1980 }, { "epoch": 0.6604402935290193, "ref_ce_loss": 0.21660290658473969, "step": 1980 }, { "epoch": 0.6604402935290193, "loss": 0.9994645118713379, "step": 1980 }, { "ce_loss": 0.26734447479248047, "epoch": 0.6604402935290193, "step": 1980 }, { "distill_loss": 0.14631450176239014, "epoch": 0.6604402935290193, "step": 1980 }, { "epoch": 0.6604402935290193, "ref_ce_loss": 0.19095124304294586, "step": 1980 }, { "epoch": 0.6604402935290193, "loss": 1.1816197633743286, "step": 1980 }, { "ce_loss": 0.32602429389953613, "epoch": 0.6604402935290193, "step": 1980 }, { "distill_loss": 0.17026512324810028, "epoch": 0.6604402935290193, "step": 1980 }, { "epoch": 0.6604402935290193, "ref_ce_loss": 0.16700991988182068, "step": 1980 }, { "epoch": 0.6637758505670447, "loss": 0.7854, "step": 1990 }, { "epoch": 0.6637758505670447, "grad_norm": 2.9347715377807617, "step": 1990 }, { "epoch": 0.6637758505670447, "learning_rate": 0.0002977994313499946, "step": 1990 }, { "epoch": 0.6637758505670447, "loss": 0.9755704402923584, "step": 1990 }, { "ce_loss": 0.26383841037750244, "epoch": 0.6637758505670447, "step": 1990 }, { "distill_loss": 0.10116223245859146, "epoch": 0.6637758505670447, "step": 1990 }, { "epoch": 0.6637758505670447, "ref_ce_loss": 0.15708518028259277, "step": 1990 }, { "epoch": 0.6637758505670447, "loss": 0.5551912784576416, "step": 1990 }, { "ce_loss": 0.21296478807926178, "epoch": 0.6637758505670447, "step": 1990 }, { "distill_loss": 0.12776599824428558, "epoch": 0.6637758505670447, "step": 1990 }, { "epoch": 0.6637758505670447, "ref_ce_loss": 0.10435923933982849, "step": 1990 }, { "epoch": 0.6637758505670447, "loss": 0.5524492859840393, "step": 1990 }, { "ce_loss": 0.26941555738449097, "epoch": 0.6637758505670447, "step": 1990 }, { "distill_loss": 0.11517900228500366, "epoch": 0.6637758505670447, "step": 1990 }, { "epoch": 0.6637758505670447, "ref_ce_loss": 0.16757942736148834, "step": 1990 }, { "epoch": 0.6637758505670447, "loss": 0.5536325573921204, "step": 1990 }, { "ce_loss": 0.30605706572532654, "epoch": 0.6637758505670447, "step": 1990 }, { "distill_loss": 0.1278000921010971, "epoch": 0.6637758505670447, "step": 1990 }, { "epoch": 0.6637758505670447, "ref_ce_loss": 0.11955641955137253, "step": 1990 }, { "epoch": 0.66711140760507, "loss": 0.9327, "step": 2000 }, { "epoch": 0.66711140760507, "grad_norm": 7.14279317855835, "step": 2000 }, { "epoch": 0.66711140760507, "learning_rate": 0.00029776472695502385, "step": 2000 }, { "epoch": 0.66711140760507, "loss": 1.3240374326705933, "step": 2000 }, { "ce_loss": 0.14523351192474365, "epoch": 0.66711140760507, "step": 2000 }, { "distill_loss": 0.945601224899292, "epoch": 0.66711140760507, "step": 2000 }, { "epoch": 0.66711140760507, "ref_ce_loss": 0.13358724117279053, "step": 2000 }, { "epoch": 0.66711140760507, "loss": 1.9700191020965576, "step": 2000 }, { "ce_loss": 0.37845736742019653, "epoch": 0.66711140760507, "step": 2000 }, { "distill_loss": 1.0867323875427246, "epoch": 0.66711140760507, "step": 2000 }, { "epoch": 0.66711140760507, "ref_ce_loss": 0.23238378763198853, "step": 2000 }, { "epoch": 0.66711140760507, "loss": 1.733937382698059, "step": 2000 }, { "ce_loss": 0.30824780464172363, "epoch": 0.66711140760507, "step": 2000 }, { "distill_loss": 0.8462389707565308, "epoch": 0.66711140760507, "step": 2000 }, { "epoch": 0.66711140760507, "ref_ce_loss": 0.13755394518375397, "step": 2000 }, { "epoch": 0.66711140760507, "loss": 1.9826970100402832, "step": 2000 }, { "ce_loss": 0.3606218099594116, "epoch": 0.66711140760507, "step": 2000 }, { "distill_loss": 1.0254515409469604, "epoch": 0.66711140760507, "step": 2000 }, { "epoch": 0.66711140760507, "ref_ce_loss": 0.16944263875484467, "step": 2000 }, { "epoch": 0.6704469646430954, "loss": 1.0835, "step": 2010 }, { "epoch": 0.6704469646430954, "grad_norm": 1.8303823471069336, "step": 2010 }, { "epoch": 0.6704469646430954, "learning_rate": 0.0002977297530954796, "step": 2010 }, { "epoch": 0.6704469646430954, "loss": 1.0600343942642212, "step": 2010 }, { "ce_loss": 0.22668348252773285, "epoch": 0.6704469646430954, "step": 2010 }, { "distill_loss": 0.2836839556694031, "epoch": 0.6704469646430954, "step": 2010 }, { "epoch": 0.6704469646430954, "ref_ce_loss": 0.1294415295124054, "step": 2010 }, { "epoch": 0.6704469646430954, "loss": 0.828821063041687, "step": 2010 }, { "ce_loss": 0.24913601577281952, "epoch": 0.6704469646430954, "step": 2010 }, { "distill_loss": 0.24501857161521912, "epoch": 0.6704469646430954, "step": 2010 }, { "epoch": 0.6704469646430954, "ref_ce_loss": 0.17393875122070312, "step": 2010 }, { "epoch": 0.6704469646430954, "loss": 1.1273013353347778, "step": 2010 }, { "ce_loss": 0.2586398124694824, "epoch": 0.6704469646430954, "step": 2010 }, { "distill_loss": 0.2831948399543762, "epoch": 0.6704469646430954, "step": 2010 }, { "epoch": 0.6704469646430954, "ref_ce_loss": 0.18231160938739777, "step": 2010 }, { "epoch": 0.6704469646430954, "loss": 0.5938960909843445, "step": 2010 }, { "ce_loss": 0.2319275587797165, "epoch": 0.6704469646430954, "step": 2010 }, { "distill_loss": 0.22680506110191345, "epoch": 0.6704469646430954, "step": 2010 }, { "epoch": 0.6704469646430954, "ref_ce_loss": 0.1350928097963333, "step": 2010 }, { "epoch": 0.6737825216811207, "loss": 0.9008, "step": 2020 }, { "epoch": 0.6737825216811207, "grad_norm": 2.8291239738464355, "step": 2020 }, { "epoch": 0.6737825216811207, "learning_rate": 0.0002976945098351403, "step": 2020 }, { "epoch": 0.6737825216811207, "loss": 0.590222954750061, "step": 2020 }, { "ce_loss": 0.27437344193458557, "epoch": 0.6737825216811207, "step": 2020 }, { "distill_loss": 0.17268088459968567, "epoch": 0.6737825216811207, "step": 2020 }, { "epoch": 0.6737825216811207, "ref_ce_loss": 0.14304281771183014, "step": 2020 }, { "epoch": 0.6737825216811207, "loss": 0.8302665948867798, "step": 2020 }, { "ce_loss": 0.2411762773990631, "epoch": 0.6737825216811207, "step": 2020 }, { "distill_loss": 0.1841261237859726, "epoch": 0.6737825216811207, "step": 2020 }, { "epoch": 0.6737825216811207, "ref_ce_loss": 0.1724471151828766, "step": 2020 }, { "epoch": 0.6737825216811207, "loss": 0.6391019821166992, "step": 2020 }, { "ce_loss": 0.19330915808677673, "epoch": 0.6737825216811207, "step": 2020 }, { "distill_loss": 0.16628727316856384, "epoch": 0.6737825216811207, "step": 2020 }, { "epoch": 0.6737825216811207, "ref_ce_loss": 0.14351928234100342, "step": 2020 }, { "epoch": 0.6737825216811207, "loss": 0.9922801852226257, "step": 2020 }, { "ce_loss": 0.18819864094257355, "epoch": 0.6737825216811207, "step": 2020 }, { "distill_loss": 0.13531914353370667, "epoch": 0.6737825216811207, "step": 2020 }, { "epoch": 0.6737825216811207, "ref_ce_loss": 0.1615561842918396, "step": 2020 }, { "epoch": 0.6771180787191461, "loss": 0.84, "step": 2030 }, { "epoch": 0.6771180787191461, "grad_norm": 3.3795111179351807, "step": 2030 }, { "epoch": 0.6771180787191461, "learning_rate": 0.00029765899723827575, "step": 2030 }, { "epoch": 0.6771180787191461, "loss": 0.8877922892570496, "step": 2030 }, { "ce_loss": 0.2900596261024475, "epoch": 0.6771180787191461, "step": 2030 }, { "distill_loss": 0.24294210970401764, "epoch": 0.6771180787191461, "step": 2030 }, { "epoch": 0.6771180787191461, "ref_ce_loss": 0.17468880116939545, "step": 2030 }, { "epoch": 0.6771180787191461, "loss": 0.8432213068008423, "step": 2030 }, { "ce_loss": 0.3750610053539276, "epoch": 0.6771180787191461, "step": 2030 }, { "distill_loss": 0.22893035411834717, "epoch": 0.6771180787191461, "step": 2030 }, { "epoch": 0.6771180787191461, "ref_ce_loss": 0.1852346807718277, "step": 2030 }, { "epoch": 0.6771180787191461, "loss": 0.6145089864730835, "step": 2030 }, { "ce_loss": 0.25153613090515137, "epoch": 0.6771180787191461, "step": 2030 }, { "distill_loss": 0.18749454617500305, "epoch": 0.6771180787191461, "step": 2030 }, { "epoch": 0.6771180787191461, "ref_ce_loss": 0.1751817911863327, "step": 2030 }, { "epoch": 0.6771180787191461, "loss": 1.0318095684051514, "step": 2030 }, { "ce_loss": 0.17771191895008087, "epoch": 0.6771180787191461, "step": 2030 }, { "distill_loss": 0.17989817261695862, "epoch": 0.6771180787191461, "step": 2030 }, { "epoch": 0.6771180787191461, "ref_ce_loss": 0.12897759675979614, "step": 2030 }, { "epoch": 0.6804536357571714, "loss": 0.7896, "step": 2040 }, { "epoch": 0.6804536357571714, "grad_norm": 3.249816656112671, "step": 2040 }, { "epoch": 0.6804536357571714, "learning_rate": 0.00029762321536964704, "step": 2040 }, { "epoch": 0.6804536357571714, "loss": 0.9905202388763428, "step": 2040 }, { "ce_loss": 0.37102267146110535, "epoch": 0.6804536357571714, "step": 2040 }, { "distill_loss": 0.21594317257404327, "epoch": 0.6804536357571714, "step": 2040 }, { "epoch": 0.6804536357571714, "ref_ce_loss": 0.2302805334329605, "step": 2040 }, { "epoch": 0.6804536357571714, "loss": 0.8301817178726196, "step": 2040 }, { "ce_loss": 0.22139473259449005, "epoch": 0.6804536357571714, "step": 2040 }, { "distill_loss": 0.17883756756782532, "epoch": 0.6804536357571714, "step": 2040 }, { "epoch": 0.6804536357571714, "ref_ce_loss": 0.15354622900485992, "step": 2040 }, { "epoch": 0.6804536357571714, "loss": 0.5395179390907288, "step": 2040 }, { "ce_loss": 0.2128942608833313, "epoch": 0.6804536357571714, "step": 2040 }, { "distill_loss": 0.1688617467880249, "epoch": 0.6804536357571714, "step": 2040 }, { "epoch": 0.6804536357571714, "ref_ce_loss": 0.10727988183498383, "step": 2040 }, { "epoch": 0.6804536357571714, "loss": 0.8493964672088623, "step": 2040 }, { "ce_loss": 0.29065755009651184, "epoch": 0.6804536357571714, "step": 2040 }, { "distill_loss": 0.21070683002471924, "epoch": 0.6804536357571714, "step": 2040 }, { "epoch": 0.6804536357571714, "ref_ce_loss": 0.17182712256908417, "step": 2040 }, { "epoch": 0.6837891927951968, "loss": 0.8114, "step": 2050 }, { "epoch": 0.6837891927951968, "grad_norm": 2.234649658203125, "step": 2050 }, { "epoch": 0.6837891927951968, "learning_rate": 0.0002975871642945061, "step": 2050 }, { "epoch": 0.6837891927951968, "loss": 1.5577614307403564, "step": 2050 }, { "ce_loss": 0.32842814922332764, "epoch": 0.6837891927951968, "step": 2050 }, { "distill_loss": 0.1840927004814148, "epoch": 0.6837891927951968, "step": 2050 }, { "epoch": 0.6837891927951968, "ref_ce_loss": 0.1488945633172989, "step": 2050 }, { "epoch": 0.6837891927951968, "loss": 0.5667079091072083, "step": 2050 }, { "ce_loss": 0.1807955801486969, "epoch": 0.6837891927951968, "step": 2050 }, { "distill_loss": 0.16267137229442596, "epoch": 0.6837891927951968, "step": 2050 }, { "epoch": 0.6837891927951968, "ref_ce_loss": 0.10108146071434021, "step": 2050 }, { "epoch": 0.6837891927951968, "loss": 0.35961073637008667, "step": 2050 }, { "ce_loss": 0.1420925408601761, "epoch": 0.6837891927951968, "step": 2050 }, { "distill_loss": 0.13304996490478516, "epoch": 0.6837891927951968, "step": 2050 }, { "epoch": 0.6837891927951968, "ref_ce_loss": 0.08395658433437347, "step": 2050 }, { "epoch": 0.6837891927951968, "loss": 1.5085173845291138, "step": 2050 }, { "ce_loss": 0.2386820912361145, "epoch": 0.6837891927951968, "step": 2050 }, { "distill_loss": 0.1670990139245987, "epoch": 0.6837891927951968, "step": 2050 }, { "epoch": 0.6837891927951968, "ref_ce_loss": 0.16288962960243225, "step": 2050 }, { "epoch": 0.6871247498332221, "loss": 0.8301, "step": 2060 }, { "epoch": 0.6871247498332221, "grad_norm": 4.133786201477051, "step": 2060 }, { "epoch": 0.6871247498332221, "learning_rate": 0.0002975508440785958, "step": 2060 }, { "epoch": 0.6871247498332221, "loss": 0.7842780351638794, "step": 2060 }, { "ce_loss": 0.36025092005729675, "epoch": 0.6871247498332221, "step": 2060 }, { "distill_loss": 0.1954689919948578, "epoch": 0.6871247498332221, "step": 2060 }, { "epoch": 0.6871247498332221, "ref_ce_loss": 0.22843322157859802, "step": 2060 }, { "epoch": 0.6871247498332221, "loss": 0.7526856064796448, "step": 2060 }, { "ce_loss": 0.24765729904174805, "epoch": 0.6871247498332221, "step": 2060 }, { "distill_loss": 0.20760603249073029, "epoch": 0.6871247498332221, "step": 2060 }, { "epoch": 0.6871247498332221, "ref_ce_loss": 0.19382600486278534, "step": 2060 }, { "epoch": 0.6871247498332221, "loss": 0.9192790985107422, "step": 2060 }, { "ce_loss": 0.2992182970046997, "epoch": 0.6871247498332221, "step": 2060 }, { "distill_loss": 0.20960591733455658, "epoch": 0.6871247498332221, "step": 2060 }, { "epoch": 0.6871247498332221, "ref_ce_loss": 0.16578657925128937, "step": 2060 }, { "epoch": 0.6871247498332221, "loss": 0.7168542742729187, "step": 2060 }, { "ce_loss": 0.2953929603099823, "epoch": 0.6871247498332221, "step": 2060 }, { "distill_loss": 0.2222084403038025, "epoch": 0.6871247498332221, "step": 2060 }, { "epoch": 0.6871247498332221, "ref_ce_loss": 0.1362280398607254, "step": 2060 }, { "epoch": 0.6904603068712475, "loss": 0.8254, "step": 2070 }, { "epoch": 0.6904603068712475, "grad_norm": 1.7999627590179443, "step": 2070 }, { "epoch": 0.6904603068712475, "learning_rate": 0.0002975142547881501, "step": 2070 }, { "epoch": 0.6904603068712475, "loss": 0.8302853107452393, "step": 2070 }, { "ce_loss": 0.3683020770549774, "epoch": 0.6904603068712475, "step": 2070 }, { "distill_loss": 0.14901624619960785, "epoch": 0.6904603068712475, "step": 2070 }, { "epoch": 0.6904603068712475, "ref_ce_loss": 0.21112219989299774, "step": 2070 }, { "epoch": 0.6904603068712475, "loss": 0.8270956873893738, "step": 2070 }, { "ce_loss": 0.3249755799770355, "epoch": 0.6904603068712475, "step": 2070 }, { "distill_loss": 0.14936313033103943, "epoch": 0.6904603068712475, "step": 2070 }, { "epoch": 0.6904603068712475, "ref_ce_loss": 0.17302238941192627, "step": 2070 }, { "epoch": 0.6904603068712475, "loss": 1.6876327991485596, "step": 2070 }, { "ce_loss": 0.30081063508987427, "epoch": 0.6904603068712475, "step": 2070 }, { "distill_loss": 0.16745862364768982, "epoch": 0.6904603068712475, "step": 2070 }, { "epoch": 0.6904603068712475, "ref_ce_loss": 0.12626586854457855, "step": 2070 }, { "epoch": 0.6904603068712475, "loss": 0.8751262426376343, "step": 2070 }, { "ce_loss": 0.31323501467704773, "epoch": 0.6904603068712475, "step": 2070 }, { "distill_loss": 0.16205358505249023, "epoch": 0.6904603068712475, "step": 2070 }, { "epoch": 0.6904603068712475, "ref_ce_loss": 0.2028389722108841, "step": 2070 }, { "epoch": 0.6937958639092728, "loss": 0.8444, "step": 2080 }, { "epoch": 0.6937958639092728, "grad_norm": 2.48443341255188, "step": 2080 }, { "epoch": 0.6937958639092728, "learning_rate": 0.00029747739648989315, "step": 2080 }, { "epoch": 0.6937958639092728, "loss": 0.7376833558082581, "step": 2080 }, { "ce_loss": 0.28451669216156006, "epoch": 0.6937958639092728, "step": 2080 }, { "distill_loss": 0.20127324759960175, "epoch": 0.6937958639092728, "step": 2080 }, { "epoch": 0.6937958639092728, "ref_ce_loss": 0.1741037368774414, "step": 2080 }, { "epoch": 0.6937958639092728, "loss": 0.4693785607814789, "step": 2080 }, { "ce_loss": 0.16751381754875183, "epoch": 0.6937958639092728, "step": 2080 }, { "distill_loss": 0.15593096613883972, "epoch": 0.6937958639092728, "step": 2080 }, { "epoch": 0.6937958639092728, "ref_ce_loss": 0.14585992693901062, "step": 2080 }, { "epoch": 0.6937958639092728, "loss": 1.0831847190856934, "step": 2080 }, { "ce_loss": 0.31920549273490906, "epoch": 0.6937958639092728, "step": 2080 }, { "distill_loss": 0.23243792355060577, "epoch": 0.6937958639092728, "step": 2080 }, { "epoch": 0.6937958639092728, "ref_ce_loss": 0.1392020583152771, "step": 2080 }, { "epoch": 0.6937958639092728, "loss": 0.6108224391937256, "step": 2080 }, { "ce_loss": 0.19906829297542572, "epoch": 0.6937958639092728, "step": 2080 }, { "distill_loss": 0.21175572276115417, "epoch": 0.6937958639092728, "step": 2080 }, { "epoch": 0.6937958639092728, "ref_ce_loss": 0.12633490562438965, "step": 2080 }, { "epoch": 0.6971314209472982, "loss": 1.1597, "step": 2090 }, { "epoch": 0.6971314209472982, "grad_norm": 3.2459542751312256, "step": 2090 }, { "epoch": 0.6971314209472982, "learning_rate": 0.00029744026925104014, "step": 2090 }, { "epoch": 0.6971314209472982, "loss": 0.8244901895523071, "step": 2090 }, { "ce_loss": 0.18612109124660492, "epoch": 0.6971314209472982, "step": 2090 }, { "distill_loss": 0.4257374703884125, "epoch": 0.6971314209472982, "step": 2090 }, { "epoch": 0.6971314209472982, "ref_ce_loss": 0.12089783698320389, "step": 2090 }, { "epoch": 0.6971314209472982, "loss": 1.226898193359375, "step": 2090 }, { "ce_loss": 0.3660268485546112, "epoch": 0.6971314209472982, "step": 2090 }, { "distill_loss": 0.5295765995979309, "epoch": 0.6971314209472982, "step": 2090 }, { "epoch": 0.6971314209472982, "ref_ce_loss": 0.16705910861492157, "step": 2090 }, { "epoch": 0.6971314209472982, "loss": 1.0499669313430786, "step": 2090 }, { "ce_loss": 0.26359203457832336, "epoch": 0.6971314209472982, "step": 2090 }, { "distill_loss": 0.5406015515327454, "epoch": 0.6971314209472982, "step": 2090 }, { "epoch": 0.6971314209472982, "ref_ce_loss": 0.16747480630874634, "step": 2090 }, { "epoch": 0.6971314209472982, "loss": 0.9574893712997437, "step": 2090 }, { "ce_loss": 0.27059000730514526, "epoch": 0.6971314209472982, "step": 2090 }, { "distill_loss": 0.4332764446735382, "epoch": 0.6971314209472982, "step": 2090 }, { "epoch": 0.6971314209472982, "ref_ce_loss": 0.17012599110603333, "step": 2090 }, { "epoch": 0.7004669779853235, "loss": 1.1214, "step": 2100 }, { "epoch": 0.7004669779853235, "grad_norm": 3.7141575813293457, "step": 2100 }, { "epoch": 0.7004669779853235, "learning_rate": 0.00029740287313929643, "step": 2100 }, { "epoch": 0.7004669779853235, "loss": 1.2868798971176147, "step": 2100 }, { "ce_loss": 0.24377526342868805, "epoch": 0.7004669779853235, "step": 2100 }, { "distill_loss": 0.5068783760070801, "epoch": 0.7004669779853235, "step": 2100 }, { "epoch": 0.7004669779853235, "ref_ce_loss": 0.122403085231781, "step": 2100 }, { "epoch": 0.7004669779853235, "loss": 0.9823429584503174, "step": 2100 }, { "ce_loss": 0.27241188287734985, "epoch": 0.7004669779853235, "step": 2100 }, { "distill_loss": 0.39842548966407776, "epoch": 0.7004669779853235, "step": 2100 }, { "epoch": 0.7004669779853235, "ref_ce_loss": 0.18590964376926422, "step": 2100 }, { "epoch": 0.7004669779853235, "loss": 1.0657988786697388, "step": 2100 }, { "ce_loss": 0.21215464174747467, "epoch": 0.7004669779853235, "step": 2100 }, { "distill_loss": 0.368913471698761, "epoch": 0.7004669779853235, "step": 2100 }, { "epoch": 0.7004669779853235, "ref_ce_loss": 0.23388366401195526, "step": 2100 }, { "epoch": 0.7004669779853235, "loss": 0.9568293690681458, "step": 2100 }, { "ce_loss": 0.3097842335700989, "epoch": 0.7004669779853235, "step": 2100 }, { "distill_loss": 0.37886306643486023, "epoch": 0.7004669779853235, "step": 2100 }, { "epoch": 0.7004669779853235, "ref_ce_loss": 0.1720070242881775, "step": 2100 }, { "epoch": 0.7038025350233489, "loss": 1.0944, "step": 2110 }, { "epoch": 0.7038025350233489, "grad_norm": 8.202859878540039, "step": 2110 }, { "epoch": 0.7038025350233489, "learning_rate": 0.0002973652082228578, "step": 2110 }, { "epoch": 0.7038025350233489, "loss": 1.5614635944366455, "step": 2110 }, { "ce_loss": 0.4321347177028656, "epoch": 0.7038025350233489, "step": 2110 }, { "distill_loss": 0.5474117994308472, "epoch": 0.7038025350233489, "step": 2110 }, { "epoch": 0.7038025350233489, "ref_ce_loss": 0.17317543923854828, "step": 2110 }, { "epoch": 0.7038025350233489, "loss": 1.1091805696487427, "step": 2110 }, { "ce_loss": 0.19208572804927826, "epoch": 0.7038025350233489, "step": 2110 }, { "distill_loss": 0.3152616620063782, "epoch": 0.7038025350233489, "step": 2110 }, { "epoch": 0.7038025350233489, "ref_ce_loss": 0.19635343551635742, "step": 2110 }, { "epoch": 0.7038025350233489, "loss": 1.0785329341888428, "step": 2110 }, { "ce_loss": 0.3584291338920593, "epoch": 0.7038025350233489, "step": 2110 }, { "distill_loss": 0.41182008385658264, "epoch": 0.7038025350233489, "step": 2110 }, { "epoch": 0.7038025350233489, "ref_ce_loss": 0.14687897264957428, "step": 2110 }, { "epoch": 0.7038025350233489, "loss": 1.312581181526184, "step": 2110 }, { "ce_loss": 0.3514189124107361, "epoch": 0.7038025350233489, "step": 2110 }, { "distill_loss": 0.495456337928772, "epoch": 0.7038025350233489, "step": 2110 }, { "epoch": 0.7038025350233489, "ref_ce_loss": 0.14532673358917236, "step": 2110 }, { "epoch": 0.7071380920613742, "loss": 1.0739, "step": 2120 }, { "epoch": 0.7071380920613742, "grad_norm": 3.721625566482544, "step": 2120 }, { "epoch": 0.7071380920613742, "learning_rate": 0.00029732727457041025, "step": 2120 }, { "epoch": 0.7071380920613742, "loss": 1.1425330638885498, "step": 2120 }, { "ce_loss": 0.362537145614624, "epoch": 0.7071380920613742, "step": 2120 }, { "distill_loss": 0.3849060535430908, "epoch": 0.7071380920613742, "step": 2120 }, { "epoch": 0.7071380920613742, "ref_ce_loss": 0.17424069344997406, "step": 2120 }, { "epoch": 0.7071380920613742, "loss": 1.050082802772522, "step": 2120 }, { "ce_loss": 0.30090203881263733, "epoch": 0.7071380920613742, "step": 2120 }, { "distill_loss": 0.42408767342567444, "epoch": 0.7071380920613742, "step": 2120 }, { "epoch": 0.7071380920613742, "ref_ce_loss": 0.10593853145837784, "step": 2120 }, { "epoch": 0.7071380920613742, "loss": 1.2173357009887695, "step": 2120 }, { "ce_loss": 0.2668623924255371, "epoch": 0.7071380920613742, "step": 2120 }, { "distill_loss": 0.39260539412498474, "epoch": 0.7071380920613742, "step": 2120 }, { "epoch": 0.7071380920613742, "ref_ce_loss": 0.17890018224716187, "step": 2120 }, { "epoch": 0.7071380920613742, "loss": 1.525931715965271, "step": 2120 }, { "ce_loss": 0.35097646713256836, "epoch": 0.7071380920613742, "step": 2120 }, { "distill_loss": 0.341688334941864, "epoch": 0.7071380920613742, "step": 2120 }, { "epoch": 0.7071380920613742, "ref_ce_loss": 0.2002553790807724, "step": 2120 }, { "epoch": 0.7104736490993996, "loss": 0.8325, "step": 2130 }, { "epoch": 0.7104736490993996, "grad_norm": 5.317857265472412, "step": 2130 }, { "epoch": 0.7104736490993996, "learning_rate": 0.0002972890722511297, "step": 2130 }, { "epoch": 0.7104736490993996, "loss": 0.9740656614303589, "step": 2130 }, { "ce_loss": 0.2215503603219986, "epoch": 0.7104736490993996, "step": 2130 }, { "distill_loss": 0.18275123834609985, "epoch": 0.7104736490993996, "step": 2130 }, { "epoch": 0.7104736490993996, "ref_ce_loss": 0.18407899141311646, "step": 2130 }, { "epoch": 0.7104736490993996, "loss": 0.9693307876586914, "step": 2130 }, { "ce_loss": 0.33199867606163025, "epoch": 0.7104736490993996, "step": 2130 }, { "distill_loss": 0.2375098168849945, "epoch": 0.7104736490993996, "step": 2130 }, { "epoch": 0.7104736490993996, "ref_ce_loss": 0.17215308547019958, "step": 2130 }, { "epoch": 0.7104736490993996, "loss": 1.586306095123291, "step": 2130 }, { "ce_loss": 0.32106271386146545, "epoch": 0.7104736490993996, "step": 2130 }, { "distill_loss": 0.1949656754732132, "epoch": 0.7104736490993996, "step": 2130 }, { "epoch": 0.7104736490993996, "ref_ce_loss": 0.11499127000570297, "step": 2130 }, { "epoch": 0.7104736490993996, "loss": 0.836284875869751, "step": 2130 }, { "ce_loss": 0.3160724639892578, "epoch": 0.7104736490993996, "step": 2130 }, { "distill_loss": 0.1925450563430786, "epoch": 0.7104736490993996, "step": 2130 }, { "epoch": 0.7104736490993996, "ref_ce_loss": 0.24203264713287354, "step": 2130 }, { "epoch": 0.7138092061374249, "loss": 0.7741, "step": 2140 }, { "epoch": 0.7138092061374249, "grad_norm": 2.827847719192505, "step": 2140 }, { "epoch": 0.7138092061374249, "learning_rate": 0.0002972506013346822, "step": 2140 }, { "epoch": 0.7138092061374249, "loss": 0.853046715259552, "step": 2140 }, { "ce_loss": 0.3299194574356079, "epoch": 0.7138092061374249, "step": 2140 }, { "distill_loss": 0.17669892311096191, "epoch": 0.7138092061374249, "step": 2140 }, { "epoch": 0.7138092061374249, "ref_ce_loss": 0.13139741122722626, "step": 2140 }, { "epoch": 0.7138092061374249, "loss": 0.6886839270591736, "step": 2140 }, { "ce_loss": 0.3075639009475708, "epoch": 0.7138092061374249, "step": 2140 }, { "distill_loss": 0.18278264999389648, "epoch": 0.7138092061374249, "step": 2140 }, { "epoch": 0.7138092061374249, "ref_ce_loss": 0.19807660579681396, "step": 2140 }, { "epoch": 0.7138092061374249, "loss": 0.6770578622817993, "step": 2140 }, { "ce_loss": 0.2196367233991623, "epoch": 0.7138092061374249, "step": 2140 }, { "distill_loss": 0.1661662459373474, "epoch": 0.7138092061374249, "step": 2140 }, { "epoch": 0.7138092061374249, "ref_ce_loss": 0.12581881880760193, "step": 2140 }, { "epoch": 0.7138092061374249, "loss": 1.0704749822616577, "step": 2140 }, { "ce_loss": 0.3837515711784363, "epoch": 0.7138092061374249, "step": 2140 }, { "distill_loss": 0.18932490050792694, "epoch": 0.7138092061374249, "step": 2140 }, { "epoch": 0.7138092061374249, "ref_ce_loss": 0.19901344180107117, "step": 2140 }, { "epoch": 0.7171447631754503, "loss": 0.7765, "step": 2150 }, { "epoch": 0.7171447631754503, "grad_norm": 2.2405152320861816, "step": 2150 }, { "epoch": 0.7171447631754503, "learning_rate": 0.00029721186189122346, "step": 2150 }, { "epoch": 0.7171447631754503, "loss": 0.521872878074646, "step": 2150 }, { "ce_loss": 0.20972879230976105, "epoch": 0.7171447631754503, "step": 2150 }, { "distill_loss": 0.1326482594013214, "epoch": 0.7171447631754503, "step": 2150 }, { "epoch": 0.7171447631754503, "ref_ce_loss": 0.1792561411857605, "step": 2150 }, { "epoch": 0.7171447631754503, "loss": 0.7119396328926086, "step": 2150 }, { "ce_loss": 0.31725606322288513, "epoch": 0.7171447631754503, "step": 2150 }, { "distill_loss": 0.13686680793762207, "epoch": 0.7171447631754503, "step": 2150 }, { "epoch": 0.7171447631754503, "ref_ce_loss": 0.18049372732639313, "step": 2150 }, { "epoch": 0.7171447631754503, "loss": 0.5622824430465698, "step": 2150 }, { "ce_loss": 0.207227885723114, "epoch": 0.7171447631754503, "step": 2150 }, { "distill_loss": 0.1053415834903717, "epoch": 0.7171447631754503, "step": 2150 }, { "epoch": 0.7171447631754503, "ref_ce_loss": 0.17381590604782104, "step": 2150 }, { "epoch": 0.7171447631754503, "loss": 0.8207659721374512, "step": 2150 }, { "ce_loss": 0.24983304738998413, "epoch": 0.7171447631754503, "step": 2150 }, { "distill_loss": 0.1186215877532959, "epoch": 0.7171447631754503, "step": 2150 }, { "epoch": 0.7171447631754503, "ref_ce_loss": 0.20335406064987183, "step": 2150 }, { "epoch": 0.7204803202134756, "loss": 0.739, "step": 2160 }, { "epoch": 0.7204803202134756, "grad_norm": 4.988354682922363, "step": 2160 }, { "epoch": 0.7204803202134756, "learning_rate": 0.000297172853991399, "step": 2160 }, { "epoch": 0.7204803202134756, "loss": 0.9101301431655884, "step": 2160 }, { "ce_loss": 0.23088371753692627, "epoch": 0.7204803202134756, "step": 2160 }, { "distill_loss": 0.14969925582408905, "epoch": 0.7204803202134756, "step": 2160 }, { "epoch": 0.7204803202134756, "ref_ce_loss": 0.247276172041893, "step": 2160 }, { "epoch": 0.7204803202134756, "loss": 0.5818696022033691, "step": 2160 }, { "ce_loss": 0.21994711458683014, "epoch": 0.7204803202134756, "step": 2160 }, { "distill_loss": 0.19431249797344208, "epoch": 0.7204803202134756, "step": 2160 }, { "epoch": 0.7204803202134756, "ref_ce_loss": 0.12076810002326965, "step": 2160 }, { "epoch": 0.7204803202134756, "loss": 0.6261471509933472, "step": 2160 }, { "ce_loss": 0.2857092618942261, "epoch": 0.7204803202134756, "step": 2160 }, { "distill_loss": 0.18585669994354248, "epoch": 0.7204803202134756, "step": 2160 }, { "epoch": 0.7204803202134756, "ref_ce_loss": 0.1544153392314911, "step": 2160 }, { "epoch": 0.7204803202134756, "loss": 0.7494639754295349, "step": 2160 }, { "ce_loss": 0.26624763011932373, "epoch": 0.7204803202134756, "step": 2160 }, { "distill_loss": 0.15255224704742432, "epoch": 0.7204803202134756, "step": 2160 }, { "epoch": 0.7204803202134756, "ref_ce_loss": 0.13864634931087494, "step": 2160 }, { "epoch": 0.723815877251501, "loss": 0.8542, "step": 2170 }, { "epoch": 0.723815877251501, "grad_norm": 2.2944247722625732, "step": 2170 }, { "epoch": 0.723815877251501, "learning_rate": 0.0002971335777063438, "step": 2170 }, { "epoch": 0.723815877251501, "loss": 0.7885057926177979, "step": 2170 }, { "ce_loss": 0.2262822985649109, "epoch": 0.723815877251501, "step": 2170 }, { "distill_loss": 0.22849422693252563, "epoch": 0.723815877251501, "step": 2170 }, { "epoch": 0.723815877251501, "ref_ce_loss": 0.24023662507534027, "step": 2170 }, { "epoch": 0.723815877251501, "loss": 1.0535857677459717, "step": 2170 }, { "ce_loss": 0.33788785338401794, "epoch": 0.723815877251501, "step": 2170 }, { "distill_loss": 0.28586989641189575, "epoch": 0.723815877251501, "step": 2170 }, { "epoch": 0.723815877251501, "ref_ce_loss": 0.1780318319797516, "step": 2170 }, { "epoch": 0.723815877251501, "loss": 0.929999828338623, "step": 2170 }, { "ce_loss": 0.3797388970851898, "epoch": 0.723815877251501, "step": 2170 }, { "distill_loss": 0.2834654748439789, "epoch": 0.723815877251501, "step": 2170 }, { "epoch": 0.723815877251501, "ref_ce_loss": 0.1784849613904953, "step": 2170 }, { "epoch": 0.723815877251501, "loss": 0.8898862600326538, "step": 2170 }, { "ce_loss": 0.3450695872306824, "epoch": 0.723815877251501, "step": 2170 }, { "distill_loss": 0.3220275640487671, "epoch": 0.723815877251501, "step": 2170 }, { "epoch": 0.723815877251501, "ref_ce_loss": 0.16206231713294983, "step": 2170 }, { "epoch": 0.7271514342895263, "loss": 0.9429, "step": 2180 }, { "epoch": 0.7271514342895263, "grad_norm": 2.309176445007324, "step": 2180 }, { "epoch": 0.7271514342895263, "learning_rate": 0.0002970940331076823, "step": 2180 }, { "epoch": 0.7271514342895263, "loss": 0.8022871017456055, "step": 2180 }, { "ce_loss": 0.3425973355770111, "epoch": 0.7271514342895263, "step": 2180 }, { "distill_loss": 0.2834654152393341, "epoch": 0.7271514342895263, "step": 2180 }, { "epoch": 0.7271514342895263, "ref_ce_loss": 0.17585596442222595, "step": 2180 }, { "epoch": 0.7271514342895263, "loss": 0.9950260519981384, "step": 2180 }, { "ce_loss": 0.276937872171402, "epoch": 0.7271514342895263, "step": 2180 }, { "distill_loss": 0.3323085606098175, "epoch": 0.7271514342895263, "step": 2180 }, { "epoch": 0.7271514342895263, "ref_ce_loss": 0.15872758626937866, "step": 2180 }, { "epoch": 0.7271514342895263, "loss": 1.056720495223999, "step": 2180 }, { "ce_loss": 0.25281429290771484, "epoch": 0.7271514342895263, "step": 2180 }, { "distill_loss": 0.3386659622192383, "epoch": 0.7271514342895263, "step": 2180 }, { "epoch": 0.7271514342895263, "ref_ce_loss": 0.16279882192611694, "step": 2180 }, { "epoch": 0.7271514342895263, "loss": 0.8699762225151062, "step": 2180 }, { "ce_loss": 0.34646478295326233, "epoch": 0.7271514342895263, "step": 2180 }, { "distill_loss": 0.2966349720954895, "epoch": 0.7271514342895263, "step": 2180 }, { "epoch": 0.7271514342895263, "ref_ce_loss": 0.18022169172763824, "step": 2180 }, { "epoch": 0.7304869913275517, "loss": 0.9007, "step": 2190 }, { "epoch": 0.7304869913275517, "grad_norm": 2.0588133335113525, "step": 2190 }, { "epoch": 0.7304869913275517, "learning_rate": 0.00029705422026752833, "step": 2190 }, { "epoch": 0.7304869913275517, "loss": 0.6151515245437622, "step": 2190 }, { "ce_loss": 0.23072300851345062, "epoch": 0.7304869913275517, "step": 2190 }, { "distill_loss": 0.16477593779563904, "epoch": 0.7304869913275517, "step": 2190 }, { "epoch": 0.7304869913275517, "ref_ce_loss": 0.12058866024017334, "step": 2190 }, { "epoch": 0.7304869913275517, "loss": 0.6579670310020447, "step": 2190 }, { "ce_loss": 0.25194308161735535, "epoch": 0.7304869913275517, "step": 2190 }, { "distill_loss": 0.20696522295475006, "epoch": 0.7304869913275517, "step": 2190 }, { "epoch": 0.7304869913275517, "ref_ce_loss": 0.11303284764289856, "step": 2190 }, { "epoch": 0.7304869913275517, "loss": 0.7196918725967407, "step": 2190 }, { "ce_loss": 0.27340036630630493, "epoch": 0.7304869913275517, "step": 2190 }, { "distill_loss": 0.22912757098674774, "epoch": 0.7304869913275517, "step": 2190 }, { "epoch": 0.7304869913275517, "ref_ce_loss": 0.12707854807376862, "step": 2190 }, { "epoch": 0.7304869913275517, "loss": 0.6519283056259155, "step": 2190 }, { "ce_loss": 0.19143091142177582, "epoch": 0.7304869913275517, "step": 2190 }, { "distill_loss": 0.20538610219955444, "epoch": 0.7304869913275517, "step": 2190 }, { "epoch": 0.7304869913275517, "ref_ce_loss": 0.2145569622516632, "step": 2190 }, { "epoch": 0.733822548365577, "loss": 0.813, "step": 2200 }, { "epoch": 0.733822548365577, "grad_norm": 2.7598912715911865, "step": 2200 }, { "epoch": 0.733822548365577, "learning_rate": 0.0002970141392584847, "step": 2200 }, { "epoch": 0.733822548365577, "loss": 1.048766016960144, "step": 2200 }, { "ce_loss": 0.18640631437301636, "epoch": 0.733822548365577, "step": 2200 }, { "distill_loss": 0.37723419070243835, "epoch": 0.733822548365577, "step": 2200 }, { "epoch": 0.733822548365577, "ref_ce_loss": 0.16142550110816956, "step": 2200 }, { "epoch": 0.733822548365577, "loss": 1.8987281322479248, "step": 2200 }, { "ce_loss": 0.4953133761882782, "epoch": 0.733822548365577, "step": 2200 }, { "distill_loss": 0.496428906917572, "epoch": 0.733822548365577, "step": 2200 }, { "epoch": 0.733822548365577, "ref_ce_loss": 0.294897198677063, "step": 2200 }, { "epoch": 0.733822548365577, "loss": 1.2234065532684326, "step": 2200 }, { "ce_loss": 0.44955363869667053, "epoch": 0.733822548365577, "step": 2200 }, { "distill_loss": 0.4747593402862549, "epoch": 0.733822548365577, "step": 2200 }, { "epoch": 0.733822548365577, "ref_ce_loss": 0.2343982309103012, "step": 2200 }, { "epoch": 0.733822548365577, "loss": 1.452303171157837, "step": 2200 }, { "ce_loss": 0.26678091287612915, "epoch": 0.733822548365577, "step": 2200 }, { "distill_loss": 0.4596899449825287, "epoch": 0.733822548365577, "step": 2200 }, { "epoch": 0.733822548365577, "ref_ce_loss": 0.16570772230625153, "step": 2200 }, { "epoch": 0.7371581054036024, "loss": 1.0477, "step": 2210 }, { "epoch": 0.7371581054036024, "grad_norm": 3.025998830795288, "step": 2210 }, { "epoch": 0.7371581054036024, "learning_rate": 0.00029697379015364343, "step": 2210 }, { "epoch": 0.7371581054036024, "loss": 0.9370204210281372, "step": 2210 }, { "ce_loss": 0.30269691348075867, "epoch": 0.7371581054036024, "step": 2210 }, { "distill_loss": 0.22978685796260834, "epoch": 0.7371581054036024, "step": 2210 }, { "epoch": 0.7371581054036024, "ref_ce_loss": 0.2049352377653122, "step": 2210 }, { "epoch": 0.7371581054036024, "loss": 0.7113454341888428, "step": 2210 }, { "ce_loss": 0.1772550493478775, "epoch": 0.7371581054036024, "step": 2210 }, { "distill_loss": 0.2620844841003418, "epoch": 0.7371581054036024, "step": 2210 }, { "epoch": 0.7371581054036024, "ref_ce_loss": 0.09602729231119156, "step": 2210 }, { "epoch": 0.7371581054036024, "loss": 1.1970373392105103, "step": 2210 }, { "ce_loss": 0.3331030607223511, "epoch": 0.7371581054036024, "step": 2210 }, { "distill_loss": 0.2646982967853546, "epoch": 0.7371581054036024, "step": 2210 }, { "epoch": 0.7371581054036024, "ref_ce_loss": 0.10817735642194748, "step": 2210 }, { "epoch": 0.7371581054036024, "loss": 0.6757293343544006, "step": 2210 }, { "ce_loss": 0.2500683665275574, "epoch": 0.7371581054036024, "step": 2210 }, { "distill_loss": 0.2608494460582733, "epoch": 0.7371581054036024, "step": 2210 }, { "epoch": 0.7371581054036024, "ref_ce_loss": 0.16467423737049103, "step": 2210 }, { "epoch": 0.7404936624416277, "loss": 0.7845, "step": 2220 }, { "epoch": 0.7404936624416277, "grad_norm": 2.9061050415039062, "step": 2220 }, { "epoch": 0.7404936624416277, "learning_rate": 0.00029693317302658534, "step": 2220 }, { "epoch": 0.7404936624416277, "loss": 0.7362913489341736, "step": 2220 }, { "ce_loss": 0.24955718219280243, "epoch": 0.7404936624416277, "step": 2220 }, { "distill_loss": 0.1837398260831833, "epoch": 0.7404936624416277, "step": 2220 }, { "epoch": 0.7404936624416277, "ref_ce_loss": 0.1013365387916565, "step": 2220 }, { "epoch": 0.7404936624416277, "loss": 0.8126053810119629, "step": 2220 }, { "ce_loss": 0.31580850481987, "epoch": 0.7404936624416277, "step": 2220 }, { "distill_loss": 0.20748618245124817, "epoch": 0.7404936624416277, "step": 2220 }, { "epoch": 0.7404936624416277, "ref_ce_loss": 0.1511477679014206, "step": 2220 }, { "epoch": 0.7404936624416277, "loss": 0.5393919944763184, "step": 2220 }, { "ce_loss": 0.18664191663265228, "epoch": 0.7404936624416277, "step": 2220 }, { "distill_loss": 0.1758815348148346, "epoch": 0.7404936624416277, "step": 2220 }, { "epoch": 0.7404936624416277, "ref_ce_loss": 0.12390191853046417, "step": 2220 }, { "epoch": 0.7404936624416277, "loss": 0.7449082732200623, "step": 2220 }, { "ce_loss": 0.2927023470401764, "epoch": 0.7404936624416277, "step": 2220 }, { "distill_loss": 0.2008521407842636, "epoch": 0.7404936624416277, "step": 2220 }, { "epoch": 0.7404936624416277, "ref_ce_loss": 0.1507752686738968, "step": 2220 }, { "epoch": 0.7438292194796531, "loss": 0.8107, "step": 2230 }, { "epoch": 0.7438292194796531, "grad_norm": 2.3651397228240967, "step": 2230 }, { "epoch": 0.7438292194796531, "learning_rate": 0.00029689228795138, "step": 2230 }, { "epoch": 0.7438292194796531, "loss": 0.6003869771957397, "step": 2230 }, { "ce_loss": 0.22887085378170013, "epoch": 0.7438292194796531, "step": 2230 }, { "distill_loss": 0.17210716009140015, "epoch": 0.7438292194796531, "step": 2230 }, { "epoch": 0.7438292194796531, "ref_ce_loss": 0.14253908395767212, "step": 2230 }, { "epoch": 0.7438292194796531, "loss": 0.662057101726532, "step": 2230 }, { "ce_loss": 0.19098782539367676, "epoch": 0.7438292194796531, "step": 2230 }, { "distill_loss": 0.19936403632164001, "epoch": 0.7438292194796531, "step": 2230 }, { "epoch": 0.7438292194796531, "ref_ce_loss": 0.08553535491228104, "step": 2230 }, { "epoch": 0.7438292194796531, "loss": 0.5438237190246582, "step": 2230 }, { "ce_loss": 0.1906469613313675, "epoch": 0.7438292194796531, "step": 2230 }, { "distill_loss": 0.1763281524181366, "epoch": 0.7438292194796531, "step": 2230 }, { "epoch": 0.7438292194796531, "ref_ce_loss": 0.12854325771331787, "step": 2230 }, { "epoch": 0.7438292194796531, "loss": 0.7014032006263733, "step": 2230 }, { "ce_loss": 0.2834837734699249, "epoch": 0.7438292194796531, "step": 2230 }, { "distill_loss": 0.21160462498664856, "epoch": 0.7438292194796531, "step": 2230 }, { "epoch": 0.7438292194796531, "ref_ce_loss": 0.11291535943746567, "step": 2230 }, { "epoch": 0.7471647765176784, "loss": 0.8327, "step": 2240 }, { "epoch": 0.7471647765176784, "grad_norm": 4.035131454467773, "step": 2240 }, { "epoch": 0.7471647765176784, "learning_rate": 0.0002968511350025858, "step": 2240 }, { "epoch": 0.7471647765176784, "loss": 0.791498064994812, "step": 2240 }, { "ce_loss": 0.24079382419586182, "epoch": 0.7471647765176784, "step": 2240 }, { "distill_loss": 0.31047412753105164, "epoch": 0.7471647765176784, "step": 2240 }, { "epoch": 0.7471647765176784, "ref_ce_loss": 0.15600360929965973, "step": 2240 }, { "epoch": 0.7471647765176784, "loss": 0.8515423536300659, "step": 2240 }, { "ce_loss": 0.24841324985027313, "epoch": 0.7471647765176784, "step": 2240 }, { "distill_loss": 0.28120777010917664, "epoch": 0.7471647765176784, "step": 2240 }, { "epoch": 0.7471647765176784, "ref_ce_loss": 0.25041747093200684, "step": 2240 }, { "epoch": 0.7471647765176784, "loss": 0.8282434940338135, "step": 2240 }, { "ce_loss": 0.32026177644729614, "epoch": 0.7471647765176784, "step": 2240 }, { "distill_loss": 0.3410600423812866, "epoch": 0.7471647765176784, "step": 2240 }, { "epoch": 0.7471647765176784, "ref_ce_loss": 0.12167643010616302, "step": 2240 }, { "epoch": 0.7471647765176784, "loss": 0.8194974064826965, "step": 2240 }, { "ce_loss": 0.2856970429420471, "epoch": 0.7471647765176784, "step": 2240 }, { "distill_loss": 0.3313988149166107, "epoch": 0.7471647765176784, "step": 2240 }, { "epoch": 0.7471647765176784, "ref_ce_loss": 0.20230844616889954, "step": 2240 }, { "epoch": 0.7505003335557038, "loss": 0.8899, "step": 2250 }, { "epoch": 0.7505003335557038, "grad_norm": 2.862393617630005, "step": 2250 }, { "epoch": 0.7505003335557038, "learning_rate": 0.0002968097142552494, "step": 2250 }, { "epoch": 0.7505003335557038, "loss": 0.7680596113204956, "step": 2250 }, { "ce_loss": 0.29027098417282104, "epoch": 0.7505003335557038, "step": 2250 }, { "distill_loss": 0.15521329641342163, "epoch": 0.7505003335557038, "step": 2250 }, { "epoch": 0.7505003335557038, "ref_ce_loss": 0.23092901706695557, "step": 2250 }, { "epoch": 0.7505003335557038, "loss": 0.5187767148017883, "step": 2250 }, { "ce_loss": 0.20998996496200562, "epoch": 0.7505003335557038, "step": 2250 }, { "distill_loss": 0.1527378261089325, "epoch": 0.7505003335557038, "step": 2250 }, { "epoch": 0.7505003335557038, "ref_ce_loss": 0.1060512512922287, "step": 2250 }, { "epoch": 0.7505003335557038, "loss": 1.1777644157409668, "step": 2250 }, { "ce_loss": 0.25235748291015625, "epoch": 0.7505003335557038, "step": 2250 }, { "distill_loss": 0.17757558822631836, "epoch": 0.7505003335557038, "step": 2250 }, { "epoch": 0.7505003335557038, "ref_ce_loss": 0.21147017180919647, "step": 2250 }, { "epoch": 0.7505003335557038, "loss": 0.701378345489502, "step": 2250 }, { "ce_loss": 0.32881441712379456, "epoch": 0.7505003335557038, "step": 2250 }, { "distill_loss": 0.1808972954750061, "epoch": 0.7505003335557038, "step": 2250 }, { "epoch": 0.7505003335557038, "ref_ce_loss": 0.12013345956802368, "step": 2250 }, { "epoch": 0.7538358905937291, "loss": 0.8455, "step": 2260 }, { "epoch": 0.7538358905937291, "grad_norm": 1.874451994895935, "step": 2260 }, { "epoch": 0.7538358905937291, "learning_rate": 0.0002967680257849059, "step": 2260 }, { "epoch": 0.7538358905937291, "loss": 0.6945121884346008, "step": 2260 }, { "ce_loss": 0.33384138345718384, "epoch": 0.7538358905937291, "step": 2260 }, { "distill_loss": 0.17329035699367523, "epoch": 0.7538358905937291, "step": 2260 }, { "epoch": 0.7538358905937291, "ref_ce_loss": 0.18721669912338257, "step": 2260 }, { "epoch": 0.7538358905937291, "loss": 0.754318118095398, "step": 2260 }, { "ce_loss": 0.2305452823638916, "epoch": 0.7538358905937291, "step": 2260 }, { "distill_loss": 0.14365842938423157, "epoch": 0.7538358905937291, "step": 2260 }, { "epoch": 0.7538358905937291, "ref_ce_loss": 0.12453807890415192, "step": 2260 }, { "epoch": 0.7538358905937291, "loss": 1.0577518939971924, "step": 2260 }, { "ce_loss": 0.22482116520404816, "epoch": 0.7538358905937291, "step": 2260 }, { "distill_loss": 0.15167245268821716, "epoch": 0.7538358905937291, "step": 2260 }, { "epoch": 0.7538358905937291, "ref_ce_loss": 0.1493624448776245, "step": 2260 }, { "epoch": 0.7538358905937291, "loss": 1.0203830003738403, "step": 2260 }, { "ce_loss": 0.28023025393486023, "epoch": 0.7538358905937291, "step": 2260 }, { "distill_loss": 0.17490828037261963, "epoch": 0.7538358905937291, "step": 2260 }, { "epoch": 0.7538358905937291, "ref_ce_loss": 0.21855275332927704, "step": 2260 }, { "epoch": 0.7571714476317545, "loss": 0.7753, "step": 2270 }, { "epoch": 0.7571714476317545, "grad_norm": 3.631338357925415, "step": 2270 }, { "epoch": 0.7571714476317545, "learning_rate": 0.00029672606966757854, "step": 2270 }, { "epoch": 0.7571714476317545, "loss": 0.7592071890830994, "step": 2270 }, { "ce_loss": 0.3136902153491974, "epoch": 0.7571714476317545, "step": 2270 }, { "distill_loss": 0.14364375174045563, "epoch": 0.7571714476317545, "step": 2270 }, { "epoch": 0.7571714476317545, "ref_ce_loss": 0.22031280398368835, "step": 2270 }, { "epoch": 0.7571714476317545, "loss": 1.3026163578033447, "step": 2270 }, { "ce_loss": 0.3515109717845917, "epoch": 0.7571714476317545, "step": 2270 }, { "distill_loss": 0.14750359952449799, "epoch": 0.7571714476317545, "step": 2270 }, { "epoch": 0.7571714476317545, "ref_ce_loss": 0.2071472853422165, "step": 2270 }, { "epoch": 0.7571714476317545, "loss": 1.009307861328125, "step": 2270 }, { "ce_loss": 0.27528730034828186, "epoch": 0.7571714476317545, "step": 2270 }, { "distill_loss": 0.15073201060295105, "epoch": 0.7571714476317545, "step": 2270 }, { "epoch": 0.7571714476317545, "ref_ce_loss": 0.17023694515228271, "step": 2270 }, { "epoch": 0.7571714476317545, "loss": 1.061471939086914, "step": 2270 }, { "ce_loss": 0.3027641773223877, "epoch": 0.7571714476317545, "step": 2270 }, { "distill_loss": 0.14477810263633728, "epoch": 0.7571714476317545, "step": 2270 }, { "epoch": 0.7571714476317545, "ref_ce_loss": 0.1614489108324051, "step": 2270 }, { "epoch": 0.7605070046697798, "loss": 0.7498, "step": 2280 }, { "epoch": 0.7605070046697798, "grad_norm": 2.607236862182617, "step": 2280 }, { "epoch": 0.7605070046697798, "learning_rate": 0.0002966838459797789, "step": 2280 }, { "epoch": 0.7605070046697798, "loss": 0.7864717245101929, "step": 2280 }, { "ce_loss": 0.27395790815353394, "epoch": 0.7605070046697798, "step": 2280 }, { "distill_loss": 0.19188761711120605, "epoch": 0.7605070046697798, "step": 2280 }, { "epoch": 0.7605070046697798, "ref_ce_loss": 0.16352500021457672, "step": 2280 }, { "epoch": 0.7605070046697798, "loss": 0.7618268728256226, "step": 2280 }, { "ce_loss": 0.3203011155128479, "epoch": 0.7605070046697798, "step": 2280 }, { "distill_loss": 0.2293919324874878, "epoch": 0.7605070046697798, "step": 2280 }, { "epoch": 0.7605070046697798, "ref_ce_loss": 0.1524127572774887, "step": 2280 }, { "epoch": 0.7605070046697798, "loss": 0.6663198471069336, "step": 2280 }, { "ce_loss": 0.282576322555542, "epoch": 0.7605070046697798, "step": 2280 }, { "distill_loss": 0.24836762249469757, "epoch": 0.7605070046697798, "step": 2280 }, { "epoch": 0.7605070046697798, "ref_ce_loss": 0.1351906657218933, "step": 2280 }, { "epoch": 0.7605070046697798, "loss": 0.6264198422431946, "step": 2280 }, { "ce_loss": 0.23629958927631378, "epoch": 0.7605070046697798, "step": 2280 }, { "distill_loss": 0.18736609816551208, "epoch": 0.7605070046697798, "step": 2280 }, { "epoch": 0.7605070046697798, "ref_ce_loss": 0.20251253247261047, "step": 2280 }, { "epoch": 0.7638425617078052, "loss": 0.7769, "step": 2290 }, { "epoch": 0.7638425617078052, "grad_norm": 3.2813711166381836, "step": 2290 }, { "epoch": 0.7638425617078052, "learning_rate": 0.0002966413547985062, "step": 2290 }, { "epoch": 0.7638425617078052, "loss": 1.0629838705062866, "step": 2290 }, { "ce_loss": 0.34151533246040344, "epoch": 0.7638425617078052, "step": 2290 }, { "distill_loss": 0.21097716689109802, "epoch": 0.7638425617078052, "step": 2290 }, { "epoch": 0.7638425617078052, "ref_ce_loss": 0.17457735538482666, "step": 2290 }, { "epoch": 0.7638425617078052, "loss": 0.6704175472259521, "step": 2290 }, { "ce_loss": 0.2535441517829895, "epoch": 0.7638425617078052, "step": 2290 }, { "distill_loss": 0.1876956969499588, "epoch": 0.7638425617078052, "step": 2290 }, { "epoch": 0.7638425617078052, "ref_ce_loss": 0.15311689674854279, "step": 2290 }, { "epoch": 0.7638425617078052, "loss": 0.6135028600692749, "step": 2290 }, { "ce_loss": 0.19847552478313446, "epoch": 0.7638425617078052, "step": 2290 }, { "distill_loss": 0.18787772953510284, "epoch": 0.7638425617078052, "step": 2290 }, { "epoch": 0.7638425617078052, "ref_ce_loss": 0.14677968621253967, "step": 2290 }, { "epoch": 0.7638425617078052, "loss": 1.4862701892852783, "step": 2290 }, { "ce_loss": 0.26155686378479004, "epoch": 0.7638425617078052, "step": 2290 }, { "distill_loss": 0.2060234695672989, "epoch": 0.7638425617078052, "step": 2290 }, { "epoch": 0.7638425617078052, "ref_ce_loss": 0.14456400275230408, "step": 2290 }, { "epoch": 0.7671781187458305, "loss": 0.7509, "step": 2300 }, { "epoch": 0.7671781187458305, "grad_norm": 2.5237789154052734, "step": 2300 }, { "epoch": 0.7671781187458305, "learning_rate": 0.0002965985962012477, "step": 2300 }, { "epoch": 0.7671781187458305, "loss": 0.5957300066947937, "step": 2300 }, { "ce_loss": 0.26703551411628723, "epoch": 0.7671781187458305, "step": 2300 }, { "distill_loss": 0.1350327730178833, "epoch": 0.7671781187458305, "step": 2300 }, { "epoch": 0.7671781187458305, "ref_ce_loss": 0.1326143741607666, "step": 2300 }, { "epoch": 0.7671781187458305, "loss": 0.7044022083282471, "step": 2300 }, { "ce_loss": 0.17931139469146729, "epoch": 0.7671781187458305, "step": 2300 }, { "distill_loss": 0.14432227611541748, "epoch": 0.7671781187458305, "step": 2300 }, { "epoch": 0.7671781187458305, "ref_ce_loss": 0.16795673966407776, "step": 2300 }, { "epoch": 0.7671781187458305, "loss": 0.8188800811767578, "step": 2300 }, { "ce_loss": 0.29290133714675903, "epoch": 0.7671781187458305, "step": 2300 }, { "distill_loss": 0.1588171422481537, "epoch": 0.7671781187458305, "step": 2300 }, { "epoch": 0.7671781187458305, "ref_ce_loss": 0.1536998599767685, "step": 2300 }, { "epoch": 0.7671781187458305, "loss": 0.708327054977417, "step": 2300 }, { "ce_loss": 0.32811471819877625, "epoch": 0.7671781187458305, "step": 2300 }, { "distill_loss": 0.17735375463962555, "epoch": 0.7671781187458305, "step": 2300 }, { "epoch": 0.7671781187458305, "ref_ce_loss": 0.1569249927997589, "step": 2300 }, { "epoch": 0.7705136757838559, "loss": 0.8038, "step": 2310 }, { "epoch": 0.7705136757838559, "grad_norm": 2.1413021087646484, "step": 2310 }, { "epoch": 0.7705136757838559, "learning_rate": 0.00029655557026597815, "step": 2310 }, { "epoch": 0.7705136757838559, "loss": 0.6469117999076843, "step": 2310 }, { "ce_loss": 0.22463861107826233, "epoch": 0.7705136757838559, "step": 2310 }, { "distill_loss": 0.13765175640583038, "epoch": 0.7705136757838559, "step": 2310 }, { "epoch": 0.7705136757838559, "ref_ce_loss": 0.18833401799201965, "step": 2310 }, { "epoch": 0.7705136757838559, "loss": 0.8655230402946472, "step": 2310 }, { "ce_loss": 0.3175293803215027, "epoch": 0.7705136757838559, "step": 2310 }, { "distill_loss": 0.1456879824399948, "epoch": 0.7705136757838559, "step": 2310 }, { "epoch": 0.7705136757838559, "ref_ce_loss": 0.1650840938091278, "step": 2310 }, { "epoch": 0.7705136757838559, "loss": 0.7164068818092346, "step": 2310 }, { "ce_loss": 0.2648800313472748, "epoch": 0.7705136757838559, "step": 2310 }, { "distill_loss": 0.17314201593399048, "epoch": 0.7705136757838559, "step": 2310 }, { "epoch": 0.7705136757838559, "ref_ce_loss": 0.1560521125793457, "step": 2310 }, { "epoch": 0.7705136757838559, "loss": 0.4807608723640442, "step": 2310 }, { "ce_loss": 0.1812056303024292, "epoch": 0.7705136757838559, "step": 2310 }, { "distill_loss": 0.12476127594709396, "epoch": 0.7705136757838559, "step": 2310 }, { "epoch": 0.7705136757838559, "ref_ce_loss": 0.12248016893863678, "step": 2310 }, { "epoch": 0.7738492328218812, "loss": 0.7234, "step": 2320 }, { "epoch": 0.7738492328218812, "grad_norm": 1.9185909032821655, "step": 2320 }, { "epoch": 0.7738492328218812, "learning_rate": 0.0002965122770711599, "step": 2320 }, { "epoch": 0.7738492328218812, "loss": 0.4447229206562042, "step": 2320 }, { "ce_loss": 0.17661221325397491, "epoch": 0.7738492328218812, "step": 2320 }, { "distill_loss": 0.15855096280574799, "epoch": 0.7738492328218812, "step": 2320 }, { "epoch": 0.7738492328218812, "ref_ce_loss": 0.10898029804229736, "step": 2320 }, { "epoch": 0.7738492328218812, "loss": 1.2085399627685547, "step": 2320 }, { "ce_loss": 0.2651851773262024, "epoch": 0.7738492328218812, "step": 2320 }, { "distill_loss": 0.13599959015846252, "epoch": 0.7738492328218812, "step": 2320 }, { "epoch": 0.7738492328218812, "ref_ce_loss": 0.131027951836586, "step": 2320 }, { "epoch": 0.7738492328218812, "loss": 0.9505325555801392, "step": 2320 }, { "ce_loss": 0.27964797616004944, "epoch": 0.7738492328218812, "step": 2320 }, { "distill_loss": 0.1719699501991272, "epoch": 0.7738492328218812, "step": 2320 }, { "epoch": 0.7738492328218812, "ref_ce_loss": 0.12308228015899658, "step": 2320 }, { "epoch": 0.7738492328218812, "loss": 0.9700384140014648, "step": 2320 }, { "ce_loss": 0.5339371562004089, "epoch": 0.7738492328218812, "step": 2320 }, { "distill_loss": 0.22477610409259796, "epoch": 0.7738492328218812, "step": 2320 }, { "epoch": 0.7738492328218812, "ref_ce_loss": 0.15580026805400848, "step": 2320 }, { "epoch": 0.7771847898599066, "loss": 0.7795, "step": 2330 }, { "epoch": 0.7771847898599066, "grad_norm": 2.1152358055114746, "step": 2330 }, { "epoch": 0.7771847898599066, "learning_rate": 0.00029646871669574256, "step": 2330 }, { "epoch": 0.7771847898599066, "loss": 0.6992760896682739, "step": 2330 }, { "ce_loss": 0.2329271286725998, "epoch": 0.7771847898599066, "step": 2330 }, { "distill_loss": 0.11824934929609299, "epoch": 0.7771847898599066, "step": 2330 }, { "epoch": 0.7771847898599066, "ref_ce_loss": 0.16220200061798096, "step": 2330 }, { "epoch": 0.7771847898599066, "loss": 0.6952053904533386, "step": 2330 }, { "ce_loss": 0.3089507818222046, "epoch": 0.7771847898599066, "step": 2330 }, { "distill_loss": 0.13048169016838074, "epoch": 0.7771847898599066, "step": 2330 }, { "epoch": 0.7771847898599066, "ref_ce_loss": 0.1320132315158844, "step": 2330 }, { "epoch": 0.7771847898599066, "loss": 0.9603186845779419, "step": 2330 }, { "ce_loss": 0.3964546322822571, "epoch": 0.7771847898599066, "step": 2330 }, { "distill_loss": 0.13962863385677338, "epoch": 0.7771847898599066, "step": 2330 }, { "epoch": 0.7771847898599066, "ref_ce_loss": 0.20740242302417755, "step": 2330 }, { "epoch": 0.7771847898599066, "loss": 1.281550407409668, "step": 2330 }, { "ce_loss": 0.24422332644462585, "epoch": 0.7771847898599066, "step": 2330 }, { "distill_loss": 0.10252149403095245, "epoch": 0.7771847898599066, "step": 2330 }, { "epoch": 0.7771847898599066, "ref_ce_loss": 0.15284155309200287, "step": 2330 }, { "epoch": 0.7805203468979319, "loss": 0.7734, "step": 2340 }, { "epoch": 0.7805203468979319, "grad_norm": 2.945228099822998, "step": 2340 }, { "epoch": 0.7805203468979319, "learning_rate": 0.00029642488921916325, "step": 2340 }, { "epoch": 0.7805203468979319, "loss": 0.5152295827865601, "step": 2340 }, { "ce_loss": 0.21698597073554993, "epoch": 0.7805203468979319, "step": 2340 }, { "distill_loss": 0.15574845671653748, "epoch": 0.7805203468979319, "step": 2340 }, { "epoch": 0.7805203468979319, "ref_ce_loss": 0.14177797734737396, "step": 2340 }, { "epoch": 0.7805203468979319, "loss": 0.6439272165298462, "step": 2340 }, { "ce_loss": 0.23492102324962616, "epoch": 0.7805203468979319, "step": 2340 }, { "distill_loss": 0.1553235501050949, "epoch": 0.7805203468979319, "step": 2340 }, { "epoch": 0.7805203468979319, "ref_ce_loss": 0.11704672873020172, "step": 2340 }, { "epoch": 0.7805203468979319, "loss": 0.6790904998779297, "step": 2340 }, { "ce_loss": 0.298139363527298, "epoch": 0.7805203468979319, "step": 2340 }, { "distill_loss": 0.1888904869556427, "epoch": 0.7805203468979319, "step": 2340 }, { "epoch": 0.7805203468979319, "ref_ce_loss": 0.19122987985610962, "step": 2340 }, { "epoch": 0.7805203468979319, "loss": 0.5988003015518188, "step": 2340 }, { "ce_loss": 0.1609693169593811, "epoch": 0.7805203468979319, "step": 2340 }, { "distill_loss": 0.15114513039588928, "epoch": 0.7805203468979319, "step": 2340 }, { "epoch": 0.7805203468979319, "ref_ce_loss": 0.18135370314121246, "step": 2340 }, { "epoch": 0.7838559039359573, "loss": 0.7991, "step": 2350 }, { "epoch": 0.7838559039359573, "grad_norm": 3.823965311050415, "step": 2350 }, { "epoch": 0.7838559039359573, "learning_rate": 0.0002963807947213458, "step": 2350 }, { "epoch": 0.7838559039359573, "loss": 0.8651024103164673, "step": 2350 }, { "ce_loss": 0.22501082718372345, "epoch": 0.7838559039359573, "step": 2350 }, { "distill_loss": 0.1862696409225464, "epoch": 0.7838559039359573, "step": 2350 }, { "epoch": 0.7838559039359573, "ref_ce_loss": 0.10980051010847092, "step": 2350 }, { "epoch": 0.7838559039359573, "loss": 0.550654947757721, "step": 2350 }, { "ce_loss": 0.2535894811153412, "epoch": 0.7838559039359573, "step": 2350 }, { "distill_loss": 0.1499275267124176, "epoch": 0.7838559039359573, "step": 2350 }, { "epoch": 0.7838559039359573, "ref_ce_loss": 0.11346378922462463, "step": 2350 }, { "epoch": 0.7838559039359573, "loss": 0.6153100728988647, "step": 2350 }, { "ce_loss": 0.2186146378517151, "epoch": 0.7838559039359573, "step": 2350 }, { "distill_loss": 0.14961737394332886, "epoch": 0.7838559039359573, "step": 2350 }, { "epoch": 0.7838559039359573, "ref_ce_loss": 0.1576690971851349, "step": 2350 }, { "epoch": 0.7838559039359573, "loss": 0.7601372599601746, "step": 2350 }, { "ce_loss": 0.2906789481639862, "epoch": 0.7838559039359573, "step": 2350 }, { "distill_loss": 0.16383947432041168, "epoch": 0.7838559039359573, "step": 2350 }, { "epoch": 0.7838559039359573, "ref_ce_loss": 0.20926496386528015, "step": 2350 }, { "epoch": 0.7871914609739826, "loss": 0.7316, "step": 2360 }, { "epoch": 0.7871914609739826, "grad_norm": 2.4781031608581543, "step": 2360 }, { "epoch": 0.7871914609739826, "learning_rate": 0.0002963364332827014, "step": 2360 }, { "epoch": 0.7871914609739826, "loss": 0.6347142457962036, "step": 2360 }, { "ce_loss": 0.20326924324035645, "epoch": 0.7871914609739826, "step": 2360 }, { "distill_loss": 0.14620020985603333, "epoch": 0.7871914609739826, "step": 2360 }, { "epoch": 0.7871914609739826, "ref_ce_loss": 0.14227043092250824, "step": 2360 }, { "epoch": 0.7871914609739826, "loss": 1.0564327239990234, "step": 2360 }, { "ce_loss": 0.3296639025211334, "epoch": 0.7871914609739826, "step": 2360 }, { "distill_loss": 0.20934753119945526, "epoch": 0.7871914609739826, "step": 2360 }, { "epoch": 0.7871914609739826, "ref_ce_loss": 0.1660376489162445, "step": 2360 }, { "epoch": 0.7871914609739826, "loss": 0.7564572691917419, "step": 2360 }, { "ce_loss": 0.23574452102184296, "epoch": 0.7871914609739826, "step": 2360 }, { "distill_loss": 0.16113722324371338, "epoch": 0.7871914609739826, "step": 2360 }, { "epoch": 0.7871914609739826, "ref_ce_loss": 0.18381242454051971, "step": 2360 }, { "epoch": 0.7871914609739826, "loss": 1.2118096351623535, "step": 2360 }, { "ce_loss": 0.27346286177635193, "epoch": 0.7871914609739826, "step": 2360 }, { "distill_loss": 0.222482368350029, "epoch": 0.7871914609739826, "step": 2360 }, { "epoch": 0.7871914609739826, "ref_ce_loss": 0.17169533669948578, "step": 2360 }, { "epoch": 0.790527018012008, "loss": 0.7116, "step": 2370 }, { "epoch": 0.790527018012008, "grad_norm": 2.3116695880889893, "step": 2370 }, { "epoch": 0.790527018012008, "learning_rate": 0.00029629180498412765, "step": 2370 }, { "epoch": 0.790527018012008, "loss": 0.7551827430725098, "step": 2370 }, { "ce_loss": 0.21025662124156952, "epoch": 0.790527018012008, "step": 2370 }, { "distill_loss": 0.10454167425632477, "epoch": 0.790527018012008, "step": 2370 }, { "epoch": 0.790527018012008, "ref_ce_loss": 0.1653343290090561, "step": 2370 }, { "epoch": 0.790527018012008, "loss": 0.3096018135547638, "step": 2370 }, { "ce_loss": 0.13652345538139343, "epoch": 0.790527018012008, "step": 2370 }, { "distill_loss": 0.09901779890060425, "epoch": 0.790527018012008, "step": 2370 }, { "epoch": 0.790527018012008, "ref_ce_loss": 0.07386656105518341, "step": 2370 }, { "epoch": 0.790527018012008, "loss": 0.5769529938697815, "step": 2370 }, { "ce_loss": 0.2886972427368164, "epoch": 0.790527018012008, "step": 2370 }, { "distill_loss": 0.11762228608131409, "epoch": 0.790527018012008, "step": 2370 }, { "epoch": 0.790527018012008, "ref_ce_loss": 0.1701674461364746, "step": 2370 }, { "epoch": 0.790527018012008, "loss": 0.8072974681854248, "step": 2370 }, { "ce_loss": 0.3220660090446472, "epoch": 0.790527018012008, "step": 2370 }, { "distill_loss": 0.12519507110118866, "epoch": 0.790527018012008, "step": 2370 }, { "epoch": 0.790527018012008, "ref_ce_loss": 0.20523789525032043, "step": 2370 }, { "epoch": 0.7938625750500333, "loss": 0.7613, "step": 2380 }, { "epoch": 0.7938625750500333, "grad_norm": 4.6878509521484375, "step": 2380 }, { "epoch": 0.7938625750500333, "learning_rate": 0.00029624690990700907, "step": 2380 }, { "epoch": 0.7938625750500333, "loss": 0.6524088382720947, "step": 2380 }, { "ce_loss": 0.21513007581233978, "epoch": 0.7938625750500333, "step": 2380 }, { "distill_loss": 0.24648374319076538, "epoch": 0.7938625750500333, "step": 2380 }, { "epoch": 0.7938625750500333, "ref_ce_loss": 0.13517114520072937, "step": 2380 }, { "epoch": 0.7938625750500333, "loss": 1.555312156677246, "step": 2380 }, { "ce_loss": 0.3172930181026459, "epoch": 0.7938625750500333, "step": 2380 }, { "distill_loss": 0.18661177158355713, "epoch": 0.7938625750500333, "step": 2380 }, { "epoch": 0.7938625750500333, "ref_ce_loss": 0.24270252883434296, "step": 2380 }, { "epoch": 0.7938625750500333, "loss": 0.7257373332977295, "step": 2380 }, { "ce_loss": 0.28231558203697205, "epoch": 0.7938625750500333, "step": 2380 }, { "distill_loss": 0.22690123319625854, "epoch": 0.7938625750500333, "step": 2380 }, { "epoch": 0.7938625750500333, "ref_ce_loss": 0.15161336958408356, "step": 2380 }, { "epoch": 0.7938625750500333, "loss": 0.7957109212875366, "step": 2380 }, { "ce_loss": 0.23962001502513885, "epoch": 0.7938625750500333, "step": 2380 }, { "distill_loss": 0.22382575273513794, "epoch": 0.7938625750500333, "step": 2380 }, { "epoch": 0.7938625750500333, "ref_ce_loss": 0.21372731029987335, "step": 2380 }, { "epoch": 0.7971981320880587, "loss": 0.8852, "step": 2390 }, { "epoch": 0.7971981320880587, "grad_norm": 2.4466781616210938, "step": 2390 }, { "epoch": 0.7971981320880587, "learning_rate": 0.00029620174813321646, "step": 2390 }, { "epoch": 0.7971981320880587, "loss": 0.736382782459259, "step": 2390 }, { "ce_loss": 0.2084326297044754, "epoch": 0.7971981320880587, "step": 2390 }, { "distill_loss": 0.2203093320131302, "epoch": 0.7971981320880587, "step": 2390 }, { "epoch": 0.7971981320880587, "ref_ce_loss": 0.14905014634132385, "step": 2390 }, { "epoch": 0.7971981320880587, "loss": 0.6536170840263367, "step": 2390 }, { "ce_loss": 0.19936946034431458, "epoch": 0.7971981320880587, "step": 2390 }, { "distill_loss": 0.1701754927635193, "epoch": 0.7971981320880587, "step": 2390 }, { "epoch": 0.7971981320880587, "ref_ce_loss": 0.14872415363788605, "step": 2390 }, { "epoch": 0.7971981320880587, "loss": 0.7685835361480713, "step": 2390 }, { "ce_loss": 0.2817256450653076, "epoch": 0.7971981320880587, "step": 2390 }, { "distill_loss": 0.1766086220741272, "epoch": 0.7971981320880587, "step": 2390 }, { "epoch": 0.7971981320880587, "ref_ce_loss": 0.14955390989780426, "step": 2390 }, { "epoch": 0.7971981320880587, "loss": 1.0945006608963013, "step": 2390 }, { "ce_loss": 0.3128070533275604, "epoch": 0.7971981320880587, "step": 2390 }, { "distill_loss": 0.24318374693393707, "epoch": 0.7971981320880587, "step": 2390 }, { "epoch": 0.7971981320880587, "ref_ce_loss": 0.20368048548698425, "step": 2390 }, { "epoch": 0.800533689126084, "loss": 0.7957, "step": 2400 }, { "epoch": 0.800533689126084, "grad_norm": 3.198331832885742, "step": 2400 }, { "epoch": 0.800533689126084, "learning_rate": 0.0002961563197451072, "step": 2400 }, { "epoch": 0.800533689126084, "loss": 1.1640307903289795, "step": 2400 }, { "ce_loss": 0.2592827081680298, "epoch": 0.800533689126084, "step": 2400 }, { "distill_loss": 0.17209471762180328, "epoch": 0.800533689126084, "step": 2400 }, { "epoch": 0.800533689126084, "ref_ce_loss": 0.13521651923656464, "step": 2400 }, { "epoch": 0.800533689126084, "loss": 0.9015942811965942, "step": 2400 }, { "ce_loss": 0.22579120099544525, "epoch": 0.800533689126084, "step": 2400 }, { "distill_loss": 0.1696089655160904, "epoch": 0.800533689126084, "step": 2400 }, { "epoch": 0.800533689126084, "ref_ce_loss": 0.0945950597524643, "step": 2400 }, { "epoch": 0.800533689126084, "loss": 1.1169930696487427, "step": 2400 }, { "ce_loss": 0.24338200688362122, "epoch": 0.800533689126084, "step": 2400 }, { "distill_loss": 0.17723533511161804, "epoch": 0.800533689126084, "step": 2400 }, { "epoch": 0.800533689126084, "ref_ce_loss": 0.18651683628559113, "step": 2400 }, { "epoch": 0.800533689126084, "loss": 0.8603613376617432, "step": 2400 }, { "ce_loss": 0.3572869300842285, "epoch": 0.800533689126084, "step": 2400 }, { "distill_loss": 0.16341561079025269, "epoch": 0.800533689126084, "step": 2400 }, { "epoch": 0.800533689126084, "ref_ce_loss": 0.2452453076839447, "step": 2400 }, { "epoch": 0.8038692461641094, "loss": 0.8002, "step": 2410 }, { "epoch": 0.8038692461641094, "grad_norm": 2.273329973220825, "step": 2410 }, { "epoch": 0.8038692461641094, "learning_rate": 0.00029611062482552464, "step": 2410 }, { "epoch": 0.8038692461641094, "loss": 1.0640661716461182, "step": 2410 }, { "ce_loss": 0.30567464232444763, "epoch": 0.8038692461641094, "step": 2410 }, { "distill_loss": 0.20861530303955078, "epoch": 0.8038692461641094, "step": 2410 }, { "epoch": 0.8038692461641094, "ref_ce_loss": 0.22182579338550568, "step": 2410 }, { "epoch": 0.8038692461641094, "loss": 0.7910094261169434, "step": 2410 }, { "ce_loss": 0.2559601664543152, "epoch": 0.8038692461641094, "step": 2410 }, { "distill_loss": 0.19917668402194977, "epoch": 0.8038692461641094, "step": 2410 }, { "epoch": 0.8038692461641094, "ref_ce_loss": 0.1583707183599472, "step": 2410 }, { "epoch": 0.8038692461641094, "loss": 0.6116624474525452, "step": 2410 }, { "ce_loss": 0.21384522318840027, "epoch": 0.8038692461641094, "step": 2410 }, { "distill_loss": 0.17438215017318726, "epoch": 0.8038692461641094, "step": 2410 }, { "epoch": 0.8038692461641094, "ref_ce_loss": 0.12977702915668488, "step": 2410 }, { "epoch": 0.8038692461641094, "loss": 0.8569213151931763, "step": 2410 }, { "ce_loss": 0.40515777468681335, "epoch": 0.8038692461641094, "step": 2410 }, { "distill_loss": 0.2386610209941864, "epoch": 0.8038692461641094, "step": 2410 }, { "epoch": 0.8038692461641094, "ref_ce_loss": 0.16905608773231506, "step": 2410 }, { "epoch": 0.8072048032021347, "loss": 0.7839, "step": 2420 }, { "epoch": 0.8072048032021347, "grad_norm": 2.3425471782684326, "step": 2420 }, { "epoch": 0.8072048032021347, "learning_rate": 0.0002960646634577983, "step": 2420 }, { "epoch": 0.8072048032021347, "loss": 0.8587204813957214, "step": 2420 }, { "ce_loss": 0.2946504056453705, "epoch": 0.8072048032021347, "step": 2420 }, { "distill_loss": 0.26324498653411865, "epoch": 0.8072048032021347, "step": 2420 }, { "epoch": 0.8072048032021347, "ref_ce_loss": 0.17262223362922668, "step": 2420 }, { "epoch": 0.8072048032021347, "loss": 1.2969701290130615, "step": 2420 }, { "ce_loss": 0.22363796830177307, "epoch": 0.8072048032021347, "step": 2420 }, { "distill_loss": 0.19747430086135864, "epoch": 0.8072048032021347, "step": 2420 }, { "epoch": 0.8072048032021347, "ref_ce_loss": 0.13005143404006958, "step": 2420 }, { "epoch": 0.8072048032021347, "loss": 0.711165189743042, "step": 2420 }, { "ce_loss": 0.17365582287311554, "epoch": 0.8072048032021347, "step": 2420 }, { "distill_loss": 0.20176240801811218, "epoch": 0.8072048032021347, "step": 2420 }, { "epoch": 0.8072048032021347, "ref_ce_loss": 0.10909857600927353, "step": 2420 }, { "epoch": 0.8072048032021347, "loss": 0.7779433727264404, "step": 2420 }, { "ce_loss": 0.20582044124603271, "epoch": 0.8072048032021347, "step": 2420 }, { "distill_loss": 0.2517743706703186, "epoch": 0.8072048032021347, "step": 2420 }, { "epoch": 0.8072048032021347, "ref_ce_loss": 0.17426325380802155, "step": 2420 }, { "epoch": 0.8105403602401601, "loss": 0.8096, "step": 2430 }, { "epoch": 0.8105403602401601, "grad_norm": 1.9124221801757812, "step": 2430 }, { "epoch": 0.8105403602401601, "learning_rate": 0.00029601843572574373, "step": 2430 }, { "epoch": 0.8105403602401601, "loss": 0.6110777854919434, "step": 2430 }, { "ce_loss": 0.21673980355262756, "epoch": 0.8105403602401601, "step": 2430 }, { "distill_loss": 0.19520121812820435, "epoch": 0.8105403602401601, "step": 2430 }, { "epoch": 0.8105403602401601, "ref_ce_loss": 0.08564713597297668, "step": 2430 }, { "epoch": 0.8105403602401601, "loss": 0.7680901885032654, "step": 2430 }, { "ce_loss": 0.31637847423553467, "epoch": 0.8105403602401601, "step": 2430 }, { "distill_loss": 0.1756715029478073, "epoch": 0.8105403602401601, "step": 2430 }, { "epoch": 0.8105403602401601, "ref_ce_loss": 0.13785050809383392, "step": 2430 }, { "epoch": 0.8105403602401601, "loss": 0.7048861980438232, "step": 2430 }, { "ce_loss": 0.28358614444732666, "epoch": 0.8105403602401601, "step": 2430 }, { "distill_loss": 0.16370432078838348, "epoch": 0.8105403602401601, "step": 2430 }, { "epoch": 0.8105403602401601, "ref_ce_loss": 0.1661026030778885, "step": 2430 }, { "epoch": 0.8105403602401601, "loss": 0.6960597038269043, "step": 2430 }, { "ce_loss": 0.25172582268714905, "epoch": 0.8105403602401601, "step": 2430 }, { "distill_loss": 0.15557169914245605, "epoch": 0.8105403602401601, "step": 2430 }, { "epoch": 0.8105403602401601, "ref_ce_loss": 0.12519201636314392, "step": 2430 }, { "epoch": 0.8138759172781854, "loss": 0.8606, "step": 2440 }, { "epoch": 0.8138759172781854, "grad_norm": 2.105224609375, "step": 2440 }, { "epoch": 0.8138759172781854, "learning_rate": 0.0002959719417136619, "step": 2440 }, { "epoch": 0.8138759172781854, "loss": 0.6570507287979126, "step": 2440 }, { "ce_loss": 0.22971493005752563, "epoch": 0.8138759172781854, "step": 2440 }, { "distill_loss": 0.20155471563339233, "epoch": 0.8138759172781854, "step": 2440 }, { "epoch": 0.8138759172781854, "ref_ce_loss": 0.17011815309524536, "step": 2440 }, { "epoch": 0.8138759172781854, "loss": 0.8933101296424866, "step": 2440 }, { "ce_loss": 0.3815864324569702, "epoch": 0.8138759172781854, "step": 2440 }, { "distill_loss": 0.2659887373447418, "epoch": 0.8138759172781854, "step": 2440 }, { "epoch": 0.8138759172781854, "ref_ce_loss": 0.18426789343357086, "step": 2440 }, { "epoch": 0.8138759172781854, "loss": 0.7041268944740295, "step": 2440 }, { "ce_loss": 0.2709902822971344, "epoch": 0.8138759172781854, "step": 2440 }, { "distill_loss": 0.2296167016029358, "epoch": 0.8138759172781854, "step": 2440 }, { "epoch": 0.8138759172781854, "ref_ce_loss": 0.1552482694387436, "step": 2440 }, { "epoch": 0.8138759172781854, "loss": 0.75736004114151, "step": 2440 }, { "ce_loss": 0.2580389678478241, "epoch": 0.8138759172781854, "step": 2440 }, { "distill_loss": 0.21966888010501862, "epoch": 0.8138759172781854, "step": 2440 }, { "epoch": 0.8138759172781854, "ref_ce_loss": 0.18772462010383606, "step": 2440 }, { "epoch": 0.8172114743162108, "loss": 0.76, "step": 2450 }, { "epoch": 0.8172114743162108, "grad_norm": 2.26934814453125, "step": 2450 }, { "epoch": 0.8172114743162108, "learning_rate": 0.00029592518150633963, "step": 2450 }, { "epoch": 0.8172114743162108, "loss": 0.6550593972206116, "step": 2450 }, { "ce_loss": 0.2627783417701721, "epoch": 0.8172114743162108, "step": 2450 }, { "distill_loss": 0.17369389533996582, "epoch": 0.8172114743162108, "step": 2450 }, { "epoch": 0.8172114743162108, "ref_ce_loss": 0.16940702497959137, "step": 2450 }, { "epoch": 0.8172114743162108, "loss": 0.5833581686019897, "step": 2450 }, { "ce_loss": 0.2363595813512802, "epoch": 0.8172114743162108, "step": 2450 }, { "distill_loss": 0.14457328617572784, "epoch": 0.8172114743162108, "step": 2450 }, { "epoch": 0.8172114743162108, "ref_ce_loss": 0.13348720967769623, "step": 2450 }, { "epoch": 0.8172114743162108, "loss": 0.8066520690917969, "step": 2450 }, { "ce_loss": 0.2194216549396515, "epoch": 0.8172114743162108, "step": 2450 }, { "distill_loss": 0.16594922542572021, "epoch": 0.8172114743162108, "step": 2450 }, { "epoch": 0.8172114743162108, "ref_ce_loss": 0.20511534810066223, "step": 2450 }, { "epoch": 0.8172114743162108, "loss": 0.4526219964027405, "step": 2450 }, { "ce_loss": 0.1525576412677765, "epoch": 0.8172114743162108, "step": 2450 }, { "distill_loss": 0.1163424700498581, "epoch": 0.8172114743162108, "step": 2450 }, { "epoch": 0.8172114743162108, "ref_ce_loss": 0.12026916444301605, "step": 2450 }, { "epoch": 0.8205470313542361, "loss": 0.7301, "step": 2460 }, { "epoch": 0.8205470313542361, "grad_norm": 2.068974494934082, "step": 2460 }, { "epoch": 0.8205470313542361, "learning_rate": 0.00029587815518904907, "step": 2460 }, { "epoch": 0.8205470313542361, "loss": 0.7143868207931519, "step": 2460 }, { "ce_loss": 0.33941391110420227, "epoch": 0.8205470313542361, "step": 2460 }, { "distill_loss": 0.2006298154592514, "epoch": 0.8205470313542361, "step": 2460 }, { "epoch": 0.8205470313542361, "ref_ce_loss": 0.1742485612630844, "step": 2460 }, { "epoch": 0.8205470313542361, "loss": 0.5091679692268372, "step": 2460 }, { "ce_loss": 0.21453019976615906, "epoch": 0.8205470313542361, "step": 2460 }, { "distill_loss": 0.13742676377296448, "epoch": 0.8205470313542361, "step": 2460 }, { "epoch": 0.8205470313542361, "ref_ce_loss": 0.1569163054227829, "step": 2460 }, { "epoch": 0.8205470313542361, "loss": 0.7609373331069946, "step": 2460 }, { "ce_loss": 0.26815611124038696, "epoch": 0.8205470313542361, "step": 2460 }, { "distill_loss": 0.14041054248809814, "epoch": 0.8205470313542361, "step": 2460 }, { "epoch": 0.8205470313542361, "ref_ce_loss": 0.14617282152175903, "step": 2460 }, { "epoch": 0.8205470313542361, "loss": 0.5668540000915527, "step": 2460 }, { "ce_loss": 0.24641276895999908, "epoch": 0.8205470313542361, "step": 2460 }, { "distill_loss": 0.14012645184993744, "epoch": 0.8205470313542361, "step": 2460 }, { "epoch": 0.8205470313542361, "ref_ce_loss": 0.18027812242507935, "step": 2460 }, { "epoch": 0.8238825883922615, "loss": 0.7348, "step": 2470 }, { "epoch": 0.8238825883922615, "grad_norm": 2.643747568130493, "step": 2470 }, { "epoch": 0.8238825883922615, "learning_rate": 0.00029583086284754766, "step": 2470 }, { "epoch": 0.8238825883922615, "loss": 0.6161603927612305, "step": 2470 }, { "ce_loss": 0.25048938393592834, "epoch": 0.8238825883922615, "step": 2470 }, { "distill_loss": 0.12217261642217636, "epoch": 0.8238825883922615, "step": 2470 }, { "epoch": 0.8238825883922615, "ref_ce_loss": 0.17147208750247955, "step": 2470 }, { "epoch": 0.8238825883922615, "loss": 0.5763737559318542, "step": 2470 }, { "ce_loss": 0.21852903068065643, "epoch": 0.8238825883922615, "step": 2470 }, { "distill_loss": 0.13863025605678558, "epoch": 0.8238825883922615, "step": 2470 }, { "epoch": 0.8238825883922615, "ref_ce_loss": 0.12992407381534576, "step": 2470 }, { "epoch": 0.8238825883922615, "loss": 1.011842131614685, "step": 2470 }, { "ce_loss": 0.22845333814620972, "epoch": 0.8238825883922615, "step": 2470 }, { "distill_loss": 0.10414294898509979, "epoch": 0.8238825883922615, "step": 2470 }, { "epoch": 0.8238825883922615, "ref_ce_loss": 0.11669415980577469, "step": 2470 }, { "epoch": 0.8238825883922615, "loss": 0.7752334475517273, "step": 2470 }, { "ce_loss": 0.2148449867963791, "epoch": 0.8238825883922615, "step": 2470 }, { "distill_loss": 0.1058155745267868, "epoch": 0.8238825883922615, "step": 2470 }, { "epoch": 0.8238825883922615, "ref_ce_loss": 0.16419118642807007, "step": 2470 }, { "epoch": 0.8272181454302868, "loss": 0.7157, "step": 2480 }, { "epoch": 0.8272181454302868, "grad_norm": 2.1814582347869873, "step": 2480 }, { "epoch": 0.8272181454302868, "learning_rate": 0.00029578330456807804, "step": 2480 }, { "epoch": 0.8272181454302868, "loss": 1.1222140789031982, "step": 2480 }, { "ce_loss": 0.21106146275997162, "epoch": 0.8272181454302868, "step": 2480 }, { "distill_loss": 0.15598347783088684, "epoch": 0.8272181454302868, "step": 2480 }, { "epoch": 0.8272181454302868, "ref_ce_loss": 0.1223301962018013, "step": 2480 }, { "epoch": 0.8272181454302868, "loss": 1.7699038982391357, "step": 2480 }, { "ce_loss": 0.2586840093135834, "epoch": 0.8272181454302868, "step": 2480 }, { "distill_loss": 0.16339309513568878, "epoch": 0.8272181454302868, "step": 2480 }, { "epoch": 0.8272181454302868, "ref_ce_loss": 0.18896687030792236, "step": 2480 }, { "epoch": 0.8272181454302868, "loss": 0.6121606826782227, "step": 2480 }, { "ce_loss": 0.2650245726108551, "epoch": 0.8272181454302868, "step": 2480 }, { "distill_loss": 0.1810838282108307, "epoch": 0.8272181454302868, "step": 2480 }, { "epoch": 0.8272181454302868, "ref_ce_loss": 0.11699043959379196, "step": 2480 }, { "epoch": 0.8272181454302868, "loss": 0.751914381980896, "step": 2480 }, { "ce_loss": 0.3102262616157532, "epoch": 0.8272181454302868, "step": 2480 }, { "distill_loss": 0.15393638610839844, "epoch": 0.8272181454302868, "step": 2480 }, { "epoch": 0.8272181454302868, "ref_ce_loss": 0.16068010032176971, "step": 2480 }, { "epoch": 0.8305537024683122, "loss": 0.7792, "step": 2490 }, { "epoch": 0.8305537024683122, "grad_norm": 1.7032862901687622, "step": 2490 }, { "epoch": 0.8305537024683122, "learning_rate": 0.0002957354804373677, "step": 2490 }, { "epoch": 0.8305537024683122, "loss": 0.46619075536727905, "step": 2490 }, { "ce_loss": 0.19643811881542206, "epoch": 0.8305537024683122, "step": 2490 }, { "distill_loss": 0.1185787171125412, "epoch": 0.8305537024683122, "step": 2490 }, { "epoch": 0.8305537024683122, "ref_ce_loss": 0.150973379611969, "step": 2490 }, { "epoch": 0.8305537024683122, "loss": 0.5043961405754089, "step": 2490 }, { "ce_loss": 0.13823407888412476, "epoch": 0.8305537024683122, "step": 2490 }, { "distill_loss": 0.0923272967338562, "epoch": 0.8305537024683122, "step": 2490 }, { "epoch": 0.8305537024683122, "ref_ce_loss": 0.09950131922960281, "step": 2490 }, { "epoch": 0.8305537024683122, "loss": 0.5879054069519043, "step": 2490 }, { "ce_loss": 0.23579871654510498, "epoch": 0.8305537024683122, "step": 2490 }, { "distill_loss": 0.14946657419204712, "epoch": 0.8305537024683122, "step": 2490 }, { "epoch": 0.8305537024683122, "ref_ce_loss": 0.10562227666378021, "step": 2490 }, { "epoch": 0.8305537024683122, "loss": 0.7153053879737854, "step": 2490 }, { "ce_loss": 0.3567087650299072, "epoch": 0.8305537024683122, "step": 2490 }, { "distill_loss": 0.13327312469482422, "epoch": 0.8305537024683122, "step": 2490 }, { "epoch": 0.8305537024683122, "ref_ce_loss": 0.15352420508861542, "step": 2490 }, { "epoch": 0.8338892595063375, "loss": 0.7088, "step": 2500 }, { "epoch": 0.8338892595063375, "grad_norm": 3.5686287879943848, "step": 2500 }, { "epoch": 0.8338892595063375, "learning_rate": 0.000295687390542629, "step": 2500 }, { "epoch": 0.8338892595063375, "loss": 0.6453408598899841, "step": 2500 }, { "ce_loss": 0.2111901193857193, "epoch": 0.8338892595063375, "step": 2500 }, { "distill_loss": 0.16144970059394836, "epoch": 0.8338892595063375, "step": 2500 }, { "epoch": 0.8338892595063375, "ref_ce_loss": 0.16724175214767456, "step": 2500 }, { "epoch": 0.8338892595063375, "loss": 0.7878329753875732, "step": 2500 }, { "ce_loss": 0.19704513251781464, "epoch": 0.8338892595063375, "step": 2500 }, { "distill_loss": 0.24139323830604553, "epoch": 0.8338892595063375, "step": 2500 }, { "epoch": 0.8338892595063375, "ref_ce_loss": 0.12408225983381271, "step": 2500 }, { "epoch": 0.8338892595063375, "loss": 0.9499678015708923, "step": 2500 }, { "ce_loss": 0.4058931767940521, "epoch": 0.8338892595063375, "step": 2500 }, { "distill_loss": 0.267963171005249, "epoch": 0.8338892595063375, "step": 2500 }, { "epoch": 0.8338892595063375, "ref_ce_loss": 0.2148558646440506, "step": 2500 }, { "epoch": 0.8338892595063375, "loss": 0.8748348951339722, "step": 2500 }, { "ce_loss": 0.30255037546157837, "epoch": 0.8338892595063375, "step": 2500 }, { "distill_loss": 0.25724613666534424, "epoch": 0.8338892595063375, "step": 2500 }, { "epoch": 0.8338892595063375, "ref_ce_loss": 0.15020081400871277, "step": 2500 }, { "epoch": 0.8372248165443629, "loss": 0.8287, "step": 2510 }, { "epoch": 0.8372248165443629, "grad_norm": 2.9365673065185547, "step": 2510 }, { "epoch": 0.8372248165443629, "learning_rate": 0.0002956390349715589, "step": 2510 }, { "epoch": 0.8372248165443629, "loss": 0.7504816651344299, "step": 2510 }, { "ce_loss": 0.32103756070137024, "epoch": 0.8372248165443629, "step": 2510 }, { "distill_loss": 0.15747405588626862, "epoch": 0.8372248165443629, "step": 2510 }, { "epoch": 0.8372248165443629, "ref_ce_loss": 0.19202551245689392, "step": 2510 }, { "epoch": 0.8372248165443629, "loss": 0.6597751379013062, "step": 2510 }, { "ce_loss": 0.20975995063781738, "epoch": 0.8372248165443629, "step": 2510 }, { "distill_loss": 0.19682608544826508, "epoch": 0.8372248165443629, "step": 2510 }, { "epoch": 0.8372248165443629, "ref_ce_loss": 0.13535785675048828, "step": 2510 }, { "epoch": 0.8372248165443629, "loss": 0.6817447543144226, "step": 2510 }, { "ce_loss": 0.238074392080307, "epoch": 0.8372248165443629, "step": 2510 }, { "distill_loss": 0.15745586156845093, "epoch": 0.8372248165443629, "step": 2510 }, { "epoch": 0.8372248165443629, "ref_ce_loss": 0.16773714125156403, "step": 2510 }, { "epoch": 0.8372248165443629, "loss": 0.4860121011734009, "step": 2510 }, { "ce_loss": 0.17487522959709167, "epoch": 0.8372248165443629, "step": 2510 }, { "distill_loss": 0.17117461562156677, "epoch": 0.8372248165443629, "step": 2510 }, { "epoch": 0.8372248165443629, "ref_ce_loss": 0.09342694282531738, "step": 2510 }, { "epoch": 0.8405603735823882, "loss": 0.7391, "step": 2520 }, { "epoch": 0.8405603735823882, "grad_norm": 2.870335578918457, "step": 2520 }, { "epoch": 0.8405603735823882, "learning_rate": 0.0002955904138123389, "step": 2520 }, { "epoch": 0.8405603735823882, "loss": 0.6082489490509033, "step": 2520 }, { "ce_loss": 0.2526482045650482, "epoch": 0.8405603735823882, "step": 2520 }, { "distill_loss": 0.09827655553817749, "epoch": 0.8405603735823882, "step": 2520 }, { "epoch": 0.8405603735823882, "ref_ce_loss": 0.1371910274028778, "step": 2520 }, { "epoch": 0.8405603735823882, "loss": 0.8684769868850708, "step": 2520 }, { "ce_loss": 0.2551835775375366, "epoch": 0.8405603735823882, "step": 2520 }, { "distill_loss": 0.15360848605632782, "epoch": 0.8405603735823882, "step": 2520 }, { "epoch": 0.8405603735823882, "ref_ce_loss": 0.19132332503795624, "step": 2520 }, { "epoch": 0.8405603735823882, "loss": 0.6976226568222046, "step": 2520 }, { "ce_loss": 0.31281253695487976, "epoch": 0.8405603735823882, "step": 2520 }, { "distill_loss": 0.15646786987781525, "epoch": 0.8405603735823882, "step": 2520 }, { "epoch": 0.8405603735823882, "ref_ce_loss": 0.16439111530780792, "step": 2520 }, { "epoch": 0.8405603735823882, "loss": 0.5785008668899536, "step": 2520 }, { "ce_loss": 0.27898046374320984, "epoch": 0.8405603735823882, "step": 2520 }, { "distill_loss": 0.12810131907463074, "epoch": 0.8405603735823882, "step": 2520 }, { "epoch": 0.8405603735823882, "ref_ce_loss": 0.11890950053930283, "step": 2520 }, { "epoch": 0.8438959306204136, "loss": 0.8542, "step": 2530 }, { "epoch": 0.8438959306204136, "grad_norm": 2.7163124084472656, "step": 2530 }, { "epoch": 0.8438959306204136, "learning_rate": 0.0002955415271536349, "step": 2530 }, { "epoch": 0.8438959306204136, "loss": 0.9421725869178772, "step": 2530 }, { "ce_loss": 0.3706272840499878, "epoch": 0.8438959306204136, "step": 2530 }, { "distill_loss": 0.23287363350391388, "epoch": 0.8438959306204136, "step": 2530 }, { "epoch": 0.8438959306204136, "ref_ce_loss": 0.20354139804840088, "step": 2530 }, { "epoch": 0.8438959306204136, "loss": 0.524922251701355, "step": 2530 }, { "ce_loss": 0.15988919138908386, "epoch": 0.8438959306204136, "step": 2530 }, { "distill_loss": 0.1579529196023941, "epoch": 0.8438959306204136, "step": 2530 }, { "epoch": 0.8438959306204136, "ref_ce_loss": 0.09271088242530823, "step": 2530 }, { "epoch": 0.8438959306204136, "loss": 0.6980333924293518, "step": 2530 }, { "ce_loss": 0.266355961561203, "epoch": 0.8438959306204136, "step": 2530 }, { "distill_loss": 0.1916055679321289, "epoch": 0.8438959306204136, "step": 2530 }, { "epoch": 0.8438959306204136, "ref_ce_loss": 0.18757179379463196, "step": 2530 }, { "epoch": 0.8438959306204136, "loss": 0.8313827514648438, "step": 2530 }, { "ce_loss": 0.213656947016716, "epoch": 0.8438959306204136, "step": 2530 }, { "distill_loss": 0.2044852375984192, "epoch": 0.8438959306204136, "step": 2530 }, { "epoch": 0.8438959306204136, "ref_ce_loss": 0.21953712403774261, "step": 2530 }, { "epoch": 0.8472314876584389, "loss": 0.739, "step": 2540 }, { "epoch": 0.8472314876584389, "grad_norm": 1.7960280179977417, "step": 2540 }, { "epoch": 0.8472314876584389, "learning_rate": 0.0002954923750845968, "step": 2540 }, { "epoch": 0.8472314876584389, "loss": 0.6077286601066589, "step": 2540 }, { "ce_loss": 0.2709120810031891, "epoch": 0.8472314876584389, "step": 2540 }, { "distill_loss": 0.11971865594387054, "epoch": 0.8472314876584389, "step": 2540 }, { "epoch": 0.8472314876584389, "ref_ce_loss": 0.1525728404521942, "step": 2540 }, { "epoch": 0.8472314876584389, "loss": 0.5749415755271912, "step": 2540 }, { "ce_loss": 0.23235177993774414, "epoch": 0.8472314876584389, "step": 2540 }, { "distill_loss": 0.11225643754005432, "epoch": 0.8472314876584389, "step": 2540 }, { "epoch": 0.8472314876584389, "ref_ce_loss": 0.10792374610900879, "step": 2540 }, { "epoch": 0.8472314876584389, "loss": 0.762617826461792, "step": 2540 }, { "ce_loss": 0.3228139877319336, "epoch": 0.8472314876584389, "step": 2540 }, { "distill_loss": 0.13279499113559723, "epoch": 0.8472314876584389, "step": 2540 }, { "epoch": 0.8472314876584389, "ref_ce_loss": 0.14764104783535004, "step": 2540 }, { "epoch": 0.8472314876584389, "loss": 0.9446433782577515, "step": 2540 }, { "ce_loss": 0.4045880436897278, "epoch": 0.8472314876584389, "step": 2540 }, { "distill_loss": 0.1612992286682129, "epoch": 0.8472314876584389, "step": 2540 }, { "epoch": 0.8472314876584389, "ref_ce_loss": 0.20145398378372192, "step": 2540 }, { "epoch": 0.8505670446964643, "loss": 0.7286, "step": 2550 }, { "epoch": 0.8505670446964643, "grad_norm": 2.043759346008301, "step": 2550 }, { "epoch": 0.8505670446964643, "learning_rate": 0.0002954429576948586, "step": 2550 }, { "epoch": 0.8505670446964643, "loss": 0.6958951354026794, "step": 2550 }, { "ce_loss": 0.2730863094329834, "epoch": 0.8505670446964643, "step": 2550 }, { "distill_loss": 0.16268062591552734, "epoch": 0.8505670446964643, "step": 2550 }, { "epoch": 0.8505670446964643, "ref_ce_loss": 0.16490232944488525, "step": 2550 }, { "epoch": 0.8505670446964643, "loss": 0.48699092864990234, "step": 2550 }, { "ce_loss": 0.19955317676067352, "epoch": 0.8505670446964643, "step": 2550 }, { "distill_loss": 0.1272369772195816, "epoch": 0.8505670446964643, "step": 2550 }, { "epoch": 0.8505670446964643, "ref_ce_loss": 0.12268470227718353, "step": 2550 }, { "epoch": 0.8505670446964643, "loss": 0.6086199283599854, "step": 2550 }, { "ce_loss": 0.19976550340652466, "epoch": 0.8505670446964643, "step": 2550 }, { "distill_loss": 0.14242056012153625, "epoch": 0.8505670446964643, "step": 2550 }, { "epoch": 0.8505670446964643, "ref_ce_loss": 0.1820511668920517, "step": 2550 }, { "epoch": 0.8505670446964643, "loss": 1.2203011512756348, "step": 2550 }, { "ce_loss": 0.29445260763168335, "epoch": 0.8505670446964643, "step": 2550 }, { "distill_loss": 0.14953580498695374, "epoch": 0.8505670446964643, "step": 2550 }, { "epoch": 0.8505670446964643, "ref_ce_loss": 0.17585910856723785, "step": 2550 }, { "epoch": 0.8539026017344896, "loss": 0.7847, "step": 2560 }, { "epoch": 0.8539026017344896, "grad_norm": 4.350864887237549, "step": 2560 }, { "epoch": 0.8539026017344896, "learning_rate": 0.0002953932750745382, "step": 2560 }, { "epoch": 0.8539026017344896, "loss": 1.0202877521514893, "step": 2560 }, { "ce_loss": 0.3814989924430847, "epoch": 0.8539026017344896, "step": 2560 }, { "distill_loss": 0.15244382619857788, "epoch": 0.8539026017344896, "step": 2560 }, { "epoch": 0.8539026017344896, "ref_ce_loss": 0.2545083463191986, "step": 2560 }, { "epoch": 0.8539026017344896, "loss": 0.6912614107131958, "step": 2560 }, { "ce_loss": 0.17741495370864868, "epoch": 0.8539026017344896, "step": 2560 }, { "distill_loss": 0.12845268845558167, "epoch": 0.8539026017344896, "step": 2560 }, { "epoch": 0.8539026017344896, "ref_ce_loss": 0.10197576880455017, "step": 2560 }, { "epoch": 0.8539026017344896, "loss": 0.783969521522522, "step": 2560 }, { "ce_loss": 0.2941688597202301, "epoch": 0.8539026017344896, "step": 2560 }, { "distill_loss": 0.14433158934116364, "epoch": 0.8539026017344896, "step": 2560 }, { "epoch": 0.8539026017344896, "ref_ce_loss": 0.12809965014457703, "step": 2560 }, { "epoch": 0.8539026017344896, "loss": 0.8776670694351196, "step": 2560 }, { "ce_loss": 0.3103247582912445, "epoch": 0.8539026017344896, "step": 2560 }, { "distill_loss": 0.1200985312461853, "epoch": 0.8539026017344896, "step": 2560 }, { "epoch": 0.8539026017344896, "ref_ce_loss": 0.16940616071224213, "step": 2560 }, { "epoch": 0.857238158772515, "loss": 0.7478, "step": 2570 }, { "epoch": 0.857238158772515, "grad_norm": 2.9963643550872803, "step": 2570 }, { "epoch": 0.857238158772515, "learning_rate": 0.0002953433273142369, "step": 2570 }, { "epoch": 0.857238158772515, "loss": 0.6163371801376343, "step": 2570 }, { "ce_loss": 0.18766649067401886, "epoch": 0.857238158772515, "step": 2570 }, { "distill_loss": 0.13034361600875854, "epoch": 0.857238158772515, "step": 2570 }, { "epoch": 0.857238158772515, "ref_ce_loss": 0.15359312295913696, "step": 2570 }, { "epoch": 0.857238158772515, "loss": 0.6791805624961853, "step": 2570 }, { "ce_loss": 0.30410608649253845, "epoch": 0.857238158772515, "step": 2570 }, { "distill_loss": 0.11348666250705719, "epoch": 0.857238158772515, "step": 2570 }, { "epoch": 0.857238158772515, "ref_ce_loss": 0.16625384986400604, "step": 2570 }, { "epoch": 0.857238158772515, "loss": 0.94990074634552, "step": 2570 }, { "ce_loss": 0.40707695484161377, "epoch": 0.857238158772515, "step": 2570 }, { "distill_loss": 0.17179466784000397, "epoch": 0.857238158772515, "step": 2570 }, { "epoch": 0.857238158772515, "ref_ce_loss": 0.18617656826972961, "step": 2570 }, { "epoch": 0.857238158772515, "loss": 0.56731116771698, "step": 2570 }, { "ce_loss": 0.25088340044021606, "epoch": 0.857238158772515, "step": 2570 }, { "distill_loss": 0.10557133704423904, "epoch": 0.857238158772515, "step": 2570 }, { "epoch": 0.857238158772515, "ref_ce_loss": 0.15625420212745667, "step": 2570 }, { "epoch": 0.8605737158105403, "loss": 0.8025, "step": 2580 }, { "epoch": 0.8605737158105403, "grad_norm": 2.4704360961914062, "step": 2580 }, { "epoch": 0.8605737158105403, "learning_rate": 0.0002952931145050399, "step": 2580 }, { "epoch": 0.8605737158105403, "loss": 0.7413711547851562, "step": 2580 }, { "ce_loss": 0.21104562282562256, "epoch": 0.8605737158105403, "step": 2580 }, { "distill_loss": 0.19385764002799988, "epoch": 0.8605737158105403, "step": 2580 }, { "epoch": 0.8605737158105403, "ref_ce_loss": 0.15575964748859406, "step": 2580 }, { "epoch": 0.8605737158105403, "loss": 0.779699981212616, "step": 2580 }, { "ce_loss": 0.26666852831840515, "epoch": 0.8605737158105403, "step": 2580 }, { "distill_loss": 0.1735392063856125, "epoch": 0.8605737158105403, "step": 2580 }, { "epoch": 0.8605737158105403, "ref_ce_loss": 0.26088887453079224, "step": 2580 }, { "epoch": 0.8605737158105403, "loss": 0.9133224487304688, "step": 2580 }, { "ce_loss": 0.2493307888507843, "epoch": 0.8605737158105403, "step": 2580 }, { "distill_loss": 0.16577640175819397, "epoch": 0.8605737158105403, "step": 2580 }, { "epoch": 0.8605737158105403, "ref_ce_loss": 0.1426527500152588, "step": 2580 }, { "epoch": 0.8605737158105403, "loss": 1.1199960708618164, "step": 2580 }, { "ce_loss": 0.2786944508552551, "epoch": 0.8605737158105403, "step": 2580 }, { "distill_loss": 0.191022589802742, "epoch": 0.8605737158105403, "step": 2580 }, { "epoch": 0.8605737158105403, "ref_ce_loss": 0.11835911870002747, "step": 2580 }, { "epoch": 0.8639092728485657, "loss": 0.7495, "step": 2590 }, { "epoch": 0.8639092728485657, "grad_norm": 1.9572570323944092, "step": 2590 }, { "epoch": 0.8639092728485657, "learning_rate": 0.00029524263673851557, "step": 2590 }, { "epoch": 0.8639092728485657, "loss": 0.6552140712738037, "step": 2590 }, { "ce_loss": 0.23570433259010315, "epoch": 0.8639092728485657, "step": 2590 }, { "distill_loss": 0.13375253975391388, "epoch": 0.8639092728485657, "step": 2590 }, { "epoch": 0.8639092728485657, "ref_ce_loss": 0.18127413094043732, "step": 2590 }, { "epoch": 0.8639092728485657, "loss": 0.8274029493331909, "step": 2590 }, { "ce_loss": 0.3903733193874359, "epoch": 0.8639092728485657, "step": 2590 }, { "distill_loss": 0.15356363356113434, "epoch": 0.8639092728485657, "step": 2590 }, { "epoch": 0.8639092728485657, "ref_ce_loss": 0.2296900451183319, "step": 2590 }, { "epoch": 0.8639092728485657, "loss": 0.6463877558708191, "step": 2590 }, { "ce_loss": 0.2714865803718567, "epoch": 0.8639092728485657, "step": 2590 }, { "distill_loss": 0.1532272845506668, "epoch": 0.8639092728485657, "step": 2590 }, { "epoch": 0.8639092728485657, "ref_ce_loss": 0.17067858576774597, "step": 2590 }, { "epoch": 0.8639092728485657, "loss": 0.6551008224487305, "step": 2590 }, { "ce_loss": 0.24287007749080658, "epoch": 0.8639092728485657, "step": 2590 }, { "distill_loss": 0.12412548065185547, "epoch": 0.8639092728485657, "step": 2590 }, { "epoch": 0.8639092728485657, "ref_ce_loss": 0.15614332258701324, "step": 2590 }, { "epoch": 0.867244829886591, "loss": 0.7183, "step": 2600 }, { "epoch": 0.867244829886591, "grad_norm": 1.8994405269622803, "step": 2600 }, { "epoch": 0.867244829886591, "learning_rate": 0.0002951918941067153, "step": 2600 }, { "epoch": 0.867244829886591, "loss": 0.7773920297622681, "step": 2600 }, { "ce_loss": 0.3620452880859375, "epoch": 0.867244829886591, "step": 2600 }, { "distill_loss": 0.1090983971953392, "epoch": 0.867244829886591, "step": 2600 }, { "epoch": 0.867244829886591, "ref_ce_loss": 0.2240307331085205, "step": 2600 }, { "epoch": 0.867244829886591, "loss": 0.7050237059593201, "step": 2600 }, { "ce_loss": 0.2841552197933197, "epoch": 0.867244829886591, "step": 2600 }, { "distill_loss": 0.10316047817468643, "epoch": 0.867244829886591, "step": 2600 }, { "epoch": 0.867244829886591, "ref_ce_loss": 0.1758444905281067, "step": 2600 }, { "epoch": 0.867244829886591, "loss": 1.683584451675415, "step": 2600 }, { "ce_loss": 0.2757621109485626, "epoch": 0.867244829886591, "step": 2600 }, { "distill_loss": 0.10363587737083435, "epoch": 0.867244829886591, "step": 2600 }, { "epoch": 0.867244829886591, "ref_ce_loss": 0.1815105527639389, "step": 2600 }, { "epoch": 0.867244829886591, "loss": 0.6704810857772827, "step": 2600 }, { "ce_loss": 0.190278559923172, "epoch": 0.867244829886591, "step": 2600 }, { "distill_loss": 0.1048944815993309, "epoch": 0.867244829886591, "step": 2600 }, { "epoch": 0.867244829886591, "ref_ce_loss": 0.1369696855545044, "step": 2600 }, { "epoch": 0.8705803869246164, "loss": 0.7809, "step": 2610 }, { "epoch": 0.8705803869246164, "grad_norm": 4.59869909286499, "step": 2610 }, { "epoch": 0.8705803869246164, "learning_rate": 0.0002951408867021737, "step": 2610 }, { "epoch": 0.8705803869246164, "loss": 0.6961145401000977, "step": 2610 }, { "ce_loss": 0.20867450535297394, "epoch": 0.8705803869246164, "step": 2610 }, { "distill_loss": 0.222753643989563, "epoch": 0.8705803869246164, "step": 2610 }, { "epoch": 0.8705803869246164, "ref_ce_loss": 0.16406960785388947, "step": 2610 }, { "epoch": 0.8705803869246164, "loss": 0.7274801731109619, "step": 2610 }, { "ce_loss": 0.2560276389122009, "epoch": 0.8705803869246164, "step": 2610 }, { "distill_loss": 0.27618464827537537, "epoch": 0.8705803869246164, "step": 2610 }, { "epoch": 0.8705803869246164, "ref_ce_loss": 0.1208532452583313, "step": 2610 }, { "epoch": 0.8705803869246164, "loss": 0.530335009098053, "step": 2610 }, { "ce_loss": 0.18852654099464417, "epoch": 0.8705803869246164, "step": 2610 }, { "distill_loss": 0.20412476360797882, "epoch": 0.8705803869246164, "step": 2610 }, { "epoch": 0.8705803869246164, "ref_ce_loss": 0.13730789721012115, "step": 2610 }, { "epoch": 0.8705803869246164, "loss": 0.5325140953063965, "step": 2610 }, { "ce_loss": 0.15463118255138397, "epoch": 0.8705803869246164, "step": 2610 }, { "distill_loss": 0.20978079736232758, "epoch": 0.8705803869246164, "step": 2610 }, { "epoch": 0.8705803869246164, "ref_ce_loss": 0.11247812956571579, "step": 2610 }, { "epoch": 0.8739159439626417, "loss": 0.7297, "step": 2620 }, { "epoch": 0.8739159439626417, "grad_norm": 2.0906076431274414, "step": 2620 }, { "epoch": 0.8739159439626417, "learning_rate": 0.0002950896146179082, "step": 2620 }, { "epoch": 0.8739159439626417, "loss": 0.6182748079299927, "step": 2620 }, { "ce_loss": 0.30032458901405334, "epoch": 0.8739159439626417, "step": 2620 }, { "distill_loss": 0.17088757455348969, "epoch": 0.8739159439626417, "step": 2620 }, { "epoch": 0.8739159439626417, "ref_ce_loss": 0.14578019082546234, "step": 2620 }, { "epoch": 0.8739159439626417, "loss": 0.6845272183418274, "step": 2620 }, { "ce_loss": 0.2438613623380661, "epoch": 0.8739159439626417, "step": 2620 }, { "distill_loss": 0.1726510226726532, "epoch": 0.8739159439626417, "step": 2620 }, { "epoch": 0.8739159439626417, "ref_ce_loss": 0.16975204646587372, "step": 2620 }, { "epoch": 0.8739159439626417, "loss": 0.9628136157989502, "step": 2620 }, { "ce_loss": 0.22766920924186707, "epoch": 0.8739159439626417, "step": 2620 }, { "distill_loss": 0.16870301961898804, "epoch": 0.8739159439626417, "step": 2620 }, { "epoch": 0.8739159439626417, "ref_ce_loss": 0.14269216358661652, "step": 2620 }, { "epoch": 0.8739159439626417, "loss": 0.4854929745197296, "step": 2620 }, { "ce_loss": 0.16093426942825317, "epoch": 0.8739159439626417, "step": 2620 }, { "distill_loss": 0.11879430711269379, "epoch": 0.8739159439626417, "step": 2620 }, { "epoch": 0.8739159439626417, "ref_ce_loss": 0.14466573297977448, "step": 2620 }, { "epoch": 0.8772515010006671, "loss": 0.7219, "step": 2630 }, { "epoch": 0.8772515010006671, "grad_norm": 2.8054723739624023, "step": 2630 }, { "epoch": 0.8772515010006671, "learning_rate": 0.0002950380779474188, "step": 2630 }, { "epoch": 0.8772515010006671, "loss": 1.1032121181488037, "step": 2630 }, { "ce_loss": 0.27350157499313354, "epoch": 0.8772515010006671, "step": 2630 }, { "distill_loss": 0.14578324556350708, "epoch": 0.8772515010006671, "step": 2630 }, { "epoch": 0.8772515010006671, "ref_ce_loss": 0.16324932873249054, "step": 2630 }, { "epoch": 0.8772515010006671, "loss": 0.5892089605331421, "step": 2630 }, { "ce_loss": 0.23559221625328064, "epoch": 0.8772515010006671, "step": 2630 }, { "distill_loss": 0.15483921766281128, "epoch": 0.8772515010006671, "step": 2630 }, { "epoch": 0.8772515010006671, "ref_ce_loss": 0.11848069727420807, "step": 2630 }, { "epoch": 0.8772515010006671, "loss": 0.6435602307319641, "step": 2630 }, { "ce_loss": 0.2746572196483612, "epoch": 0.8772515010006671, "step": 2630 }, { "distill_loss": 0.13225823640823364, "epoch": 0.8772515010006671, "step": 2630 }, { "epoch": 0.8772515010006671, "ref_ce_loss": 0.13194727897644043, "step": 2630 }, { "epoch": 0.8772515010006671, "loss": 0.7193164229393005, "step": 2630 }, { "ce_loss": 0.20938901603221893, "epoch": 0.8772515010006671, "step": 2630 }, { "distill_loss": 0.15860141813755035, "epoch": 0.8772515010006671, "step": 2630 }, { "epoch": 0.8772515010006671, "ref_ce_loss": 0.17175227403640747, "step": 2630 }, { "epoch": 0.8805870580386924, "loss": 0.762, "step": 2640 }, { "epoch": 0.8805870580386924, "grad_norm": 2.040764570236206, "step": 2640 }, { "epoch": 0.8805870580386924, "learning_rate": 0.00029498627678468806, "step": 2640 }, { "epoch": 0.8805870580386924, "loss": 1.0706512928009033, "step": 2640 }, { "ce_loss": 0.2734338045120239, "epoch": 0.8805870580386924, "step": 2640 }, { "distill_loss": 0.17937436699867249, "epoch": 0.8805870580386924, "step": 2640 }, { "epoch": 0.8805870580386924, "ref_ce_loss": 0.2127029448747635, "step": 2640 }, { "epoch": 0.8805870580386924, "loss": 0.7411646842956543, "step": 2640 }, { "ce_loss": 0.1989816427230835, "epoch": 0.8805870580386924, "step": 2640 }, { "distill_loss": 0.18253065645694733, "epoch": 0.8805870580386924, "step": 2640 }, { "epoch": 0.8805870580386924, "ref_ce_loss": 0.1566845029592514, "step": 2640 }, { "epoch": 0.8805870580386924, "loss": 0.7127244472503662, "step": 2640 }, { "ce_loss": 0.18785162270069122, "epoch": 0.8805870580386924, "step": 2640 }, { "distill_loss": 0.13022294640541077, "epoch": 0.8805870580386924, "step": 2640 }, { "epoch": 0.8805870580386924, "ref_ce_loss": 0.14390313625335693, "step": 2640 }, { "epoch": 0.8805870580386924, "loss": 0.3556101322174072, "step": 2640 }, { "ce_loss": 0.15927663445472717, "epoch": 0.8805870580386924, "step": 2640 }, { "distill_loss": 0.12913666665554047, "epoch": 0.8805870580386924, "step": 2640 }, { "epoch": 0.8805870580386924, "ref_ce_loss": 0.06653880327939987, "step": 2640 }, { "epoch": 0.8839226150767178, "loss": 0.7489, "step": 2650 }, { "epoch": 0.8839226150767178, "grad_norm": 2.5249640941619873, "step": 2650 }, { "epoch": 0.8839226150767178, "learning_rate": 0.0002949342112241809, "step": 2650 }, { "epoch": 0.8839226150767178, "loss": 0.5982322692871094, "step": 2650 }, { "ce_loss": 0.20801950991153717, "epoch": 0.8839226150767178, "step": 2650 }, { "distill_loss": 0.09348731487989426, "epoch": 0.8839226150767178, "step": 2650 }, { "epoch": 0.8839226150767178, "ref_ce_loss": 0.13889434933662415, "step": 2650 }, { "epoch": 0.8839226150767178, "loss": 0.539240837097168, "step": 2650 }, { "ce_loss": 0.2223464399576187, "epoch": 0.8839226150767178, "step": 2650 }, { "distill_loss": 0.12154768407344818, "epoch": 0.8839226150767178, "step": 2650 }, { "epoch": 0.8839226150767178, "ref_ce_loss": 0.19085489213466644, "step": 2650 }, { "epoch": 0.8839226150767178, "loss": 0.524050772190094, "step": 2650 }, { "ce_loss": 0.24115309119224548, "epoch": 0.8839226150767178, "step": 2650 }, { "distill_loss": 0.10297761112451553, "epoch": 0.8839226150767178, "step": 2650 }, { "epoch": 0.8839226150767178, "ref_ce_loss": 0.13751719892024994, "step": 2650 }, { "epoch": 0.8839226150767178, "loss": 0.3736794590950012, "step": 2650 }, { "ce_loss": 0.10971083492040634, "epoch": 0.8839226150767178, "step": 2650 }, { "distill_loss": 0.10210859775543213, "epoch": 0.8839226150767178, "step": 2650 }, { "epoch": 0.8839226150767178, "ref_ce_loss": 0.09514066576957703, "step": 2650 }, { "epoch": 0.8872581721147431, "loss": 0.6912, "step": 2660 }, { "epoch": 0.8872581721147431, "grad_norm": 2.7994887828826904, "step": 2660 }, { "epoch": 0.8872581721147431, "learning_rate": 0.00029488188136084437, "step": 2660 }, { "epoch": 0.8872581721147431, "loss": 1.0558778047561646, "step": 2660 }, { "ce_loss": 0.32495981454849243, "epoch": 0.8872581721147431, "step": 2660 }, { "distill_loss": 0.14164505898952484, "epoch": 0.8872581721147431, "step": 2660 }, { "epoch": 0.8872581721147431, "ref_ce_loss": 0.22562389075756073, "step": 2660 }, { "epoch": 0.8872581721147431, "loss": 0.602891206741333, "step": 2660 }, { "ce_loss": 0.21373873949050903, "epoch": 0.8872581721147431, "step": 2660 }, { "distill_loss": 0.12601274251937866, "epoch": 0.8872581721147431, "step": 2660 }, { "epoch": 0.8872581721147431, "ref_ce_loss": 0.19673238694667816, "step": 2660 }, { "epoch": 0.8872581721147431, "loss": 0.9606189727783203, "step": 2660 }, { "ce_loss": 0.35660284757614136, "epoch": 0.8872581721147431, "step": 2660 }, { "distill_loss": 0.1498052179813385, "epoch": 0.8872581721147431, "step": 2660 }, { "epoch": 0.8872581721147431, "ref_ce_loss": 0.19679509103298187, "step": 2660 }, { "epoch": 0.8872581721147431, "loss": 0.6719042658805847, "step": 2660 }, { "ce_loss": 0.24385878443717957, "epoch": 0.8872581721147431, "step": 2660 }, { "distill_loss": 0.1391836255788803, "epoch": 0.8872581721147431, "step": 2660 }, { "epoch": 0.8872581721147431, "ref_ce_loss": 0.1761963814496994, "step": 2660 }, { "epoch": 0.8905937291527685, "loss": 0.7776, "step": 2670 }, { "epoch": 0.8905937291527685, "grad_norm": 4.569013595581055, "step": 2670 }, { "epoch": 0.8905937291527685, "learning_rate": 0.0002948292872901074, "step": 2670 }, { "epoch": 0.8905937291527685, "loss": 0.5371503233909607, "step": 2670 }, { "ce_loss": 0.2493179589509964, "epoch": 0.8905937291527685, "step": 2670 }, { "distill_loss": 0.11686110496520996, "epoch": 0.8905937291527685, "step": 2670 }, { "epoch": 0.8905937291527685, "ref_ce_loss": 0.14364975690841675, "step": 2670 }, { "epoch": 0.8905937291527685, "loss": 0.7579875588417053, "step": 2670 }, { "ce_loss": 0.29506489634513855, "epoch": 0.8905937291527685, "step": 2670 }, { "distill_loss": 0.13039056956768036, "epoch": 0.8905937291527685, "step": 2670 }, { "epoch": 0.8905937291527685, "ref_ce_loss": 0.20609255135059357, "step": 2670 }, { "epoch": 0.8905937291527685, "loss": 0.6515138745307922, "step": 2670 }, { "ce_loss": 0.2613978981971741, "epoch": 0.8905937291527685, "step": 2670 }, { "distill_loss": 0.16587205231189728, "epoch": 0.8905937291527685, "step": 2670 }, { "epoch": 0.8905937291527685, "ref_ce_loss": 0.1350294053554535, "step": 2670 }, { "epoch": 0.8905937291527685, "loss": 0.7277687788009644, "step": 2670 }, { "ce_loss": 0.28890296816825867, "epoch": 0.8905937291527685, "step": 2670 }, { "distill_loss": 0.15814818441867828, "epoch": 0.8905937291527685, "step": 2670 }, { "epoch": 0.8905937291527685, "ref_ce_loss": 0.19958439469337463, "step": 2670 }, { "epoch": 0.8939292861907938, "loss": 0.698, "step": 2680 }, { "epoch": 0.8939292861907938, "grad_norm": 1.6606212854385376, "step": 2680 }, { "epoch": 0.8939292861907938, "learning_rate": 0.000294776429107881, "step": 2680 }, { "epoch": 0.8939292861907938, "loss": 0.7957069873809814, "step": 2680 }, { "ce_loss": 0.3352309465408325, "epoch": 0.8939292861907938, "step": 2680 }, { "distill_loss": 0.11121989041566849, "epoch": 0.8939292861907938, "step": 2680 }, { "epoch": 0.8939292861907938, "ref_ce_loss": 0.20472891628742218, "step": 2680 }, { "epoch": 0.8939292861907938, "loss": 0.7181699275970459, "step": 2680 }, { "ce_loss": 0.2733495235443115, "epoch": 0.8939292861907938, "step": 2680 }, { "distill_loss": 0.11020343005657196, "epoch": 0.8939292861907938, "step": 2680 }, { "epoch": 0.8939292861907938, "ref_ce_loss": 0.135089710354805, "step": 2680 }, { "epoch": 0.8939292861907938, "loss": 0.8474379777908325, "step": 2680 }, { "ce_loss": 0.3044080138206482, "epoch": 0.8939292861907938, "step": 2680 }, { "distill_loss": 0.1047251969575882, "epoch": 0.8939292861907938, "step": 2680 }, { "epoch": 0.8939292861907938, "ref_ce_loss": 0.12011805176734924, "step": 2680 }, { "epoch": 0.8939292861907938, "loss": 0.8199704885482788, "step": 2680 }, { "ce_loss": 0.35054662823677063, "epoch": 0.8939292861907938, "step": 2680 }, { "distill_loss": 0.13516145944595337, "epoch": 0.8939292861907938, "step": 2680 }, { "epoch": 0.8939292861907938, "ref_ce_loss": 0.18137814104557037, "step": 2680 }, { "epoch": 0.8972648432288192, "loss": 0.678, "step": 2690 }, { "epoch": 0.8972648432288192, "grad_norm": 2.979337692260742, "step": 2690 }, { "epoch": 0.8972648432288192, "learning_rate": 0.0002947233069105575, "step": 2690 }, { "epoch": 0.8972648432288192, "loss": 1.02372145652771, "step": 2690 }, { "ce_loss": 0.2200205773115158, "epoch": 0.8972648432288192, "step": 2690 }, { "distill_loss": 0.10067467391490936, "epoch": 0.8972648432288192, "step": 2690 }, { "epoch": 0.8972648432288192, "ref_ce_loss": 0.17001208662986755, "step": 2690 }, { "epoch": 0.8972648432288192, "loss": 0.5303847789764404, "step": 2690 }, { "ce_loss": 0.2386377602815628, "epoch": 0.8972648432288192, "step": 2690 }, { "distill_loss": 0.11323900520801544, "epoch": 0.8972648432288192, "step": 2690 }, { "epoch": 0.8972648432288192, "ref_ce_loss": 0.17847669124603271, "step": 2690 }, { "epoch": 0.8972648432288192, "loss": 0.7682243585586548, "step": 2690 }, { "ce_loss": 0.3582816421985626, "epoch": 0.8972648432288192, "step": 2690 }, { "distill_loss": 0.1306098848581314, "epoch": 0.8972648432288192, "step": 2690 }, { "epoch": 0.8972648432288192, "ref_ce_loss": 0.21609355509281158, "step": 2690 }, { "epoch": 0.8972648432288192, "loss": 0.5860241651535034, "step": 2690 }, { "ce_loss": 0.2113438844680786, "epoch": 0.8972648432288192, "step": 2690 }, { "distill_loss": 0.10160598903894424, "epoch": 0.8972648432288192, "step": 2690 }, { "epoch": 0.8972648432288192, "ref_ce_loss": 0.17873786389827728, "step": 2690 }, { "epoch": 0.9006004002668445, "loss": 0.8651, "step": 2700 }, { "epoch": 0.9006004002668445, "grad_norm": 2.6056735515594482, "step": 2700 }, { "epoch": 0.9006004002668445, "learning_rate": 0.0002946699207950109, "step": 2700 }, { "epoch": 0.9006004002668445, "loss": 0.6318261623382568, "step": 2700 }, { "ce_loss": 0.2618946135044098, "epoch": 0.9006004002668445, "step": 2700 }, { "distill_loss": 0.11020223796367645, "epoch": 0.9006004002668445, "step": 2700 }, { "epoch": 0.9006004002668445, "ref_ce_loss": 0.1937880516052246, "step": 2700 }, { "epoch": 0.9006004002668445, "loss": 1.0718632936477661, "step": 2700 }, { "ce_loss": 0.27328452467918396, "epoch": 0.9006004002668445, "step": 2700 }, { "distill_loss": 0.13602453470230103, "epoch": 0.9006004002668445, "step": 2700 }, { "epoch": 0.9006004002668445, "ref_ce_loss": 0.24140718579292297, "step": 2700 }, { "epoch": 0.9006004002668445, "loss": 0.6519533395767212, "step": 2700 }, { "ce_loss": 0.2659465968608856, "epoch": 0.9006004002668445, "step": 2700 }, { "distill_loss": 0.11393342167139053, "epoch": 0.9006004002668445, "step": 2700 }, { "epoch": 0.9006004002668445, "ref_ce_loss": 0.14486336708068848, "step": 2700 }, { "epoch": 0.9006004002668445, "loss": 0.459310919046402, "step": 2700 }, { "ce_loss": 0.16971252858638763, "epoch": 0.9006004002668445, "step": 2700 }, { "distill_loss": 0.10560064762830734, "epoch": 0.9006004002668445, "step": 2700 }, { "epoch": 0.9006004002668445, "ref_ce_loss": 0.11843426525592804, "step": 2700 }, { "epoch": 0.9039359573048699, "loss": 0.8334, "step": 2710 }, { "epoch": 0.9039359573048699, "grad_norm": 2.7582480907440186, "step": 2710 }, { "epoch": 0.9039359573048699, "learning_rate": 0.0002946162708585964, "step": 2710 }, { "epoch": 0.9039359573048699, "loss": 0.5983908176422119, "step": 2710 }, { "ce_loss": 0.22176890075206757, "epoch": 0.9039359573048699, "step": 2710 }, { "distill_loss": 0.14231596887111664, "epoch": 0.9039359573048699, "step": 2710 }, { "epoch": 0.9039359573048699, "ref_ce_loss": 0.1662299633026123, "step": 2710 }, { "epoch": 0.9039359573048699, "loss": 0.7019459009170532, "step": 2710 }, { "ce_loss": 0.21000699698925018, "epoch": 0.9039359573048699, "step": 2710 }, { "distill_loss": 0.13449858129024506, "epoch": 0.9039359573048699, "step": 2710 }, { "epoch": 0.9039359573048699, "ref_ce_loss": 0.15763014554977417, "step": 2710 }, { "epoch": 0.9039359573048699, "loss": 0.46816378831863403, "step": 2710 }, { "ce_loss": 0.1613132506608963, "epoch": 0.9039359573048699, "step": 2710 }, { "distill_loss": 0.14022500813007355, "epoch": 0.9039359573048699, "step": 2710 }, { "epoch": 0.9039359573048699, "ref_ce_loss": 0.12063966691493988, "step": 2710 }, { "epoch": 0.9039359573048699, "loss": 0.8382180333137512, "step": 2710 }, { "ce_loss": 0.20074015855789185, "epoch": 0.9039359573048699, "step": 2710 }, { "distill_loss": 0.12996117770671844, "epoch": 0.9039359573048699, "step": 2710 }, { "epoch": 0.9039359573048699, "ref_ce_loss": 0.12067507207393646, "step": 2710 }, { "epoch": 0.9072715143428952, "loss": 0.7523, "step": 2720 }, { "epoch": 0.9072715143428952, "grad_norm": 2.3118250370025635, "step": 2720 }, { "epoch": 0.9072715143428952, "learning_rate": 0.0002945623571991503, "step": 2720 }, { "epoch": 0.9072715143428952, "loss": 0.6837984919548035, "step": 2720 }, { "ce_loss": 0.32145455479621887, "epoch": 0.9072715143428952, "step": 2720 }, { "distill_loss": 0.1139432042837143, "epoch": 0.9072715143428952, "step": 2720 }, { "epoch": 0.9072715143428952, "ref_ce_loss": 0.20749621093273163, "step": 2720 }, { "epoch": 0.9072715143428952, "loss": 0.5676144361495972, "step": 2720 }, { "ce_loss": 0.24475722014904022, "epoch": 0.9072715143428952, "step": 2720 }, { "distill_loss": 0.1074301153421402, "epoch": 0.9072715143428952, "step": 2720 }, { "epoch": 0.9072715143428952, "ref_ce_loss": 0.14644326269626617, "step": 2720 }, { "epoch": 0.9072715143428952, "loss": 0.7602922916412354, "step": 2720 }, { "ce_loss": 0.26372280716896057, "epoch": 0.9072715143428952, "step": 2720 }, { "distill_loss": 0.11422452330589294, "epoch": 0.9072715143428952, "step": 2720 }, { "epoch": 0.9072715143428952, "ref_ce_loss": 0.1716698259115219, "step": 2720 }, { "epoch": 0.9072715143428952, "loss": 0.8460100293159485, "step": 2720 }, { "ce_loss": 0.19480058550834656, "epoch": 0.9072715143428952, "step": 2720 }, { "distill_loss": 0.09263000637292862, "epoch": 0.9072715143428952, "step": 2720 }, { "epoch": 0.9072715143428952, "ref_ce_loss": 0.16573220491409302, "step": 2720 }, { "epoch": 0.9106070713809206, "loss": 0.7049, "step": 2730 }, { "epoch": 0.9106070713809206, "grad_norm": 3.8387629985809326, "step": 2730 }, { "epoch": 0.9106070713809206, "learning_rate": 0.0002945081799149899, "step": 2730 }, { "epoch": 0.9106070713809206, "loss": 0.7528659105300903, "step": 2730 }, { "ce_loss": 0.24429547786712646, "epoch": 0.9106070713809206, "step": 2730 }, { "distill_loss": 0.13893257081508636, "epoch": 0.9106070713809206, "step": 2730 }, { "epoch": 0.9106070713809206, "ref_ce_loss": 0.16770748794078827, "step": 2730 }, { "epoch": 0.9106070713809206, "loss": 0.8350508213043213, "step": 2730 }, { "ce_loss": 0.17379887402057648, "epoch": 0.9106070713809206, "step": 2730 }, { "distill_loss": 0.11916208267211914, "epoch": 0.9106070713809206, "step": 2730 }, { "epoch": 0.9106070713809206, "ref_ce_loss": 0.12559403479099274, "step": 2730 }, { "epoch": 0.9106070713809206, "loss": 0.8810762763023376, "step": 2730 }, { "ce_loss": 0.3514857292175293, "epoch": 0.9106070713809206, "step": 2730 }, { "distill_loss": 0.1322062462568283, "epoch": 0.9106070713809206, "step": 2730 }, { "epoch": 0.9106070713809206, "ref_ce_loss": 0.2726250886917114, "step": 2730 }, { "epoch": 0.9106070713809206, "loss": 0.6011776924133301, "step": 2730 }, { "ce_loss": 0.2854574918746948, "epoch": 0.9106070713809206, "step": 2730 }, { "distill_loss": 0.13027696311473846, "epoch": 0.9106070713809206, "step": 2730 }, { "epoch": 0.9106070713809206, "ref_ce_loss": 0.18520604074001312, "step": 2730 }, { "epoch": 0.9139426284189459, "loss": 0.7369, "step": 2740 }, { "epoch": 0.9139426284189459, "grad_norm": 3.095705509185791, "step": 2740 }, { "epoch": 0.9139426284189459, "learning_rate": 0.0002944537391049131, "step": 2740 }, { "epoch": 0.9139426284189459, "loss": 0.996353268623352, "step": 2740 }, { "ce_loss": 0.30720749497413635, "epoch": 0.9139426284189459, "step": 2740 }, { "distill_loss": 0.27163252234458923, "epoch": 0.9139426284189459, "step": 2740 }, { "epoch": 0.9139426284189459, "ref_ce_loss": 0.169632688164711, "step": 2740 }, { "epoch": 0.9139426284189459, "loss": 0.804191529750824, "step": 2740 }, { "ce_loss": 0.30768176913261414, "epoch": 0.9139426284189459, "step": 2740 }, { "distill_loss": 0.28657642006874084, "epoch": 0.9139426284189459, "step": 2740 }, { "epoch": 0.9139426284189459, "ref_ce_loss": 0.20976907014846802, "step": 2740 }, { "epoch": 0.9139426284189459, "loss": 0.6727897524833679, "step": 2740 }, { "ce_loss": 0.24741102755069733, "epoch": 0.9139426284189459, "step": 2740 }, { "distill_loss": 0.2660277187824249, "epoch": 0.9139426284189459, "step": 2740 }, { "epoch": 0.9139426284189459, "ref_ce_loss": 0.12837496399879456, "step": 2740 }, { "epoch": 0.9139426284189459, "loss": 0.810462474822998, "step": 2740 }, { "ce_loss": 0.18826617300510406, "epoch": 0.9139426284189459, "step": 2740 }, { "distill_loss": 0.2099066823720932, "epoch": 0.9139426284189459, "step": 2740 }, { "epoch": 0.9139426284189459, "ref_ce_loss": 0.10664959996938705, "step": 2740 }, { "epoch": 0.9172781854569713, "loss": 0.8526, "step": 2750 }, { "epoch": 0.9172781854569713, "grad_norm": 2.9879446029663086, "step": 2750 }, { "epoch": 0.9172781854569713, "learning_rate": 0.00029439903486819854, "step": 2750 }, { "epoch": 0.9172781854569713, "loss": 0.7249727845191956, "step": 2750 }, { "ce_loss": 0.2981150448322296, "epoch": 0.9172781854569713, "step": 2750 }, { "distill_loss": 0.13929876685142517, "epoch": 0.9172781854569713, "step": 2750 }, { "epoch": 0.9172781854569713, "ref_ce_loss": 0.17144180834293365, "step": 2750 }, { "epoch": 0.9172781854569713, "loss": 1.1745476722717285, "step": 2750 }, { "ce_loss": 0.2252090722322464, "epoch": 0.9172781854569713, "step": 2750 }, { "distill_loss": 0.12722156941890717, "epoch": 0.9172781854569713, "step": 2750 }, { "epoch": 0.9172781854569713, "ref_ce_loss": 0.15686529874801636, "step": 2750 }, { "epoch": 0.9172781854569713, "loss": 0.8217834234237671, "step": 2750 }, { "ce_loss": 0.28482645750045776, "epoch": 0.9172781854569713, "step": 2750 }, { "distill_loss": 0.16513006389141083, "epoch": 0.9172781854569713, "step": 2750 }, { "epoch": 0.9172781854569713, "ref_ce_loss": 0.1964171677827835, "step": 2750 }, { "epoch": 0.9172781854569713, "loss": 0.5900613069534302, "step": 2750 }, { "ce_loss": 0.2149066925048828, "epoch": 0.9172781854569713, "step": 2750 }, { "distill_loss": 0.13745594024658203, "epoch": 0.9172781854569713, "step": 2750 }, { "epoch": 0.9172781854569713, "ref_ce_loss": 0.1548822671175003, "step": 2750 }, { "epoch": 0.9206137424949966, "loss": 0.7581, "step": 2760 }, { "epoch": 0.9206137424949966, "grad_norm": 2.9715447425842285, "step": 2760 }, { "epoch": 0.9206137424949966, "learning_rate": 0.0002943440673046052, "step": 2760 }, { "epoch": 0.9206137424949966, "loss": 1.3678593635559082, "step": 2760 }, { "ce_loss": 0.3911042809486389, "epoch": 0.9206137424949966, "step": 2760 }, { "distill_loss": 0.3210551142692566, "epoch": 0.9206137424949966, "step": 2760 }, { "epoch": 0.9206137424949966, "ref_ce_loss": 0.1883934587240219, "step": 2760 }, { "epoch": 0.9206137424949966, "loss": 0.7819101810455322, "step": 2760 }, { "ce_loss": 0.22631727159023285, "epoch": 0.9206137424949966, "step": 2760 }, { "distill_loss": 0.27888303995132446, "epoch": 0.9206137424949966, "step": 2760 }, { "epoch": 0.9206137424949966, "ref_ce_loss": 0.15311084687709808, "step": 2760 }, { "epoch": 0.9206137424949966, "loss": 1.427027702331543, "step": 2760 }, { "ce_loss": 0.2932255268096924, "epoch": 0.9206137424949966, "step": 2760 }, { "distill_loss": 0.3127667009830475, "epoch": 0.9206137424949966, "step": 2760 }, { "epoch": 0.9206137424949966, "ref_ce_loss": 0.1879715919494629, "step": 2760 }, { "epoch": 0.9206137424949966, "loss": 0.8832669258117676, "step": 2760 }, { "ce_loss": 0.3134843111038208, "epoch": 0.9206137424949966, "step": 2760 }, { "distill_loss": 0.27922841906547546, "epoch": 0.9206137424949966, "step": 2760 }, { "epoch": 0.9206137424949966, "ref_ce_loss": 0.15278160572052002, "step": 2760 }, { "epoch": 0.923949299533022, "loss": 0.8373, "step": 2770 }, { "epoch": 0.923949299533022, "grad_norm": 2.631944417953491, "step": 2770 }, { "epoch": 0.923949299533022, "learning_rate": 0.0002942888365143721, "step": 2770 }, { "epoch": 0.923949299533022, "loss": 0.9981451630592346, "step": 2770 }, { "ce_loss": 0.22759626805782318, "epoch": 0.923949299533022, "step": 2770 }, { "distill_loss": 0.19656261801719666, "epoch": 0.923949299533022, "step": 2770 }, { "epoch": 0.923949299533022, "ref_ce_loss": 0.19053815305233002, "step": 2770 }, { "epoch": 0.923949299533022, "loss": 0.6683474779129028, "step": 2770 }, { "ce_loss": 0.2622044086456299, "epoch": 0.923949299533022, "step": 2770 }, { "distill_loss": 0.2272324562072754, "epoch": 0.923949299533022, "step": 2770 }, { "epoch": 0.923949299533022, "ref_ce_loss": 0.17883671820163727, "step": 2770 }, { "epoch": 0.923949299533022, "loss": 0.6071282625198364, "step": 2770 }, { "ce_loss": 0.20569933950901031, "epoch": 0.923949299533022, "step": 2770 }, { "distill_loss": 0.20898611843585968, "epoch": 0.923949299533022, "step": 2770 }, { "epoch": 0.923949299533022, "ref_ce_loss": 0.09778912365436554, "step": 2770 }, { "epoch": 0.923949299533022, "loss": 0.5389513969421387, "step": 2770 }, { "ce_loss": 0.17628800868988037, "epoch": 0.923949299533022, "step": 2770 }, { "distill_loss": 0.18154376745224, "epoch": 0.923949299533022, "step": 2770 }, { "epoch": 0.923949299533022, "ref_ce_loss": 0.14046207070350647, "step": 2770 }, { "epoch": 0.9272848565710473, "loss": 0.808, "step": 2780 }, { "epoch": 0.9272848565710473, "grad_norm": 2.491095781326294, "step": 2780 }, { "epoch": 0.9272848565710473, "learning_rate": 0.00029423334259821854, "step": 2780 }, { "epoch": 0.9272848565710473, "loss": 0.7875550389289856, "step": 2780 }, { "ce_loss": 0.29149720072746277, "epoch": 0.9272848565710473, "step": 2780 }, { "distill_loss": 0.26492205262184143, "epoch": 0.9272848565710473, "step": 2780 }, { "epoch": 0.9272848565710473, "ref_ce_loss": 0.15465959906578064, "step": 2780 }, { "epoch": 0.9272848565710473, "loss": 0.7306146621704102, "step": 2780 }, { "ce_loss": 0.29060646891593933, "epoch": 0.9272848565710473, "step": 2780 }, { "distill_loss": 0.22705847024917603, "epoch": 0.9272848565710473, "step": 2780 }, { "epoch": 0.9272848565710473, "ref_ce_loss": 0.14852774143218994, "step": 2780 }, { "epoch": 0.9272848565710473, "loss": 0.7648739814758301, "step": 2780 }, { "ce_loss": 0.3341917097568512, "epoch": 0.9272848565710473, "step": 2780 }, { "distill_loss": 0.227829247713089, "epoch": 0.9272848565710473, "step": 2780 }, { "epoch": 0.9272848565710473, "ref_ce_loss": 0.14977265894412994, "step": 2780 }, { "epoch": 0.9272848565710473, "loss": 0.984939694404602, "step": 2780 }, { "ce_loss": 0.37465381622314453, "epoch": 0.9272848565710473, "step": 2780 }, { "distill_loss": 0.2559279799461365, "epoch": 0.9272848565710473, "step": 2780 }, { "epoch": 0.9272848565710473, "ref_ce_loss": 0.2502400577068329, "step": 2780 }, { "epoch": 0.9306204136090727, "loss": 0.8007, "step": 2790 }, { "epoch": 0.9306204136090727, "grad_norm": 2.9029388427734375, "step": 2790 }, { "epoch": 0.9306204136090727, "learning_rate": 0.0002941775856573435, "step": 2790 }, { "epoch": 0.9306204136090727, "loss": 0.7560724020004272, "step": 2790 }, { "ce_loss": 0.24939562380313873, "epoch": 0.9306204136090727, "step": 2790 }, { "distill_loss": 0.24355027079582214, "epoch": 0.9306204136090727, "step": 2790 }, { "epoch": 0.9306204136090727, "ref_ce_loss": 0.17152981460094452, "step": 2790 }, { "epoch": 0.9306204136090727, "loss": 0.8174520134925842, "step": 2790 }, { "ce_loss": 0.2900400757789612, "epoch": 0.9306204136090727, "step": 2790 }, { "distill_loss": 0.2231612205505371, "epoch": 0.9306204136090727, "step": 2790 }, { "epoch": 0.9306204136090727, "ref_ce_loss": 0.2126293033361435, "step": 2790 }, { "epoch": 0.9306204136090727, "loss": 0.9072144031524658, "step": 2790 }, { "ce_loss": 0.2195039987564087, "epoch": 0.9306204136090727, "step": 2790 }, { "distill_loss": 0.22568339109420776, "epoch": 0.9306204136090727, "step": 2790 }, { "epoch": 0.9306204136090727, "ref_ce_loss": 0.10249602794647217, "step": 2790 }, { "epoch": 0.9306204136090727, "loss": 0.7943201065063477, "step": 2790 }, { "ce_loss": 0.23619568347930908, "epoch": 0.9306204136090727, "step": 2790 }, { "distill_loss": 0.19247688353061676, "epoch": 0.9306204136090727, "step": 2790 }, { "epoch": 0.9306204136090727, "ref_ce_loss": 0.14785772562026978, "step": 2790 }, { "epoch": 0.933955970647098, "loss": 0.7842, "step": 2800 }, { "epoch": 0.933955970647098, "grad_norm": 2.3317878246307373, "step": 2800 }, { "epoch": 0.933955970647098, "learning_rate": 0.0002941215657934256, "step": 2800 }, { "epoch": 0.933955970647098, "loss": 0.7021505236625671, "step": 2800 }, { "ce_loss": 0.34092476963996887, "epoch": 0.933955970647098, "step": 2800 }, { "distill_loss": 0.15701162815093994, "epoch": 0.933955970647098, "step": 2800 }, { "epoch": 0.933955970647098, "ref_ce_loss": 0.20417509973049164, "step": 2800 }, { "epoch": 0.933955970647098, "loss": 0.6944257020950317, "step": 2800 }, { "ce_loss": 0.2710808515548706, "epoch": 0.933955970647098, "step": 2800 }, { "distill_loss": 0.21411262452602386, "epoch": 0.933955970647098, "step": 2800 }, { "epoch": 0.933955970647098, "ref_ce_loss": 0.17248453199863434, "step": 2800 }, { "epoch": 0.933955970647098, "loss": 0.3621445596218109, "step": 2800 }, { "ce_loss": 0.14474163949489594, "epoch": 0.933955970647098, "step": 2800 }, { "distill_loss": 0.14359624683856964, "epoch": 0.933955970647098, "step": 2800 }, { "epoch": 0.933955970647098, "ref_ce_loss": 0.07347650825977325, "step": 2800 }, { "epoch": 0.933955970647098, "loss": 0.8690259456634521, "step": 2800 }, { "ce_loss": 0.3273351192474365, "epoch": 0.933955970647098, "step": 2800 }, { "distill_loss": 0.19411294162273407, "epoch": 0.933955970647098, "step": 2800 }, { "epoch": 0.933955970647098, "ref_ce_loss": 0.16561636328697205, "step": 2800 }, { "epoch": 0.9372915276851234, "loss": 0.7396, "step": 2810 }, { "epoch": 0.9372915276851234, "grad_norm": 1.872754693031311, "step": 2810 }, { "epoch": 0.9372915276851234, "learning_rate": 0.00029406528310862306, "step": 2810 }, { "epoch": 0.9372915276851234, "loss": 0.6328253746032715, "step": 2810 }, { "ce_loss": 0.26702645421028137, "epoch": 0.9372915276851234, "step": 2810 }, { "distill_loss": 0.11107096076011658, "epoch": 0.9372915276851234, "step": 2810 }, { "epoch": 0.9372915276851234, "ref_ce_loss": 0.12137383967638016, "step": 2810 }, { "epoch": 0.9372915276851234, "loss": 0.7653927803039551, "step": 2810 }, { "ce_loss": 0.32979145646095276, "epoch": 0.9372915276851234, "step": 2810 }, { "distill_loss": 0.14815056324005127, "epoch": 0.9372915276851234, "step": 2810 }, { "epoch": 0.9372915276851234, "ref_ce_loss": 0.14542245864868164, "step": 2810 }, { "epoch": 0.9372915276851234, "loss": 0.793331503868103, "step": 2810 }, { "ce_loss": 0.28229960799217224, "epoch": 0.9372915276851234, "step": 2810 }, { "distill_loss": 0.14099378883838654, "epoch": 0.9372915276851234, "step": 2810 }, { "epoch": 0.9372915276851234, "ref_ce_loss": 0.1897796094417572, "step": 2810 }, { "epoch": 0.9372915276851234, "loss": 0.4289068877696991, "step": 2810 }, { "ce_loss": 0.12203547358512878, "epoch": 0.9372915276851234, "step": 2810 }, { "distill_loss": 0.1050165444612503, "epoch": 0.9372915276851234, "step": 2810 }, { "epoch": 0.9372915276851234, "ref_ce_loss": 0.10393452644348145, "step": 2810 }, { "epoch": 0.9406270847231488, "loss": 0.8713, "step": 2820 }, { "epoch": 0.9406270847231488, "grad_norm": 5.31164026260376, "step": 2820 }, { "epoch": 0.9406270847231488, "learning_rate": 0.00029400873770557323, "step": 2820 }, { "epoch": 0.9406270847231488, "loss": 0.552436888217926, "step": 2820 }, { "ce_loss": 0.17268088459968567, "epoch": 0.9406270847231488, "step": 2820 }, { "distill_loss": 0.26252999901771545, "epoch": 0.9406270847231488, "step": 2820 }, { "epoch": 0.9406270847231488, "ref_ce_loss": 0.11688818782567978, "step": 2820 }, { "epoch": 0.9406270847231488, "loss": 0.6593774557113647, "step": 2820 }, { "ce_loss": 0.16558508574962616, "epoch": 0.9406270847231488, "step": 2820 }, { "distill_loss": 0.21160022914409637, "epoch": 0.9406270847231488, "step": 2820 }, { "epoch": 0.9406270847231488, "ref_ce_loss": 0.1414853036403656, "step": 2820 }, { "epoch": 0.9406270847231488, "loss": 0.8572372198104858, "step": 2820 }, { "ce_loss": 0.2560436427593231, "epoch": 0.9406270847231488, "step": 2820 }, { "distill_loss": 0.2739320695400238, "epoch": 0.9406270847231488, "step": 2820 }, { "epoch": 0.9406270847231488, "ref_ce_loss": 0.1760285347700119, "step": 2820 }, { "epoch": 0.9406270847231488, "loss": 0.812274694442749, "step": 2820 }, { "ce_loss": 0.21347567439079285, "epoch": 0.9406270847231488, "step": 2820 }, { "distill_loss": 0.31114456057548523, "epoch": 0.9406270847231488, "step": 2820 }, { "epoch": 0.9406270847231488, "ref_ce_loss": 0.15867622196674347, "step": 2820 }, { "epoch": 0.9439626417611742, "loss": 1.0656, "step": 2830 }, { "epoch": 0.9439626417611742, "grad_norm": 3.638387441635132, "step": 2830 }, { "epoch": 0.9439626417611742, "learning_rate": 0.00029395192968739264, "step": 2830 }, { "epoch": 0.9439626417611742, "loss": 1.7968251705169678, "step": 2830 }, { "ce_loss": 0.4090914726257324, "epoch": 0.9439626417611742, "step": 2830 }, { "distill_loss": 0.6144574880599976, "epoch": 0.9439626417611742, "step": 2830 }, { "epoch": 0.9439626417611742, "ref_ce_loss": 0.1680503636598587, "step": 2830 }, { "epoch": 0.9439626417611742, "loss": 0.9041198492050171, "step": 2830 }, { "ce_loss": 0.22265367209911346, "epoch": 0.9439626417611742, "step": 2830 }, { "distill_loss": 0.37538236379623413, "epoch": 0.9439626417611742, "step": 2830 }, { "epoch": 0.9439626417611742, "ref_ce_loss": 0.160588338971138, "step": 2830 }, { "epoch": 0.9439626417611742, "loss": 0.7223681807518005, "step": 2830 }, { "ce_loss": 0.18482926487922668, "epoch": 0.9439626417611742, "step": 2830 }, { "distill_loss": 0.3430609107017517, "epoch": 0.9439626417611742, "step": 2830 }, { "epoch": 0.9439626417611742, "ref_ce_loss": 0.11669516563415527, "step": 2830 }, { "epoch": 0.9439626417611742, "loss": 0.8510680794715881, "step": 2830 }, { "ce_loss": 0.12880350649356842, "epoch": 0.9439626417611742, "step": 2830 }, { "distill_loss": 0.42931199073791504, "epoch": 0.9439626417611742, "step": 2830 }, { "epoch": 0.9439626417611742, "ref_ce_loss": 0.11357249319553375, "step": 2830 }, { "epoch": 0.9472981987991995, "loss": 0.9909, "step": 2840 }, { "epoch": 0.9472981987991995, "grad_norm": 2.5945699214935303, "step": 2840 }, { "epoch": 0.9472981987991995, "learning_rate": 0.00029389485915767675, "step": 2840 }, { "epoch": 0.9472981987991995, "loss": 1.5304577350616455, "step": 2840 }, { "ce_loss": 0.37256667017936707, "epoch": 0.9472981987991995, "step": 2840 }, { "distill_loss": 0.2691047191619873, "epoch": 0.9472981987991995, "step": 2840 }, { "epoch": 0.9472981987991995, "ref_ce_loss": 0.16921545565128326, "step": 2840 }, { "epoch": 0.9472981987991995, "loss": 0.7942785620689392, "step": 2840 }, { "ce_loss": 0.2432491034269333, "epoch": 0.9472981987991995, "step": 2840 }, { "distill_loss": 0.299993097782135, "epoch": 0.9472981987991995, "step": 2840 }, { "epoch": 0.9472981987991995, "ref_ce_loss": 0.12675027549266815, "step": 2840 }, { "epoch": 0.9472981987991995, "loss": 0.7761873006820679, "step": 2840 }, { "ce_loss": 0.2942223846912384, "epoch": 0.9472981987991995, "step": 2840 }, { "distill_loss": 0.3370579481124878, "epoch": 0.9472981987991995, "step": 2840 }, { "epoch": 0.9472981987991995, "ref_ce_loss": 0.14476382732391357, "step": 2840 }, { "epoch": 0.9472981987991995, "loss": 0.8187755942344666, "step": 2840 }, { "ce_loss": 0.16483481228351593, "epoch": 0.9472981987991995, "step": 2840 }, { "distill_loss": 0.3002135455608368, "epoch": 0.9472981987991995, "step": 2840 }, { "epoch": 0.9472981987991995, "ref_ce_loss": 0.1228487640619278, "step": 2840 }, { "epoch": 0.9506337558372249, "loss": 0.781, "step": 2850 }, { "epoch": 0.9506337558372249, "grad_norm": 3.0309741497039795, "step": 2850 }, { "epoch": 0.9506337558372249, "learning_rate": 0.0002938375262204996, "step": 2850 }, { "epoch": 0.9506337558372249, "loss": 0.8299252390861511, "step": 2850 }, { "ce_loss": 0.18990515172481537, "epoch": 0.9506337558372249, "step": 2850 }, { "distill_loss": 0.16923873126506805, "epoch": 0.9506337558372249, "step": 2850 }, { "epoch": 0.9506337558372249, "ref_ce_loss": 0.11406919360160828, "step": 2850 }, { "epoch": 0.9506337558372249, "loss": 0.6928610801696777, "step": 2850 }, { "ce_loss": 0.23902356624603271, "epoch": 0.9506337558372249, "step": 2850 }, { "distill_loss": 0.19913634657859802, "epoch": 0.9506337558372249, "step": 2850 }, { "epoch": 0.9506337558372249, "ref_ce_loss": 0.18962852656841278, "step": 2850 }, { "epoch": 0.9506337558372249, "loss": 0.7069710493087769, "step": 2850 }, { "ce_loss": 0.24961289763450623, "epoch": 0.9506337558372249, "step": 2850 }, { "distill_loss": 0.15987449884414673, "epoch": 0.9506337558372249, "step": 2850 }, { "epoch": 0.9506337558372249, "ref_ce_loss": 0.22801867127418518, "step": 2850 }, { "epoch": 0.9506337558372249, "loss": 1.0759639739990234, "step": 2850 }, { "ce_loss": 0.1746206283569336, "epoch": 0.9506337558372249, "step": 2850 }, { "distill_loss": 0.1477518379688263, "epoch": 0.9506337558372249, "step": 2850 }, { "epoch": 0.9506337558372249, "ref_ce_loss": 0.1326025128364563, "step": 2850 }, { "epoch": 0.9539693128752502, "loss": 0.7967, "step": 2860 }, { "epoch": 0.9539693128752502, "grad_norm": 2.4280905723571777, "step": 2860 }, { "epoch": 0.9539693128752502, "learning_rate": 0.0002937799309804139, "step": 2860 }, { "epoch": 0.9539693128752502, "loss": 0.4592561423778534, "step": 2860 }, { "ce_loss": 0.1547430455684662, "epoch": 0.9539693128752502, "step": 2860 }, { "distill_loss": 0.1239887923002243, "epoch": 0.9539693128752502, "step": 2860 }, { "epoch": 0.9539693128752502, "ref_ce_loss": 0.10273492336273193, "step": 2860 }, { "epoch": 0.9539693128752502, "loss": 0.560876190662384, "step": 2860 }, { "ce_loss": 0.20759513974189758, "epoch": 0.9539693128752502, "step": 2860 }, { "distill_loss": 0.1149979904294014, "epoch": 0.9539693128752502, "step": 2860 }, { "epoch": 0.9539693128752502, "ref_ce_loss": 0.16428732872009277, "step": 2860 }, { "epoch": 0.9539693128752502, "loss": 0.687434732913971, "step": 2860 }, { "ce_loss": 0.25539320707321167, "epoch": 0.9539693128752502, "step": 2860 }, { "distill_loss": 0.15676721930503845, "epoch": 0.9539693128752502, "step": 2860 }, { "epoch": 0.9539693128752502, "ref_ce_loss": 0.1433422714471817, "step": 2860 }, { "epoch": 0.9539693128752502, "loss": 0.668256402015686, "step": 2860 }, { "ce_loss": 0.2717263400554657, "epoch": 0.9539693128752502, "step": 2860 }, { "distill_loss": 0.14029839634895325, "epoch": 0.9539693128752502, "step": 2860 }, { "epoch": 0.9539693128752502, "ref_ce_loss": 0.14045491814613342, "step": 2860 }, { "epoch": 0.9573048699132756, "loss": 0.724, "step": 2870 }, { "epoch": 0.9573048699132756, "grad_norm": 2.734346866607666, "step": 2870 }, { "epoch": 0.9573048699132756, "learning_rate": 0.0002937220735424506, "step": 2870 }, { "epoch": 0.9573048699132756, "loss": 0.6680867075920105, "step": 2870 }, { "ce_loss": 0.21752631664276123, "epoch": 0.9573048699132756, "step": 2870 }, { "distill_loss": 0.15151241421699524, "epoch": 0.9573048699132756, "step": 2870 }, { "epoch": 0.9573048699132756, "ref_ce_loss": 0.11191045492887497, "step": 2870 }, { "epoch": 0.9573048699132756, "loss": 0.6690918803215027, "step": 2870 }, { "ce_loss": 0.16620102524757385, "epoch": 0.9573048699132756, "step": 2870 }, { "distill_loss": 0.16352228820323944, "epoch": 0.9573048699132756, "step": 2870 }, { "epoch": 0.9573048699132756, "ref_ce_loss": 0.1283581703901291, "step": 2870 }, { "epoch": 0.9573048699132756, "loss": 0.8832834959030151, "step": 2870 }, { "ce_loss": 0.20673666894435883, "epoch": 0.9573048699132756, "step": 2870 }, { "distill_loss": 0.16823595762252808, "epoch": 0.9573048699132756, "step": 2870 }, { "epoch": 0.9573048699132756, "ref_ce_loss": 0.19927184283733368, "step": 2870 }, { "epoch": 0.9573048699132756, "loss": 0.7416874766349792, "step": 2870 }, { "ce_loss": 0.22405901551246643, "epoch": 0.9573048699132756, "step": 2870 }, { "distill_loss": 0.15006643533706665, "epoch": 0.9573048699132756, "step": 2870 }, { "epoch": 0.9573048699132756, "ref_ce_loss": 0.21225836873054504, "step": 2870 }, { "epoch": 0.9606404269513009, "loss": 0.738, "step": 2880 }, { "epoch": 0.9606404269513009, "grad_norm": 2.3339405059814453, "step": 2880 }, { "epoch": 0.9606404269513009, "learning_rate": 0.0002936639540121189, "step": 2880 }, { "epoch": 0.9606404269513009, "loss": 1.1273889541625977, "step": 2880 }, { "ce_loss": 0.1480511724948883, "epoch": 0.9606404269513009, "step": 2880 }, { "distill_loss": 0.0853370949625969, "epoch": 0.9606404269513009, "step": 2880 }, { "epoch": 0.9606404269513009, "ref_ce_loss": 0.11499093472957611, "step": 2880 }, { "epoch": 0.9606404269513009, "loss": 0.6598379611968994, "step": 2880 }, { "ce_loss": 0.30389922857284546, "epoch": 0.9606404269513009, "step": 2880 }, { "distill_loss": 0.11932610720396042, "epoch": 0.9606404269513009, "step": 2880 }, { "epoch": 0.9606404269513009, "ref_ce_loss": 0.16129109263420105, "step": 2880 }, { "epoch": 0.9606404269513009, "loss": 0.6228227019309998, "step": 2880 }, { "ce_loss": 0.26174575090408325, "epoch": 0.9606404269513009, "step": 2880 }, { "distill_loss": 0.1098339632153511, "epoch": 0.9606404269513009, "step": 2880 }, { "epoch": 0.9606404269513009, "ref_ce_loss": 0.17401239275932312, "step": 2880 }, { "epoch": 0.9606404269513009, "loss": 0.491678386926651, "step": 2880 }, { "ce_loss": 0.208614781498909, "epoch": 0.9606404269513009, "step": 2880 }, { "distill_loss": 0.10472996532917023, "epoch": 0.9606404269513009, "step": 2880 }, { "epoch": 0.9606404269513009, "ref_ce_loss": 0.17823967337608337, "step": 2880 }, { "epoch": 0.9639759839893263, "loss": 0.6991, "step": 2890 }, { "epoch": 0.9639759839893263, "grad_norm": 1.9527865648269653, "step": 2890 }, { "epoch": 0.9639759839893263, "learning_rate": 0.0002936055724954059, "step": 2890 }, { "epoch": 0.9639759839893263, "loss": 0.6537126302719116, "step": 2890 }, { "ce_loss": 0.32857227325439453, "epoch": 0.9639759839893263, "step": 2890 }, { "distill_loss": 0.14058195054531097, "epoch": 0.9639759839893263, "step": 2890 }, { "epoch": 0.9639759839893263, "ref_ce_loss": 0.18424710631370544, "step": 2890 }, { "epoch": 0.9639759839893263, "loss": 0.621168851852417, "step": 2890 }, { "ce_loss": 0.24660468101501465, "epoch": 0.9639759839893263, "step": 2890 }, { "distill_loss": 0.14954647421836853, "epoch": 0.9639759839893263, "step": 2890 }, { "epoch": 0.9639759839893263, "ref_ce_loss": 0.1393446922302246, "step": 2890 }, { "epoch": 0.9639759839893263, "loss": 0.6661877036094666, "step": 2890 }, { "ce_loss": 0.26231226325035095, "epoch": 0.9639759839893263, "step": 2890 }, { "distill_loss": 0.12446777522563934, "epoch": 0.9639759839893263, "step": 2890 }, { "epoch": 0.9639759839893263, "ref_ce_loss": 0.21287235617637634, "step": 2890 }, { "epoch": 0.9639759839893263, "loss": 0.4061315953731537, "step": 2890 }, { "ce_loss": 0.1379050761461258, "epoch": 0.9639759839893263, "step": 2890 }, { "distill_loss": 0.10318372398614883, "epoch": 0.9639759839893263, "step": 2890 }, { "epoch": 0.9639759839893263, "ref_ce_loss": 0.13453784584999084, "step": 2890 }, { "epoch": 0.9673115410273516, "loss": 0.6889, "step": 2900 }, { "epoch": 0.9673115410273516, "grad_norm": 1.8287708759307861, "step": 2900 }, { "epoch": 0.9673115410273516, "learning_rate": 0.0002935469290987765, "step": 2900 }, { "epoch": 0.9673115410273516, "loss": 0.6817537546157837, "step": 2900 }, { "ce_loss": 0.31336405873298645, "epoch": 0.9673115410273516, "step": 2900 }, { "distill_loss": 0.14172199368476868, "epoch": 0.9673115410273516, "step": 2900 }, { "epoch": 0.9673115410273516, "ref_ce_loss": 0.2252521812915802, "step": 2900 }, { "epoch": 0.9673115410273516, "loss": 0.5109325647354126, "step": 2900 }, { "ce_loss": 0.1633598655462265, "epoch": 0.9673115410273516, "step": 2900 }, { "distill_loss": 0.12366708368062973, "epoch": 0.9673115410273516, "step": 2900 }, { "epoch": 0.9673115410273516, "ref_ce_loss": 0.11546748876571655, "step": 2900 }, { "epoch": 0.9673115410273516, "loss": 0.6547499895095825, "step": 2900 }, { "ce_loss": 0.29048779606819153, "epoch": 0.9673115410273516, "step": 2900 }, { "distill_loss": 0.1456313133239746, "epoch": 0.9673115410273516, "step": 2900 }, { "epoch": 0.9673115410273516, "ref_ce_loss": 0.21713127195835114, "step": 2900 }, { "epoch": 0.9673115410273516, "loss": 0.6343402862548828, "step": 2900 }, { "ce_loss": 0.22194309532642365, "epoch": 0.9673115410273516, "step": 2900 }, { "distill_loss": 0.14054368436336517, "epoch": 0.9673115410273516, "step": 2900 }, { "epoch": 0.9673115410273516, "ref_ce_loss": 0.1832578331232071, "step": 2900 }, { "epoch": 0.970647098065377, "loss": 0.6749, "step": 2910 }, { "epoch": 0.970647098065377, "grad_norm": 2.4779341220855713, "step": 2910 }, { "epoch": 0.970647098065377, "learning_rate": 0.00029348802392917305, "step": 2910 }, { "epoch": 0.970647098065377, "loss": 0.5308644771575928, "step": 2910 }, { "ce_loss": 0.2553784251213074, "epoch": 0.970647098065377, "step": 2910 }, { "distill_loss": 0.12964846193790436, "epoch": 0.970647098065377, "step": 2910 }, { "epoch": 0.970647098065377, "ref_ce_loss": 0.1454658955335617, "step": 2910 }, { "epoch": 0.970647098065377, "loss": 0.7324144244194031, "step": 2910 }, { "ce_loss": 0.35473960638046265, "epoch": 0.970647098065377, "step": 2910 }, { "distill_loss": 0.13062384724617004, "epoch": 0.970647098065377, "step": 2910 }, { "epoch": 0.970647098065377, "ref_ce_loss": 0.16410627961158752, "step": 2910 }, { "epoch": 0.970647098065377, "loss": 0.6389783620834351, "step": 2910 }, { "ce_loss": 0.2581387162208557, "epoch": 0.970647098065377, "step": 2910 }, { "distill_loss": 0.11254259198904037, "epoch": 0.970647098065377, "step": 2910 }, { "epoch": 0.970647098065377, "ref_ce_loss": 0.10073118656873703, "step": 2910 }, { "epoch": 0.970647098065377, "loss": 0.8085727691650391, "step": 2910 }, { "ce_loss": 0.29789814352989197, "epoch": 0.970647098065377, "step": 2910 }, { "distill_loss": 0.11773893237113953, "epoch": 0.970647098065377, "step": 2910 }, { "epoch": 0.970647098065377, "ref_ce_loss": 0.1585742086172104, "step": 2910 }, { "epoch": 0.9739826551034023, "loss": 0.689, "step": 2920 }, { "epoch": 0.9739826551034023, "grad_norm": 2.5197560787200928, "step": 2920 }, { "epoch": 0.9739826551034023, "learning_rate": 0.0002934288570940153, "step": 2920 }, { "epoch": 0.9739826551034023, "loss": 0.7948296070098877, "step": 2920 }, { "ce_loss": 0.1193004623055458, "epoch": 0.9739826551034023, "step": 2920 }, { "distill_loss": 0.11356161534786224, "epoch": 0.9739826551034023, "step": 2920 }, { "epoch": 0.9739826551034023, "ref_ce_loss": 0.10010401904582977, "step": 2920 }, { "epoch": 0.9739826551034023, "loss": 0.8971014022827148, "step": 2920 }, { "ce_loss": 0.4552440345287323, "epoch": 0.9739826551034023, "step": 2920 }, { "distill_loss": 0.16932310163974762, "epoch": 0.9739826551034023, "step": 2920 }, { "epoch": 0.9739826551034023, "ref_ce_loss": 0.189690962433815, "step": 2920 }, { "epoch": 0.9739826551034023, "loss": 1.1074857711791992, "step": 2920 }, { "ce_loss": 0.2667383551597595, "epoch": 0.9739826551034023, "step": 2920 }, { "distill_loss": 0.10355770587921143, "epoch": 0.9739826551034023, "step": 2920 }, { "epoch": 0.9739826551034023, "ref_ce_loss": 0.1737743765115738, "step": 2920 }, { "epoch": 0.9739826551034023, "loss": 0.5775800943374634, "step": 2920 }, { "ce_loss": 0.12394880503416061, "epoch": 0.9739826551034023, "step": 2920 }, { "distill_loss": 0.09791535139083862, "epoch": 0.9739826551034023, "step": 2920 }, { "epoch": 0.9739826551034023, "ref_ce_loss": 0.1486055999994278, "step": 2920 }, { "epoch": 0.9773182121414277, "loss": 0.6744, "step": 2930 }, { "epoch": 0.9773182121414277, "grad_norm": 3.1562507152557373, "step": 2930 }, { "epoch": 0.9773182121414277, "learning_rate": 0.00029336942870120033, "step": 2930 }, { "epoch": 0.9773182121414277, "loss": 1.3249276876449585, "step": 2930 }, { "ce_loss": 0.3045770823955536, "epoch": 0.9773182121414277, "step": 2930 }, { "distill_loss": 0.12404598295688629, "epoch": 0.9773182121414277, "step": 2930 }, { "epoch": 0.9773182121414277, "ref_ce_loss": 0.15918442606925964, "step": 2930 }, { "epoch": 0.9773182121414277, "loss": 0.7175264954566956, "step": 2930 }, { "ce_loss": 0.19825127720832825, "epoch": 0.9773182121414277, "step": 2930 }, { "distill_loss": 0.1021764725446701, "epoch": 0.9773182121414277, "step": 2930 }, { "epoch": 0.9773182121414277, "ref_ce_loss": 0.16720989346504211, "step": 2930 }, { "epoch": 0.9773182121414277, "loss": 0.648270845413208, "step": 2930 }, { "ce_loss": 0.15907418727874756, "epoch": 0.9773182121414277, "step": 2930 }, { "distill_loss": 0.09872958809137344, "epoch": 0.9773182121414277, "step": 2930 }, { "epoch": 0.9773182121414277, "ref_ce_loss": 0.13381551206111908, "step": 2930 }, { "epoch": 0.9773182121414277, "loss": 0.8462702631950378, "step": 2930 }, { "ce_loss": 0.4205211400985718, "epoch": 0.9773182121414277, "step": 2930 }, { "distill_loss": 0.1454874575138092, "epoch": 0.9773182121414277, "step": 2930 }, { "epoch": 0.9773182121414277, "ref_ce_loss": 0.1591566801071167, "step": 2930 }, { "epoch": 0.980653769179453, "loss": 0.6945, "step": 2940 }, { "epoch": 0.980653769179453, "grad_norm": 2.639544725418091, "step": 2940 }, { "epoch": 0.980653769179453, "learning_rate": 0.000293309738859102, "step": 2940 }, { "epoch": 0.980653769179453, "loss": 0.4652785658836365, "step": 2940 }, { "ce_loss": 0.17041869461536407, "epoch": 0.980653769179453, "step": 2940 }, { "distill_loss": 0.12091349810361862, "epoch": 0.980653769179453, "step": 2940 }, { "epoch": 0.980653769179453, "ref_ce_loss": 0.12496335804462433, "step": 2940 }, { "epoch": 0.980653769179453, "loss": 0.6894442439079285, "step": 2940 }, { "ce_loss": 0.19380292296409607, "epoch": 0.980653769179453, "step": 2940 }, { "distill_loss": 0.12530314922332764, "epoch": 0.980653769179453, "step": 2940 }, { "epoch": 0.980653769179453, "ref_ce_loss": 0.14851728081703186, "step": 2940 }, { "epoch": 0.980653769179453, "loss": 0.523069441318512, "step": 2940 }, { "ce_loss": 0.1773136556148529, "epoch": 0.980653769179453, "step": 2940 }, { "distill_loss": 0.12015138566493988, "epoch": 0.980653769179453, "step": 2940 }, { "epoch": 0.980653769179453, "ref_ce_loss": 0.11531037092208862, "step": 2940 }, { "epoch": 0.980653769179453, "loss": 0.6160759329795837, "step": 2940 }, { "ce_loss": 0.20735104382038116, "epoch": 0.980653769179453, "step": 2940 }, { "distill_loss": 0.13009101152420044, "epoch": 0.980653769179453, "step": 2940 }, { "epoch": 0.980653769179453, "ref_ce_loss": 0.13989074528217316, "step": 2940 }, { "epoch": 0.9839893262174784, "loss": 1.0021, "step": 2950 }, { "epoch": 0.9839893262174784, "grad_norm": 2.5855627059936523, "step": 2950 }, { "epoch": 0.9839893262174784, "learning_rate": 0.0002932497876765711, "step": 2950 }, { "epoch": 0.9839893262174784, "loss": 0.606486976146698, "step": 2950 }, { "ce_loss": 0.22377228736877441, "epoch": 0.9839893262174784, "step": 2950 }, { "distill_loss": 0.2075396329164505, "epoch": 0.9839893262174784, "step": 2950 }, { "epoch": 0.9839893262174784, "ref_ce_loss": 0.10233394056558609, "step": 2950 }, { "epoch": 0.9839893262174784, "loss": 0.5983362197875977, "step": 2950 }, { "ce_loss": 0.18260978162288666, "epoch": 0.9839893262174784, "step": 2950 }, { "distill_loss": 0.1780635416507721, "epoch": 0.9839893262174784, "step": 2950 }, { "epoch": 0.9839893262174784, "ref_ce_loss": 0.08419310301542282, "step": 2950 }, { "epoch": 0.9839893262174784, "loss": 0.7534973621368408, "step": 2950 }, { "ce_loss": 0.21013881266117096, "epoch": 0.9839893262174784, "step": 2950 }, { "distill_loss": 0.24599647521972656, "epoch": 0.9839893262174784, "step": 2950 }, { "epoch": 0.9839893262174784, "ref_ce_loss": 0.19525231420993805, "step": 2950 }, { "epoch": 0.9839893262174784, "loss": 1.276760458946228, "step": 2950 }, { "ce_loss": 0.34345290064811707, "epoch": 0.9839893262174784, "step": 2950 }, { "distill_loss": 0.2109031230211258, "epoch": 0.9839893262174784, "step": 2950 }, { "epoch": 0.9839893262174784, "ref_ce_loss": 0.21050138771533966, "step": 2950 }, { "epoch": 0.9873248832555037, "loss": 0.7315, "step": 2960 }, { "epoch": 0.9873248832555037, "grad_norm": 2.1952788829803467, "step": 2960 }, { "epoch": 0.9873248832555037, "learning_rate": 0.0002931895752629349, "step": 2960 }, { "epoch": 0.9873248832555037, "loss": 0.5843423008918762, "step": 2960 }, { "ce_loss": 0.2293442189693451, "epoch": 0.9873248832555037, "step": 2960 }, { "distill_loss": 0.13252869248390198, "epoch": 0.9873248832555037, "step": 2960 }, { "epoch": 0.9873248832555037, "ref_ce_loss": 0.1581684648990631, "step": 2960 }, { "epoch": 0.9873248832555037, "loss": 0.7649279236793518, "step": 2960 }, { "ce_loss": 0.35924723744392395, "epoch": 0.9873248832555037, "step": 2960 }, { "distill_loss": 0.14280739426612854, "epoch": 0.9873248832555037, "step": 2960 }, { "epoch": 0.9873248832555037, "ref_ce_loss": 0.21604615449905396, "step": 2960 }, { "epoch": 0.9873248832555037, "loss": 0.6165916919708252, "step": 2960 }, { "ce_loss": 0.29689690470695496, "epoch": 0.9873248832555037, "step": 2960 }, { "distill_loss": 0.12747378647327423, "epoch": 0.9873248832555037, "step": 2960 }, { "epoch": 0.9873248832555037, "ref_ce_loss": 0.14126361906528473, "step": 2960 }, { "epoch": 0.9873248832555037, "loss": 0.4910457134246826, "step": 2960 }, { "ce_loss": 0.20109711587429047, "epoch": 0.9873248832555037, "step": 2960 }, { "distill_loss": 0.13267558813095093, "epoch": 0.9873248832555037, "step": 2960 }, { "epoch": 0.9873248832555037, "ref_ce_loss": 0.08610528707504272, "step": 2960 }, { "epoch": 0.9906604402935291, "loss": 0.6879, "step": 2970 }, { "epoch": 0.9906604402935291, "grad_norm": 5.9090895652771, "step": 2970 }, { "epoch": 0.9906604402935291, "learning_rate": 0.0002931291017279971, "step": 2970 }, { "epoch": 0.9906604402935291, "loss": 1.4796576499938965, "step": 2970 }, { "ce_loss": 0.25391554832458496, "epoch": 0.9906604402935291, "step": 2970 }, { "distill_loss": 0.12180805951356888, "epoch": 0.9906604402935291, "step": 2970 }, { "epoch": 0.9906604402935291, "ref_ce_loss": 0.1687461882829666, "step": 2970 }, { "epoch": 0.9906604402935291, "loss": 0.6464487314224243, "step": 2970 }, { "ce_loss": 0.3380550146102905, "epoch": 0.9906604402935291, "step": 2970 }, { "distill_loss": 0.11552728712558746, "epoch": 0.9906604402935291, "step": 2970 }, { "epoch": 0.9906604402935291, "ref_ce_loss": 0.19248206913471222, "step": 2970 }, { "epoch": 0.9906604402935291, "loss": 0.4546283781528473, "step": 2970 }, { "ce_loss": 0.1595580279827118, "epoch": 0.9906604402935291, "step": 2970 }, { "distill_loss": 0.0903565064072609, "epoch": 0.9906604402935291, "step": 2970 }, { "epoch": 0.9906604402935291, "ref_ce_loss": 0.14584065973758698, "step": 2970 }, { "epoch": 0.9906604402935291, "loss": 0.5977323055267334, "step": 2970 }, { "ce_loss": 0.22186864912509918, "epoch": 0.9906604402935291, "step": 2970 }, { "distill_loss": 0.12966331839561462, "epoch": 0.9906604402935291, "step": 2970 }, { "epoch": 0.9906604402935291, "ref_ce_loss": 0.1510908603668213, "step": 2970 }, { "epoch": 0.9939959973315544, "loss": 0.7359, "step": 2980 }, { "epoch": 0.9939959973315544, "grad_norm": 2.193732500076294, "step": 2980 }, { "epoch": 0.9939959973315544, "learning_rate": 0.00029306836718203755, "step": 2980 }, { "epoch": 0.9939959973315544, "loss": 0.9112476110458374, "step": 2980 }, { "ce_loss": 0.35679250955581665, "epoch": 0.9939959973315544, "step": 2980 }, { "distill_loss": 0.13653014600276947, "epoch": 0.9939959973315544, "step": 2980 }, { "epoch": 0.9939959973315544, "ref_ce_loss": 0.18527543544769287, "step": 2980 }, { "epoch": 0.9939959973315544, "loss": 0.8198769688606262, "step": 2980 }, { "ce_loss": 0.37211382389068604, "epoch": 0.9939959973315544, "step": 2980 }, { "distill_loss": 0.12967464327812195, "epoch": 0.9939959973315544, "step": 2980 }, { "epoch": 0.9939959973315544, "ref_ce_loss": 0.24657918512821198, "step": 2980 }, { "epoch": 0.9939959973315544, "loss": 0.6387631297111511, "step": 2980 }, { "ce_loss": 0.2982257902622223, "epoch": 0.9939959973315544, "step": 2980 }, { "distill_loss": 0.11798281222581863, "epoch": 0.9939959973315544, "step": 2980 }, { "epoch": 0.9939959973315544, "ref_ce_loss": 0.13815459609031677, "step": 2980 }, { "epoch": 0.9939959973315544, "loss": 1.0164731740951538, "step": 2980 }, { "ce_loss": 0.17050199210643768, "epoch": 0.9939959973315544, "step": 2980 }, { "distill_loss": 0.1035347655415535, "epoch": 0.9939959973315544, "step": 2980 }, { "epoch": 0.9939959973315544, "ref_ce_loss": 0.1252170354127884, "step": 2980 }, { "epoch": 0.9973315543695798, "loss": 0.7284, "step": 2990 }, { "epoch": 0.9973315543695798, "grad_norm": 7.909195423126221, "step": 2990 }, { "epoch": 0.9973315543695798, "learning_rate": 0.00029300737173581213, "step": 2990 }, { "epoch": 0.9973315543695798, "loss": 0.625489354133606, "step": 2990 }, { "ce_loss": 0.1594225913286209, "epoch": 0.9973315543695798, "step": 2990 }, { "distill_loss": 0.2516362965106964, "epoch": 0.9973315543695798, "step": 2990 }, { "epoch": 0.9973315543695798, "ref_ce_loss": 0.17256700992584229, "step": 2990 }, { "epoch": 0.9973315543695798, "loss": 0.8924147486686707, "step": 2990 }, { "ce_loss": 0.2955637276172638, "epoch": 0.9973315543695798, "step": 2990 }, { "distill_loss": 0.3669006824493408, "epoch": 0.9973315543695798, "step": 2990 }, { "epoch": 0.9973315543695798, "ref_ce_loss": 0.1760740727186203, "step": 2990 }, { "epoch": 0.9973315543695798, "loss": 1.2954351902008057, "step": 2990 }, { "ce_loss": 0.3024890720844269, "epoch": 0.9973315543695798, "step": 2990 }, { "distill_loss": 0.37931498885154724, "epoch": 0.9973315543695798, "step": 2990 }, { "epoch": 0.9973315543695798, "ref_ce_loss": 0.16591976583003998, "step": 2990 }, { "epoch": 0.9973315543695798, "loss": 0.7697754502296448, "step": 2990 }, { "ce_loss": 0.24155797064304352, "epoch": 0.9973315543695798, "step": 2990 }, { "distill_loss": 0.3250412940979004, "epoch": 0.9973315543695798, "step": 2990 }, { "epoch": 0.9973315543695798, "ref_ce_loss": 0.12637540698051453, "step": 2990 }, { "epoch": 1.0006671114076051, "loss": 0.9277, "step": 3000 }, { "epoch": 1.0006671114076051, "grad_norm": 2.1417627334594727, "step": 3000 }, { "epoch": 1.0006671114076051, "learning_rate": 0.0002929461155005525, "step": 3000 }, { "epoch": 1.0006671114076051, "loss": 0.9282156825065613, "step": 3000 }, { "ce_loss": 0.31524181365966797, "epoch": 1.0006671114076051, "step": 3000 }, { "distill_loss": 0.38363519310951233, "epoch": 1.0006671114076051, "step": 3000 }, { "epoch": 1.0006671114076051, "ref_ce_loss": 0.16468551754951477, "step": 3000 }, { "epoch": 1.0006671114076051, "loss": 0.5853087902069092, "step": 3000 }, { "ce_loss": 0.1501854807138443, "epoch": 1.0006671114076051, "step": 3000 }, { "distill_loss": 0.2998085916042328, "epoch": 1.0006671114076051, "step": 3000 }, { "epoch": 1.0006671114076051, "ref_ce_loss": 0.09147650003433228, "step": 3000 }, { "epoch": 1.0006671114076051, "loss": 0.6547336578369141, "step": 3000 }, { "ce_loss": 0.15813229978084564, "epoch": 1.0006671114076051, "step": 3000 }, { "distill_loss": 0.31622615456581116, "epoch": 1.0006671114076051, "step": 3000 }, { "epoch": 1.0006671114076051, "ref_ce_loss": 0.08632109314203262, "step": 3000 }, { "epoch": 1.0006671114076051, "loss": 1.010650873184204, "step": 3000 }, { "ce_loss": 0.2512657642364502, "epoch": 1.0006671114076051, "step": 3000 }, { "distill_loss": 0.33857262134552, "epoch": 1.0006671114076051, "step": 3000 }, { "epoch": 1.0006671114076051, "ref_ce_loss": 0.13122253119945526, "step": 3000 }, { "epoch": 1.0040026684456305, "loss": 0.7838, "step": 3010 }, { "epoch": 1.0040026684456305, "grad_norm": 4.269802093505859, "step": 3010 }, { "epoch": 1.0040026684456305, "learning_rate": 0.0002928845985879658, "step": 3010 }, { "epoch": 1.0040026684456305, "loss": 0.9392971992492676, "step": 3010 }, { "ce_loss": 0.2948865592479706, "epoch": 1.0040026684456305, "step": 3010 }, { "distill_loss": 0.2655756175518036, "epoch": 1.0040026684456305, "step": 3010 }, { "epoch": 1.0040026684456305, "ref_ce_loss": 0.14558996260166168, "step": 3010 }, { "epoch": 1.0040026684456305, "loss": 0.7004539966583252, "step": 3010 }, { "ce_loss": 0.18756701052188873, "epoch": 1.0040026684456305, "step": 3010 }, { "distill_loss": 0.20628270506858826, "epoch": 1.0040026684456305, "step": 3010 }, { "epoch": 1.0040026684456305, "ref_ce_loss": 0.12921027839183807, "step": 3010 }, { "epoch": 1.0040026684456305, "loss": 0.6955971717834473, "step": 3010 }, { "ce_loss": 0.24592086672782898, "epoch": 1.0040026684456305, "step": 3010 }, { "distill_loss": 0.21004171669483185, "epoch": 1.0040026684456305, "step": 3010 }, { "epoch": 1.0040026684456305, "ref_ce_loss": 0.12992355227470398, "step": 3010 }, { "epoch": 1.0040026684456305, "loss": 0.8428363800048828, "step": 3010 }, { "ce_loss": 0.12708482146263123, "epoch": 1.0040026684456305, "step": 3010 }, { "distill_loss": 0.1839914321899414, "epoch": 1.0040026684456305, "step": 3010 }, { "epoch": 1.0040026684456305, "ref_ce_loss": 0.11820346862077713, "step": 3010 }, { "epoch": 1.0073382254836558, "loss": 0.7617, "step": 3020 }, { "epoch": 1.0073382254836558, "grad_norm": 3.130213499069214, "step": 3020 }, { "epoch": 1.0073382254836558, "learning_rate": 0.00029282282111023464, "step": 3020 }, { "epoch": 1.0073382254836558, "loss": 0.7784039974212646, "step": 3020 }, { "ce_loss": 0.30632612109184265, "epoch": 1.0073382254836558, "step": 3020 }, { "distill_loss": 0.2653581500053406, "epoch": 1.0073382254836558, "step": 3020 }, { "epoch": 1.0073382254836558, "ref_ce_loss": 0.20647765696048737, "step": 3020 }, { "epoch": 1.0073382254836558, "loss": 1.3872511386871338, "step": 3020 }, { "ce_loss": 0.5806623101234436, "epoch": 1.0073382254836558, "step": 3020 }, { "distill_loss": 0.27128076553344727, "epoch": 1.0073382254836558, "step": 3020 }, { "epoch": 1.0073382254836558, "ref_ce_loss": 0.3675278425216675, "step": 3020 }, { "epoch": 1.0073382254836558, "loss": 0.5582769513130188, "step": 3020 }, { "ce_loss": 0.1799483597278595, "epoch": 1.0073382254836558, "step": 3020 }, { "distill_loss": 0.2470041662454605, "epoch": 1.0073382254836558, "step": 3020 }, { "epoch": 1.0073382254836558, "ref_ce_loss": 0.13095030188560486, "step": 3020 }, { "epoch": 1.0073382254836558, "loss": 1.0463906526565552, "step": 3020 }, { "ce_loss": 0.2606077492237091, "epoch": 1.0073382254836558, "step": 3020 }, { "distill_loss": 0.26832517981529236, "epoch": 1.0073382254836558, "step": 3020 }, { "epoch": 1.0073382254836558, "ref_ce_loss": 0.12840500473976135, "step": 3020 }, { "epoch": 1.0106737825216812, "loss": 0.8325, "step": 3030 }, { "epoch": 1.0106737825216812, "grad_norm": 4.799256801605225, "step": 3030 }, { "epoch": 1.0106737825216812, "learning_rate": 0.00029276078318001686, "step": 3030 }, { "epoch": 1.0106737825216812, "loss": 0.551561713218689, "step": 3030 }, { "ce_loss": 0.16658858954906464, "epoch": 1.0106737825216812, "step": 3030 }, { "distill_loss": 0.18139103055000305, "epoch": 1.0106737825216812, "step": 3030 }, { "epoch": 1.0106737825216812, "ref_ce_loss": 0.11683313548564911, "step": 3030 }, { "epoch": 1.0106737825216812, "loss": 0.7683470845222473, "step": 3030 }, { "ce_loss": 0.29117369651794434, "epoch": 1.0106737825216812, "step": 3030 }, { "distill_loss": 0.19345280528068542, "epoch": 1.0106737825216812, "step": 3030 }, { "epoch": 1.0106737825216812, "ref_ce_loss": 0.1394084095954895, "step": 3030 }, { "epoch": 1.0106737825216812, "loss": 0.5669423341751099, "step": 3030 }, { "ce_loss": 0.23442049324512482, "epoch": 1.0106737825216812, "step": 3030 }, { "distill_loss": 0.15671990811824799, "epoch": 1.0106737825216812, "step": 3030 }, { "epoch": 1.0106737825216812, "ref_ce_loss": 0.10465249419212341, "step": 3030 }, { "epoch": 1.0106737825216812, "loss": 0.8321839570999146, "step": 3030 }, { "ce_loss": 0.20825815200805664, "epoch": 1.0106737825216812, "step": 3030 }, { "distill_loss": 0.16022738814353943, "epoch": 1.0106737825216812, "step": 3030 }, { "epoch": 1.0106737825216812, "ref_ce_loss": 0.13101598620414734, "step": 3030 }, { "epoch": 1.0140093395597065, "loss": 0.6702, "step": 3040 }, { "epoch": 1.0140093395597065, "grad_norm": 5.037836074829102, "step": 3040 }, { "epoch": 1.0140093395597065, "learning_rate": 0.000292698484910445, "step": 3040 }, { "epoch": 1.0140093395597065, "loss": 0.5145962238311768, "step": 3040 }, { "ce_loss": 0.21563299000263214, "epoch": 1.0140093395597065, "step": 3040 }, { "distill_loss": 0.12810231745243073, "epoch": 1.0140093395597065, "step": 3040 }, { "epoch": 1.0140093395597065, "ref_ce_loss": 0.12446434050798416, "step": 3040 }, { "epoch": 1.0140093395597065, "loss": 0.8015331029891968, "step": 3040 }, { "ce_loss": 0.29993003606796265, "epoch": 1.0140093395597065, "step": 3040 }, { "distill_loss": 0.15203720331192017, "epoch": 1.0140093395597065, "step": 3040 }, { "epoch": 1.0140093395597065, "ref_ce_loss": 0.1160355657339096, "step": 3040 }, { "epoch": 1.0140093395597065, "loss": 0.5979145765304565, "step": 3040 }, { "ce_loss": 0.16789746284484863, "epoch": 1.0140093395597065, "step": 3040 }, { "distill_loss": 0.11229067295789719, "epoch": 1.0140093395597065, "step": 3040 }, { "epoch": 1.0140093395597065, "ref_ce_loss": 0.08903674781322479, "step": 3040 }, { "epoch": 1.0140093395597065, "loss": 1.0836822986602783, "step": 3040 }, { "ce_loss": 0.22605106234550476, "epoch": 1.0140093395597065, "step": 3040 }, { "distill_loss": 0.15512679517269135, "epoch": 1.0140093395597065, "step": 3040 }, { "epoch": 1.0140093395597065, "ref_ce_loss": 0.11995731294155121, "step": 3040 }, { "epoch": 1.0173448965977319, "loss": 0.7133, "step": 3050 }, { "epoch": 1.0173448965977319, "grad_norm": 1.9387662410736084, "step": 3050 }, { "epoch": 1.0173448965977319, "learning_rate": 0.0002926359264151267, "step": 3050 }, { "epoch": 1.0173448965977319, "loss": 0.5229524374008179, "step": 3050 }, { "ce_loss": 0.2520233392715454, "epoch": 1.0173448965977319, "step": 3050 }, { "distill_loss": 0.11001858115196228, "epoch": 1.0173448965977319, "step": 3050 }, { "epoch": 1.0173448965977319, "ref_ce_loss": 0.16077430546283722, "step": 3050 }, { "epoch": 1.0173448965977319, "loss": 0.7243987917900085, "step": 3050 }, { "ce_loss": 0.27749699354171753, "epoch": 1.0173448965977319, "step": 3050 }, { "distill_loss": 0.14459560811519623, "epoch": 1.0173448965977319, "step": 3050 }, { "epoch": 1.0173448965977319, "ref_ce_loss": 0.17567412555217743, "step": 3050 }, { "epoch": 1.0173448965977319, "loss": 0.40805763006210327, "step": 3050 }, { "ce_loss": 0.15319980680942535, "epoch": 1.0173448965977319, "step": 3050 }, { "distill_loss": 0.0772315114736557, "epoch": 1.0173448965977319, "step": 3050 }, { "epoch": 1.0173448965977319, "ref_ce_loss": 0.11021654307842255, "step": 3050 }, { "epoch": 1.0173448965977319, "loss": 0.44821786880493164, "step": 3050 }, { "ce_loss": 0.2088594138622284, "epoch": 1.0173448965977319, "step": 3050 }, { "distill_loss": 0.11182739585638046, "epoch": 1.0173448965977319, "step": 3050 }, { "epoch": 1.0173448965977319, "ref_ce_loss": 0.1273898184299469, "step": 3050 }, { "epoch": 1.0206804536357572, "loss": 0.7103, "step": 3060 }, { "epoch": 1.0206804536357572, "grad_norm": 3.4637560844421387, "step": 3060 }, { "epoch": 1.0206804536357572, "learning_rate": 0.00029257310780814383, "step": 3060 }, { "epoch": 1.0206804536357572, "loss": 0.6055834889411926, "step": 3060 }, { "ce_loss": 0.24962303042411804, "epoch": 1.0206804536357572, "step": 3060 }, { "distill_loss": 0.14352454245090485, "epoch": 1.0206804536357572, "step": 3060 }, { "epoch": 1.0206804536357572, "ref_ce_loss": 0.1418440341949463, "step": 3060 }, { "epoch": 1.0206804536357572, "loss": 0.577202558517456, "step": 3060 }, { "ce_loss": 0.2429414987564087, "epoch": 1.0206804536357572, "step": 3060 }, { "distill_loss": 0.16753113269805908, "epoch": 1.0206804536357572, "step": 3060 }, { "epoch": 1.0206804536357572, "ref_ce_loss": 0.10312572866678238, "step": 3060 }, { "epoch": 1.0206804536357572, "loss": 1.0788899660110474, "step": 3060 }, { "ce_loss": 0.1864338368177414, "epoch": 1.0206804536357572, "step": 3060 }, { "distill_loss": 0.15208856761455536, "epoch": 1.0206804536357572, "step": 3060 }, { "epoch": 1.0206804536357572, "ref_ce_loss": 0.1461482048034668, "step": 3060 }, { "epoch": 1.0206804536357572, "loss": 0.6381796598434448, "step": 3060 }, { "ce_loss": 0.23099523782730103, "epoch": 1.0206804536357572, "step": 3060 }, { "distill_loss": 0.15411165356636047, "epoch": 1.0206804536357572, "step": 3060 }, { "epoch": 1.0206804536357572, "ref_ce_loss": 0.17182599008083344, "step": 3060 }, { "epoch": 1.0240160106737826, "loss": 0.6631, "step": 3070 }, { "epoch": 1.0240160106737826, "grad_norm": 3.612990140914917, "step": 3070 }, { "epoch": 1.0240160106737826, "learning_rate": 0.00029251002920405286, "step": 3070 }, { "epoch": 1.0240160106737826, "loss": 0.4682758152484894, "step": 3070 }, { "ce_loss": 0.15293414890766144, "epoch": 1.0240160106737826, "step": 3070 }, { "distill_loss": 0.13002100586891174, "epoch": 1.0240160106737826, "step": 3070 }, { "epoch": 1.0240160106737826, "ref_ce_loss": 0.10538853704929352, "step": 3070 }, { "epoch": 1.0240160106737826, "loss": 0.580215573310852, "step": 3070 }, { "ce_loss": 0.21724647283554077, "epoch": 1.0240160106737826, "step": 3070 }, { "distill_loss": 0.17224983870983124, "epoch": 1.0240160106737826, "step": 3070 }, { "epoch": 1.0240160106737826, "ref_ce_loss": 0.11003424972295761, "step": 3070 }, { "epoch": 1.0240160106737826, "loss": 0.9422988295555115, "step": 3070 }, { "ce_loss": 0.20494846999645233, "epoch": 1.0240160106737826, "step": 3070 }, { "distill_loss": 0.149722620844841, "epoch": 1.0240160106737826, "step": 3070 }, { "epoch": 1.0240160106737826, "ref_ce_loss": 0.1599912941455841, "step": 3070 }, { "epoch": 1.0240160106737826, "loss": 0.7094449996948242, "step": 3070 }, { "ce_loss": 0.20244437456130981, "epoch": 1.0240160106737826, "step": 3070 }, { "distill_loss": 0.11915453523397446, "epoch": 1.0240160106737826, "step": 3070 }, { "epoch": 1.0240160106737826, "ref_ce_loss": 0.09864697605371475, "step": 3070 }, { "epoch": 1.027351567711808, "loss": 0.7187, "step": 3080 }, { "epoch": 1.027351567711808, "grad_norm": 3.3055739402770996, "step": 3080 }, { "epoch": 1.027351567711808, "learning_rate": 0.0002924466907178842, "step": 3080 }, { "epoch": 1.027351567711808, "loss": 0.6497579216957092, "step": 3080 }, { "ce_loss": 0.14034055173397064, "epoch": 1.027351567711808, "step": 3080 }, { "distill_loss": 0.1890394389629364, "epoch": 1.027351567711808, "step": 3080 }, { "epoch": 1.027351567711808, "ref_ce_loss": 0.15432484447956085, "step": 3080 }, { "epoch": 1.027351567711808, "loss": 0.9005087614059448, "step": 3080 }, { "ce_loss": 0.30830201506614685, "epoch": 1.027351567711808, "step": 3080 }, { "distill_loss": 0.30596300959587097, "epoch": 1.027351567711808, "step": 3080 }, { "epoch": 1.027351567711808, "ref_ce_loss": 0.1606854796409607, "step": 3080 }, { "epoch": 1.027351567711808, "loss": 0.530880331993103, "step": 3080 }, { "ce_loss": 0.14225921034812927, "epoch": 1.027351567711808, "step": 3080 }, { "distill_loss": 0.17800264060497284, "epoch": 1.027351567711808, "step": 3080 }, { "epoch": 1.027351567711808, "ref_ce_loss": 0.08136451244354248, "step": 3080 }, { "epoch": 1.027351567711808, "loss": 0.6551678776741028, "step": 3080 }, { "ce_loss": 0.19130204617977142, "epoch": 1.027351567711808, "step": 3080 }, { "distill_loss": 0.2622304856777191, "epoch": 1.027351567711808, "step": 3080 }, { "epoch": 1.027351567711808, "ref_ce_loss": 0.16420768201351166, "step": 3080 }, { "epoch": 1.0306871247498333, "loss": 0.724, "step": 3090 }, { "epoch": 1.0306871247498333, "grad_norm": 3.060091018676758, "step": 3090 }, { "epoch": 1.0306871247498333, "learning_rate": 0.0002923830924651424, "step": 3090 }, { "epoch": 1.0306871247498333, "loss": 0.6151540875434875, "step": 3090 }, { "ce_loss": 0.25476041436195374, "epoch": 1.0306871247498333, "step": 3090 }, { "distill_loss": 0.13451892137527466, "epoch": 1.0306871247498333, "step": 3090 }, { "epoch": 1.0306871247498333, "ref_ce_loss": 0.18273842334747314, "step": 3090 }, { "epoch": 1.0306871247498333, "loss": 0.8939247727394104, "step": 3090 }, { "ce_loss": 0.32922524213790894, "epoch": 1.0306871247498333, "step": 3090 }, { "distill_loss": 0.14805200695991516, "epoch": 1.0306871247498333, "step": 3090 }, { "epoch": 1.0306871247498333, "ref_ce_loss": 0.1756051629781723, "step": 3090 }, { "epoch": 1.0306871247498333, "loss": 0.8971487283706665, "step": 3090 }, { "ce_loss": 0.27347028255462646, "epoch": 1.0306871247498333, "step": 3090 }, { "distill_loss": 0.12306328862905502, "epoch": 1.0306871247498333, "step": 3090 }, { "epoch": 1.0306871247498333, "ref_ce_loss": 0.23444783687591553, "step": 3090 }, { "epoch": 1.0306871247498333, "loss": 0.769548773765564, "step": 3090 }, { "ce_loss": 0.2549266815185547, "epoch": 1.0306871247498333, "step": 3090 }, { "distill_loss": 0.15688112378120422, "epoch": 1.0306871247498333, "step": 3090 }, { "epoch": 1.0306871247498333, "ref_ce_loss": 0.2493422031402588, "step": 3090 }, { "epoch": 1.0340226817878586, "loss": 0.8049, "step": 3100 }, { "epoch": 1.0340226817878586, "grad_norm": 2.9161508083343506, "step": 3100 }, { "epoch": 1.0340226817878586, "learning_rate": 0.0002923192345618054, "step": 3100 }, { "epoch": 1.0340226817878586, "loss": 1.0467815399169922, "step": 3100 }, { "ce_loss": 0.33018168807029724, "epoch": 1.0340226817878586, "step": 3100 }, { "distill_loss": 0.22280463576316833, "epoch": 1.0340226817878586, "step": 3100 }, { "epoch": 1.0340226817878586, "ref_ce_loss": 0.17719537019729614, "step": 3100 }, { "epoch": 1.0340226817878586, "loss": 0.6442136168479919, "step": 3100 }, { "ce_loss": 0.23328647017478943, "epoch": 1.0340226817878586, "step": 3100 }, { "distill_loss": 0.2879638075828552, "epoch": 1.0340226817878586, "step": 3100 }, { "epoch": 1.0340226817878586, "ref_ce_loss": 0.12265841662883759, "step": 3100 }, { "epoch": 1.0340226817878586, "loss": 0.6296390891075134, "step": 3100 }, { "ce_loss": 0.1833505630493164, "epoch": 1.0340226817878586, "step": 3100 }, { "distill_loss": 0.30275848507881165, "epoch": 1.0340226817878586, "step": 3100 }, { "epoch": 1.0340226817878586, "ref_ce_loss": 0.09682515263557434, "step": 3100 }, { "epoch": 1.0340226817878586, "loss": 0.6476273536682129, "step": 3100 }, { "ce_loss": 0.22235122323036194, "epoch": 1.0340226817878586, "step": 3100 }, { "distill_loss": 0.25921764969825745, "epoch": 1.0340226817878586, "step": 3100 }, { "epoch": 1.0340226817878586, "ref_ce_loss": 0.12296784669160843, "step": 3100 }, { "epoch": 1.037358238825884, "loss": 0.7384, "step": 3110 }, { "epoch": 1.037358238825884, "grad_norm": 4.091484546661377, "step": 3110 }, { "epoch": 1.037358238825884, "learning_rate": 0.00029225511712432494, "step": 3110 }, { "epoch": 1.037358238825884, "loss": 0.8834012746810913, "step": 3110 }, { "ce_loss": 0.28296226263046265, "epoch": 1.037358238825884, "step": 3110 }, { "distill_loss": 0.2384410798549652, "epoch": 1.037358238825884, "step": 3110 }, { "epoch": 1.037358238825884, "ref_ce_loss": 0.12611015141010284, "step": 3110 }, { "epoch": 1.037358238825884, "loss": 0.5011523962020874, "step": 3110 }, { "ce_loss": 0.16393588483333588, "epoch": 1.037358238825884, "step": 3110 }, { "distill_loss": 0.1573915183544159, "epoch": 1.037358238825884, "step": 3110 }, { "epoch": 1.037358238825884, "ref_ce_loss": 0.09610199183225632, "step": 3110 }, { "epoch": 1.037358238825884, "loss": 0.7043334245681763, "step": 3110 }, { "ce_loss": 0.2144857943058014, "epoch": 1.037358238825884, "step": 3110 }, { "distill_loss": 0.1674727201461792, "epoch": 1.037358238825884, "step": 3110 }, { "epoch": 1.037358238825884, "ref_ce_loss": 0.15289588272571564, "step": 3110 }, { "epoch": 1.037358238825884, "loss": 0.5016894340515137, "step": 3110 }, { "ce_loss": 0.17593710124492645, "epoch": 1.037358238825884, "step": 3110 }, { "distill_loss": 0.20370127260684967, "epoch": 1.037358238825884, "step": 3110 }, { "epoch": 1.037358238825884, "ref_ce_loss": 0.06827589869499207, "step": 3110 }, { "epoch": 1.0406937958639093, "loss": 0.736, "step": 3120 }, { "epoch": 1.0406937958639093, "grad_norm": 2.3639414310455322, "step": 3120 }, { "epoch": 1.0406937958639093, "learning_rate": 0.0002921907402696259, "step": 3120 }, { "epoch": 1.0406937958639093, "loss": 0.8598054647445679, "step": 3120 }, { "ce_loss": 0.24055452644824982, "epoch": 1.0406937958639093, "step": 3120 }, { "distill_loss": 0.20108632743358612, "epoch": 1.0406937958639093, "step": 3120 }, { "epoch": 1.0406937958639093, "ref_ce_loss": 0.13357633352279663, "step": 3120 }, { "epoch": 1.0406937958639093, "loss": 0.5927767753601074, "step": 3120 }, { "ce_loss": 0.16187889873981476, "epoch": 1.0406937958639093, "step": 3120 }, { "distill_loss": 0.21812832355499268, "epoch": 1.0406937958639093, "step": 3120 }, { "epoch": 1.0406937958639093, "ref_ce_loss": 0.1286218911409378, "step": 3120 }, { "epoch": 1.0406937958639093, "loss": 0.6704609394073486, "step": 3120 }, { "ce_loss": 0.24853084981441498, "epoch": 1.0406937958639093, "step": 3120 }, { "distill_loss": 0.2065722942352295, "epoch": 1.0406937958639093, "step": 3120 }, { "epoch": 1.0406937958639093, "ref_ce_loss": 0.14731864631175995, "step": 3120 }, { "epoch": 1.0406937958639093, "loss": 1.198374629020691, "step": 3120 }, { "ce_loss": 0.2626439034938812, "epoch": 1.0406937958639093, "step": 3120 }, { "distill_loss": 0.23086431622505188, "epoch": 1.0406937958639093, "step": 3120 }, { "epoch": 1.0406937958639093, "ref_ce_loss": 0.2478064000606537, "step": 3120 }, { "epoch": 1.0440293529019347, "loss": 0.7584, "step": 3130 }, { "epoch": 1.0440293529019347, "grad_norm": 4.350533962249756, "step": 3130 }, { "epoch": 1.0440293529019347, "learning_rate": 0.00029212610411510627, "step": 3130 }, { "epoch": 1.0440293529019347, "loss": 0.5243332386016846, "step": 3130 }, { "ce_loss": 0.11996088922023773, "epoch": 1.0440293529019347, "step": 3130 }, { "distill_loss": 0.16550423204898834, "epoch": 1.0440293529019347, "step": 3130 }, { "epoch": 1.0440293529019347, "ref_ce_loss": 0.07062079012393951, "step": 3130 }, { "epoch": 1.0440293529019347, "loss": 1.1567490100860596, "step": 3130 }, { "ce_loss": 0.1746305376291275, "epoch": 1.0440293529019347, "step": 3130 }, { "distill_loss": 0.2075626105070114, "epoch": 1.0440293529019347, "step": 3130 }, { "epoch": 1.0440293529019347, "ref_ce_loss": 0.08206846565008163, "step": 3130 }, { "epoch": 1.0440293529019347, "loss": 0.6579044461250305, "step": 3130 }, { "ce_loss": 0.22023993730545044, "epoch": 1.0440293529019347, "step": 3130 }, { "distill_loss": 0.2082604169845581, "epoch": 1.0440293529019347, "step": 3130 }, { "epoch": 1.0440293529019347, "ref_ce_loss": 0.11965802311897278, "step": 3130 }, { "epoch": 1.0440293529019347, "loss": 0.7400968074798584, "step": 3130 }, { "ce_loss": 0.26753515005111694, "epoch": 1.0440293529019347, "step": 3130 }, { "distill_loss": 0.2450755536556244, "epoch": 1.0440293529019347, "step": 3130 }, { "epoch": 1.0440293529019347, "ref_ce_loss": 0.1641095131635666, "step": 3130 }, { "epoch": 1.04736490993996, "loss": 0.7619, "step": 3140 }, { "epoch": 1.04736490993996, "grad_norm": 2.58843994140625, "step": 3140 }, { "epoch": 1.04736490993996, "learning_rate": 0.000292061208778637, "step": 3140 }, { "epoch": 1.04736490993996, "loss": 0.455960750579834, "step": 3140 }, { "ce_loss": 0.09826745092868805, "epoch": 1.04736490993996, "step": 3140 }, { "distill_loss": 0.1610691100358963, "epoch": 1.04736490993996, "step": 3140 }, { "epoch": 1.04736490993996, "ref_ce_loss": 0.057575829327106476, "step": 3140 }, { "epoch": 1.04736490993996, "loss": 0.8150807619094849, "step": 3140 }, { "ce_loss": 0.26749011874198914, "epoch": 1.04736490993996, "step": 3140 }, { "distill_loss": 0.205758199095726, "epoch": 1.04736490993996, "step": 3140 }, { "epoch": 1.04736490993996, "ref_ce_loss": 0.17441095411777496, "step": 3140 }, { "epoch": 1.04736490993996, "loss": 1.1181557178497314, "step": 3140 }, { "ce_loss": 0.39353179931640625, "epoch": 1.04736490993996, "step": 3140 }, { "distill_loss": 0.24632498621940613, "epoch": 1.04736490993996, "step": 3140 }, { "epoch": 1.04736490993996, "ref_ce_loss": 0.1923367828130722, "step": 3140 }, { "epoch": 1.04736490993996, "loss": 0.9463489651679993, "step": 3140 }, { "ce_loss": 0.25722455978393555, "epoch": 1.04736490993996, "step": 3140 }, { "distill_loss": 0.18198177218437195, "epoch": 1.04736490993996, "step": 3140 }, { "epoch": 1.04736490993996, "ref_ce_loss": 0.1510559469461441, "step": 3140 }, { "epoch": 1.0507004669779854, "loss": 0.6906, "step": 3150 }, { "epoch": 1.0507004669779854, "grad_norm": 1.8357528448104858, "step": 3150 }, { "epoch": 1.0507004669779854, "learning_rate": 0.0002919960543785614, "step": 3150 }, { "epoch": 1.0507004669779854, "loss": 1.3078269958496094, "step": 3150 }, { "ce_loss": 0.3390888571739197, "epoch": 1.0507004669779854, "step": 3150 }, { "distill_loss": 0.16615071892738342, "epoch": 1.0507004669779854, "step": 3150 }, { "epoch": 1.0507004669779854, "ref_ce_loss": 0.21896415948867798, "step": 3150 }, { "epoch": 1.0507004669779854, "loss": 0.8389450311660767, "step": 3150 }, { "ce_loss": 0.2977851331233978, "epoch": 1.0507004669779854, "step": 3150 }, { "distill_loss": 0.14731603860855103, "epoch": 1.0507004669779854, "step": 3150 }, { "epoch": 1.0507004669779854, "ref_ce_loss": 0.15756700932979584, "step": 3150 }, { "epoch": 1.0507004669779854, "loss": 0.5830703973770142, "step": 3150 }, { "ce_loss": 0.26267823576927185, "epoch": 1.0507004669779854, "step": 3150 }, { "distill_loss": 0.15796950459480286, "epoch": 1.0507004669779854, "step": 3150 }, { "epoch": 1.0507004669779854, "ref_ce_loss": 0.10677004605531693, "step": 3150 }, { "epoch": 1.0507004669779854, "loss": 0.7216756939888, "step": 3150 }, { "ce_loss": 0.299459844827652, "epoch": 1.0507004669779854, "step": 3150 }, { "distill_loss": 0.1527896672487259, "epoch": 1.0507004669779854, "step": 3150 }, { "epoch": 1.0507004669779854, "ref_ce_loss": 0.13475990295410156, "step": 3150 }, { "epoch": 1.0540360240160107, "loss": 0.7631, "step": 3160 }, { "epoch": 1.0540360240160107, "grad_norm": 1.8830387592315674, "step": 3160 }, { "epoch": 1.0540360240160107, "learning_rate": 0.00029193064103369545, "step": 3160 }, { "epoch": 1.0540360240160107, "loss": 0.47316044569015503, "step": 3160 }, { "ce_loss": 0.13927239179611206, "epoch": 1.0540360240160107, "step": 3160 }, { "distill_loss": 0.1530410349369049, "epoch": 1.0540360240160107, "step": 3160 }, { "epoch": 1.0540360240160107, "ref_ce_loss": 0.12716428935527802, "step": 3160 }, { "epoch": 1.0540360240160107, "loss": 0.6238396763801575, "step": 3160 }, { "ce_loss": 0.1877501904964447, "epoch": 1.0540360240160107, "step": 3160 }, { "distill_loss": 0.16471980512142181, "epoch": 1.0540360240160107, "step": 3160 }, { "epoch": 1.0540360240160107, "ref_ce_loss": 0.13325060904026031, "step": 3160 }, { "epoch": 1.0540360240160107, "loss": 0.7353872656822205, "step": 3160 }, { "ce_loss": 0.23863466084003448, "epoch": 1.0540360240160107, "step": 3160 }, { "distill_loss": 0.17457962036132812, "epoch": 1.0540360240160107, "step": 3160 }, { "epoch": 1.0540360240160107, "ref_ce_loss": 0.10836151987314224, "step": 3160 }, { "epoch": 1.0540360240160107, "loss": 0.7750897407531738, "step": 3160 }, { "ce_loss": 0.2611212730407715, "epoch": 1.0540360240160107, "step": 3160 }, { "distill_loss": 0.15589670836925507, "epoch": 1.0540360240160107, "step": 3160 }, { "epoch": 1.0540360240160107, "ref_ce_loss": 0.1313047707080841, "step": 3160 }, { "epoch": 1.057371581054036, "loss": 0.7822, "step": 3170 }, { "epoch": 1.057371581054036, "grad_norm": 3.618147611618042, "step": 3170 }, { "epoch": 1.057371581054036, "learning_rate": 0.00029186496886332737, "step": 3170 }, { "epoch": 1.057371581054036, "loss": 0.8868449926376343, "step": 3170 }, { "ce_loss": 0.14827127754688263, "epoch": 1.057371581054036, "step": 3170 }, { "distill_loss": 0.18400876224040985, "epoch": 1.057371581054036, "step": 3170 }, { "epoch": 1.057371581054036, "ref_ce_loss": 0.09630703181028366, "step": 3170 }, { "epoch": 1.057371581054036, "loss": 1.0973975658416748, "step": 3170 }, { "ce_loss": 0.3045180141925812, "epoch": 1.057371581054036, "step": 3170 }, { "distill_loss": 0.2753039598464966, "epoch": 1.057371581054036, "step": 3170 }, { "epoch": 1.057371581054036, "ref_ce_loss": 0.17682716250419617, "step": 3170 }, { "epoch": 1.057371581054036, "loss": 0.8529131412506104, "step": 3170 }, { "ce_loss": 0.19607514142990112, "epoch": 1.057371581054036, "step": 3170 }, { "distill_loss": 0.29867586493492126, "epoch": 1.057371581054036, "step": 3170 }, { "epoch": 1.057371581054036, "ref_ce_loss": 0.12237987667322159, "step": 3170 }, { "epoch": 1.057371581054036, "loss": 1.0622225999832153, "step": 3170 }, { "ce_loss": 0.17475342750549316, "epoch": 1.057371581054036, "step": 3170 }, { "distill_loss": 0.2065230756998062, "epoch": 1.057371581054036, "step": 3170 }, { "epoch": 1.057371581054036, "ref_ce_loss": 0.12568923830986023, "step": 3170 }, { "epoch": 1.0607071380920614, "loss": 0.7855, "step": 3180 }, { "epoch": 1.0607071380920614, "grad_norm": 3.1847946643829346, "step": 3180 }, { "epoch": 1.0607071380920614, "learning_rate": 0.0002917990379872173, "step": 3180 }, { "epoch": 1.0607071380920614, "loss": 0.8633472323417664, "step": 3180 }, { "ce_loss": 0.2781065106391907, "epoch": 1.0607071380920614, "step": 3180 }, { "distill_loss": 0.26735353469848633, "epoch": 1.0607071380920614, "step": 3180 }, { "epoch": 1.0607071380920614, "ref_ce_loss": 0.11382268369197845, "step": 3180 }, { "epoch": 1.0607071380920614, "loss": 0.6711214184761047, "step": 3180 }, { "ce_loss": 0.2781183421611786, "epoch": 1.0607071380920614, "step": 3180 }, { "distill_loss": 0.1842014491558075, "epoch": 1.0607071380920614, "step": 3180 }, { "epoch": 1.0607071380920614, "ref_ce_loss": 0.1488085687160492, "step": 3180 }, { "epoch": 1.0607071380920614, "loss": 0.6630605459213257, "step": 3180 }, { "ce_loss": 0.1706310361623764, "epoch": 1.0607071380920614, "step": 3180 }, { "distill_loss": 0.24702239036560059, "epoch": 1.0607071380920614, "step": 3180 }, { "epoch": 1.0607071380920614, "ref_ce_loss": 0.09450356662273407, "step": 3180 }, { "epoch": 1.0607071380920614, "loss": 0.5658689737319946, "step": 3180 }, { "ce_loss": 0.25082194805145264, "epoch": 1.0607071380920614, "step": 3180 }, { "distill_loss": 0.1784793734550476, "epoch": 1.0607071380920614, "step": 3180 }, { "epoch": 1.0607071380920614, "ref_ce_loss": 0.13623644411563873, "step": 3180 }, { "epoch": 1.0640426951300868, "loss": 0.7828, "step": 3190 }, { "epoch": 1.0640426951300868, "grad_norm": 3.540067195892334, "step": 3190 }, { "epoch": 1.0640426951300868, "learning_rate": 0.000291732848525597, "step": 3190 }, { "epoch": 1.0640426951300868, "loss": 1.3457376956939697, "step": 3190 }, { "ce_loss": 0.28505170345306396, "epoch": 1.0640426951300868, "step": 3190 }, { "distill_loss": 0.24294057488441467, "epoch": 1.0640426951300868, "step": 3190 }, { "epoch": 1.0640426951300868, "ref_ce_loss": 0.13038063049316406, "step": 3190 }, { "epoch": 1.0640426951300868, "loss": 0.4256614148616791, "step": 3190 }, { "ce_loss": 0.1152573972940445, "epoch": 1.0640426951300868, "step": 3190 }, { "distill_loss": 0.17031772434711456, "epoch": 1.0640426951300868, "step": 3190 }, { "epoch": 1.0640426951300868, "ref_ce_loss": 0.08523818105459213, "step": 3190 }, { "epoch": 1.0640426951300868, "loss": 0.9928364157676697, "step": 3190 }, { "ce_loss": 0.2940632700920105, "epoch": 1.0640426951300868, "step": 3190 }, { "distill_loss": 0.26018041372299194, "epoch": 1.0640426951300868, "step": 3190 }, { "epoch": 1.0640426951300868, "ref_ce_loss": 0.14488235116004944, "step": 3190 }, { "epoch": 1.0640426951300868, "loss": 0.7044861912727356, "step": 3190 }, { "ce_loss": 0.2863961458206177, "epoch": 1.0640426951300868, "step": 3190 }, { "distill_loss": 0.22382394969463348, "epoch": 1.0640426951300868, "step": 3190 }, { "epoch": 1.0640426951300868, "ref_ce_loss": 0.13676100969314575, "step": 3190 }, { "epoch": 1.067378252168112, "loss": 0.7039, "step": 3200 }, { "epoch": 1.067378252168112, "grad_norm": 2.542185068130493, "step": 3200 }, { "epoch": 1.067378252168112, "learning_rate": 0.0002916664005991701, "step": 3200 }, { "epoch": 1.067378252168112, "loss": 1.3946433067321777, "step": 3200 }, { "ce_loss": 0.2782725393772125, "epoch": 1.067378252168112, "step": 3200 }, { "distill_loss": 0.15828801691532135, "epoch": 1.067378252168112, "step": 3200 }, { "epoch": 1.067378252168112, "ref_ce_loss": 0.2695305645465851, "step": 3200 }, { "epoch": 1.067378252168112, "loss": 0.7213587164878845, "step": 3200 }, { "ce_loss": 0.22855627536773682, "epoch": 1.067378252168112, "step": 3200 }, { "distill_loss": 0.16613143682479858, "epoch": 1.067378252168112, "step": 3200 }, { "epoch": 1.067378252168112, "ref_ce_loss": 0.20623084902763367, "step": 3200 }, { "epoch": 1.067378252168112, "loss": 0.5456353425979614, "step": 3200 }, { "ce_loss": 0.1833302229642868, "epoch": 1.067378252168112, "step": 3200 }, { "distill_loss": 0.16412489116191864, "epoch": 1.067378252168112, "step": 3200 }, { "epoch": 1.067378252168112, "ref_ce_loss": 0.1315329372882843, "step": 3200 }, { "epoch": 1.067378252168112, "loss": 0.6673558950424194, "step": 3200 }, { "ce_loss": 0.27535635232925415, "epoch": 1.067378252168112, "step": 3200 }, { "distill_loss": 0.18773838877677917, "epoch": 1.067378252168112, "step": 3200 }, { "epoch": 1.067378252168112, "ref_ce_loss": 0.1323918253183365, "step": 3200 }, { "epoch": 1.0707138092061375, "loss": 0.6588, "step": 3210 }, { "epoch": 1.0707138092061375, "grad_norm": 3.2879350185394287, "step": 3210 }, { "epoch": 1.0707138092061375, "learning_rate": 0.0002915996943291114, "step": 3210 }, { "epoch": 1.0707138092061375, "loss": 0.47054553031921387, "step": 3210 }, { "ce_loss": 0.14522524178028107, "epoch": 1.0707138092061375, "step": 3210 }, { "distill_loss": 0.12368728220462799, "epoch": 1.0707138092061375, "step": 3210 }, { "epoch": 1.0707138092061375, "ref_ce_loss": 0.06966076791286469, "step": 3210 }, { "epoch": 1.0707138092061375, "loss": 1.2660126686096191, "step": 3210 }, { "ce_loss": 0.26258736848831177, "epoch": 1.0707138092061375, "step": 3210 }, { "distill_loss": 0.1678849756717682, "epoch": 1.0707138092061375, "step": 3210 }, { "epoch": 1.0707138092061375, "ref_ce_loss": 0.15247942507266998, "step": 3210 }, { "epoch": 1.0707138092061375, "loss": 0.617216944694519, "step": 3210 }, { "ce_loss": 0.29853564500808716, "epoch": 1.0707138092061375, "step": 3210 }, { "distill_loss": 0.1275399774312973, "epoch": 1.0707138092061375, "step": 3210 }, { "epoch": 1.0707138092061375, "ref_ce_loss": 0.1393018513917923, "step": 3210 }, { "epoch": 1.0707138092061375, "loss": 0.6128079295158386, "step": 3210 }, { "ce_loss": 0.24983979761600494, "epoch": 1.0707138092061375, "step": 3210 }, { "distill_loss": 0.16351068019866943, "epoch": 1.0707138092061375, "step": 3210 }, { "epoch": 1.0707138092061375, "ref_ce_loss": 0.12650151550769806, "step": 3210 }, { "epoch": 1.0740493662441628, "loss": 0.7102, "step": 3220 }, { "epoch": 1.0740493662441628, "grad_norm": 2.6898279190063477, "step": 3220 }, { "epoch": 1.0740493662441628, "learning_rate": 0.00029153272983706665, "step": 3220 }, { "epoch": 1.0740493662441628, "loss": 0.6622181534767151, "step": 3220 }, { "ce_loss": 0.25215262174606323, "epoch": 1.0740493662441628, "step": 3220 }, { "distill_loss": 0.1243286281824112, "epoch": 1.0740493662441628, "step": 3220 }, { "epoch": 1.0740493662441628, "ref_ce_loss": 0.16581885516643524, "step": 3220 }, { "epoch": 1.0740493662441628, "loss": 0.7243108749389648, "step": 3220 }, { "ce_loss": 0.19404850900173187, "epoch": 1.0740493662441628, "step": 3220 }, { "distill_loss": 0.1466391682624817, "epoch": 1.0740493662441628, "step": 3220 }, { "epoch": 1.0740493662441628, "ref_ce_loss": 0.10018964856863022, "step": 3220 }, { "epoch": 1.0740493662441628, "loss": 0.6774519681930542, "step": 3220 }, { "ce_loss": 0.12806348502635956, "epoch": 1.0740493662441628, "step": 3220 }, { "distill_loss": 0.09503288567066193, "epoch": 1.0740493662441628, "step": 3220 }, { "epoch": 1.0740493662441628, "ref_ce_loss": 0.11790133267641068, "step": 3220 }, { "epoch": 1.0740493662441628, "loss": 0.8044633865356445, "step": 3220 }, { "ce_loss": 0.29243576526641846, "epoch": 1.0740493662441628, "step": 3220 }, { "distill_loss": 0.18098977208137512, "epoch": 1.0740493662441628, "step": 3220 }, { "epoch": 1.0740493662441628, "ref_ce_loss": 0.1330576390028, "step": 3220 }, { "epoch": 1.0773849232821882, "loss": 0.7111, "step": 3230 }, { "epoch": 1.0773849232821882, "grad_norm": 2.036210298538208, "step": 3230 }, { "epoch": 1.0773849232821882, "learning_rate": 0.0002914655072451528, "step": 3230 }, { "epoch": 1.0773849232821882, "loss": 0.5417668223381042, "step": 3230 }, { "ce_loss": 0.1813638061285019, "epoch": 1.0773849232821882, "step": 3230 }, { "distill_loss": 0.13782474398612976, "epoch": 1.0773849232821882, "step": 3230 }, { "epoch": 1.0773849232821882, "ref_ce_loss": 0.13959860801696777, "step": 3230 }, { "epoch": 1.0773849232821882, "loss": 0.5895418524742126, "step": 3230 }, { "ce_loss": 0.18671901524066925, "epoch": 1.0773849232821882, "step": 3230 }, { "distill_loss": 0.14473098516464233, "epoch": 1.0773849232821882, "step": 3230 }, { "epoch": 1.0773849232821882, "ref_ce_loss": 0.17107507586479187, "step": 3230 }, { "epoch": 1.0773849232821882, "loss": 0.6597346067428589, "step": 3230 }, { "ce_loss": 0.26041436195373535, "epoch": 1.0773849232821882, "step": 3230 }, { "distill_loss": 0.13516613841056824, "epoch": 1.0773849232821882, "step": 3230 }, { "epoch": 1.0773849232821882, "ref_ce_loss": 0.10818801075220108, "step": 3230 }, { "epoch": 1.0773849232821882, "loss": 1.3986637592315674, "step": 3230 }, { "ce_loss": 0.49624744057655334, "epoch": 1.0773849232821882, "step": 3230 }, { "distill_loss": 0.19390641152858734, "epoch": 1.0773849232821882, "step": 3230 }, { "epoch": 1.0773849232821882, "ref_ce_loss": 0.14032429456710815, "step": 3230 }, { "epoch": 1.0807204803202135, "loss": 0.6783, "step": 3240 }, { "epoch": 1.0807204803202135, "grad_norm": 2.5386595726013184, "step": 3240 }, { "epoch": 1.0807204803202135, "learning_rate": 0.00029139802667595735, "step": 3240 }, { "epoch": 1.0807204803202135, "loss": 0.7183874249458313, "step": 3240 }, { "ce_loss": 0.34214258193969727, "epoch": 1.0807204803202135, "step": 3240 }, { "distill_loss": 0.14284975826740265, "epoch": 1.0807204803202135, "step": 3240 }, { "epoch": 1.0807204803202135, "ref_ce_loss": 0.1737593561410904, "step": 3240 }, { "epoch": 1.0807204803202135, "loss": 0.6244215369224548, "step": 3240 }, { "ce_loss": 0.21345029771327972, "epoch": 1.0807204803202135, "step": 3240 }, { "distill_loss": 0.13486254215240479, "epoch": 1.0807204803202135, "step": 3240 }, { "epoch": 1.0807204803202135, "ref_ce_loss": 0.10968282073736191, "step": 3240 }, { "epoch": 1.0807204803202135, "loss": 0.6682906150817871, "step": 3240 }, { "ce_loss": 0.12535721063613892, "epoch": 1.0807204803202135, "step": 3240 }, { "distill_loss": 0.09490034729242325, "epoch": 1.0807204803202135, "step": 3240 }, { "epoch": 1.0807204803202135, "ref_ce_loss": 0.08199267834424973, "step": 3240 }, { "epoch": 1.0807204803202135, "loss": 0.7619924545288086, "step": 3240 }, { "ce_loss": 0.3022087812423706, "epoch": 1.0807204803202135, "step": 3240 }, { "distill_loss": 0.12522542476654053, "epoch": 1.0807204803202135, "step": 3240 }, { "epoch": 1.0807204803202135, "ref_ce_loss": 0.200872540473938, "step": 3240 }, { "epoch": 1.0840560373582389, "loss": 0.649, "step": 3250 }, { "epoch": 1.0840560373582389, "grad_norm": 1.8706231117248535, "step": 3250 }, { "epoch": 1.0840560373582389, "learning_rate": 0.00029133028825253823, "step": 3250 }, { "epoch": 1.0840560373582389, "loss": 0.6613234281539917, "step": 3250 }, { "ce_loss": 0.32606416940689087, "epoch": 1.0840560373582389, "step": 3250 }, { "distill_loss": 0.1341378539800644, "epoch": 1.0840560373582389, "step": 3250 }, { "epoch": 1.0840560373582389, "ref_ce_loss": 0.20065777003765106, "step": 3250 }, { "epoch": 1.0840560373582389, "loss": 0.7039586305618286, "step": 3250 }, { "ce_loss": 0.27694404125213623, "epoch": 1.0840560373582389, "step": 3250 }, { "distill_loss": 0.13455909490585327, "epoch": 1.0840560373582389, "step": 3250 }, { "epoch": 1.0840560373582389, "ref_ce_loss": 0.22442147135734558, "step": 3250 }, { "epoch": 1.0840560373582389, "loss": 1.2980964183807373, "step": 3250 }, { "ce_loss": 0.3101694583892822, "epoch": 1.0840560373582389, "step": 3250 }, { "distill_loss": 0.12680265307426453, "epoch": 1.0840560373582389, "step": 3250 }, { "epoch": 1.0840560373582389, "ref_ce_loss": 0.18077422678470612, "step": 3250 }, { "epoch": 1.0840560373582389, "loss": 0.6655110120773315, "step": 3250 }, { "ce_loss": 0.3538602292537689, "epoch": 1.0840560373582389, "step": 3250 }, { "distill_loss": 0.12952083349227905, "epoch": 1.0840560373582389, "step": 3250 }, { "epoch": 1.0840560373582389, "ref_ce_loss": 0.18190212547779083, "step": 3250 }, { "epoch": 1.0873915943962642, "loss": 0.6585, "step": 3260 }, { "epoch": 1.0873915943962642, "grad_norm": 1.9900285005569458, "step": 3260 }, { "epoch": 1.0873915943962642, "learning_rate": 0.00029126229209842355, "step": 3260 }, { "epoch": 1.0873915943962642, "loss": 0.6257374882698059, "step": 3260 }, { "ce_loss": 0.21010935306549072, "epoch": 1.0873915943962642, "step": 3260 }, { "distill_loss": 0.15154899656772614, "epoch": 1.0873915943962642, "step": 3260 }, { "epoch": 1.0873915943962642, "ref_ce_loss": 0.11518467962741852, "step": 3260 }, { "epoch": 1.0873915943962642, "loss": 0.5982180833816528, "step": 3260 }, { "ce_loss": 0.1900973916053772, "epoch": 1.0873915943962642, "step": 3260 }, { "distill_loss": 0.1584339737892151, "epoch": 1.0873915943962642, "step": 3260 }, { "epoch": 1.0873915943962642, "ref_ce_loss": 0.09822672605514526, "step": 3260 }, { "epoch": 1.0873915943962642, "loss": 0.4718421697616577, "step": 3260 }, { "ce_loss": 0.16259068250656128, "epoch": 1.0873915943962642, "step": 3260 }, { "distill_loss": 0.15456894040107727, "epoch": 1.0873915943962642, "step": 3260 }, { "epoch": 1.0873915943962642, "ref_ce_loss": 0.1545715481042862, "step": 3260 }, { "epoch": 1.0873915943962642, "loss": 0.6045008897781372, "step": 3260 }, { "ce_loss": 0.2306605726480484, "epoch": 1.0873915943962642, "step": 3260 }, { "distill_loss": 0.13497021794319153, "epoch": 1.0873915943962642, "step": 3260 }, { "epoch": 1.0873915943962642, "ref_ce_loss": 0.10883224755525589, "step": 3260 }, { "epoch": 1.0907271514342896, "loss": 0.6426, "step": 3270 }, { "epoch": 1.0907271514342896, "grad_norm": 2.159749746322632, "step": 3270 }, { "epoch": 1.0907271514342896, "learning_rate": 0.0002911940383376115, "step": 3270 }, { "epoch": 1.0907271514342896, "loss": 0.6150444746017456, "step": 3270 }, { "ce_loss": 0.25478172302246094, "epoch": 1.0907271514342896, "step": 3270 }, { "distill_loss": 0.20271824300289154, "epoch": 1.0907271514342896, "step": 3270 }, { "epoch": 1.0907271514342896, "ref_ce_loss": 0.15745176374912262, "step": 3270 }, { "epoch": 1.0907271514342896, "loss": 0.7662996053695679, "step": 3270 }, { "ce_loss": 0.22442622482776642, "epoch": 1.0907271514342896, "step": 3270 }, { "distill_loss": 0.1579366773366928, "epoch": 1.0907271514342896, "step": 3270 }, { "epoch": 1.0907271514342896, "ref_ce_loss": 0.1851867437362671, "step": 3270 }, { "epoch": 1.0907271514342896, "loss": 0.6150156855583191, "step": 3270 }, { "ce_loss": 0.15814071893692017, "epoch": 1.0907271514342896, "step": 3270 }, { "distill_loss": 0.15518149733543396, "epoch": 1.0907271514342896, "step": 3270 }, { "epoch": 1.0907271514342896, "ref_ce_loss": 0.10483012348413467, "step": 3270 }, { "epoch": 1.0907271514342896, "loss": 0.7023760676383972, "step": 3270 }, { "ce_loss": 0.2654307782649994, "epoch": 1.0907271514342896, "step": 3270 }, { "distill_loss": 0.13676705956459045, "epoch": 1.0907271514342896, "step": 3270 }, { "epoch": 1.0907271514342896, "ref_ce_loss": 0.12032781541347504, "step": 3270 }, { "epoch": 1.094062708472315, "loss": 0.6668, "step": 3280 }, { "epoch": 1.094062708472315, "grad_norm": 2.4643666744232178, "step": 3280 }, { "epoch": 1.094062708472315, "learning_rate": 0.00029112552709457013, "step": 3280 }, { "epoch": 1.094062708472315, "loss": 0.5930442810058594, "step": 3280 }, { "ce_loss": 0.20769517123699188, "epoch": 1.094062708472315, "step": 3280 }, { "distill_loss": 0.1775941401720047, "epoch": 1.094062708472315, "step": 3280 }, { "epoch": 1.094062708472315, "ref_ce_loss": 0.1543862670660019, "step": 3280 }, { "epoch": 1.094062708472315, "loss": 0.845909059047699, "step": 3280 }, { "ce_loss": 0.18115189671516418, "epoch": 1.094062708472315, "step": 3280 }, { "distill_loss": 0.14669832587242126, "epoch": 1.094062708472315, "step": 3280 }, { "epoch": 1.094062708472315, "ref_ce_loss": 0.1459181308746338, "step": 3280 }, { "epoch": 1.094062708472315, "loss": 0.8020659685134888, "step": 3280 }, { "ce_loss": 0.22992858290672302, "epoch": 1.094062708472315, "step": 3280 }, { "distill_loss": 0.18241055309772491, "epoch": 1.094062708472315, "step": 3280 }, { "epoch": 1.094062708472315, "ref_ce_loss": 0.16772453486919403, "step": 3280 }, { "epoch": 1.094062708472315, "loss": 0.637089729309082, "step": 3280 }, { "ce_loss": 0.16514697670936584, "epoch": 1.094062708472315, "step": 3280 }, { "distill_loss": 0.16328474879264832, "epoch": 1.094062708472315, "step": 3280 }, { "epoch": 1.094062708472315, "ref_ce_loss": 0.09549298137426376, "step": 3280 }, { "epoch": 1.0973982655103403, "loss": 0.7284, "step": 3290 }, { "epoch": 1.0973982655103403, "grad_norm": 2.1445436477661133, "step": 3290 }, { "epoch": 1.0973982655103403, "learning_rate": 0.0002910567584942367, "step": 3290 }, { "epoch": 1.0973982655103403, "loss": 0.47283828258514404, "step": 3290 }, { "ce_loss": 0.18235227465629578, "epoch": 1.0973982655103403, "step": 3290 }, { "distill_loss": 0.18419858813285828, "epoch": 1.0973982655103403, "step": 3290 }, { "epoch": 1.0973982655103403, "ref_ce_loss": 0.10618919879198074, "step": 3290 }, { "epoch": 1.0973982655103403, "loss": 0.9133608937263489, "step": 3290 }, { "ce_loss": 0.2051980197429657, "epoch": 1.0973982655103403, "step": 3290 }, { "distill_loss": 0.16253188252449036, "epoch": 1.0973982655103403, "step": 3290 }, { "epoch": 1.0973982655103403, "ref_ce_loss": 0.14432208240032196, "step": 3290 }, { "epoch": 1.0973982655103403, "loss": 0.4479078948497772, "step": 3290 }, { "ce_loss": 0.17836260795593262, "epoch": 1.0973982655103403, "step": 3290 }, { "distill_loss": 0.16010338068008423, "epoch": 1.0973982655103403, "step": 3290 }, { "epoch": 1.0973982655103403, "ref_ce_loss": 0.10920123010873795, "step": 3290 }, { "epoch": 1.0973982655103403, "loss": 0.8367888331413269, "step": 3290 }, { "ce_loss": 0.3513221740722656, "epoch": 1.0973982655103403, "step": 3290 }, { "distill_loss": 0.21813614666461945, "epoch": 1.0973982655103403, "step": 3290 }, { "epoch": 1.0973982655103403, "ref_ce_loss": 0.2671056389808655, "step": 3290 }, { "epoch": 1.1007338225483656, "loss": 0.6781, "step": 3300 }, { "epoch": 1.1007338225483656, "grad_norm": 2.1739988327026367, "step": 3300 }, { "epoch": 1.1007338225483656, "learning_rate": 0.00029098773266201817, "step": 3300 }, { "epoch": 1.1007338225483656, "loss": 0.6908473968505859, "step": 3300 }, { "ce_loss": 0.1812887191772461, "epoch": 1.1007338225483656, "step": 3300 }, { "distill_loss": 0.1621406078338623, "epoch": 1.1007338225483656, "step": 3300 }, { "epoch": 1.1007338225483656, "ref_ce_loss": 0.12778893113136292, "step": 3300 }, { "epoch": 1.1007338225483656, "loss": 0.4360514283180237, "step": 3300 }, { "ce_loss": 0.15996110439300537, "epoch": 1.1007338225483656, "step": 3300 }, { "distill_loss": 0.14277467131614685, "epoch": 1.1007338225483656, "step": 3300 }, { "epoch": 1.1007338225483656, "ref_ce_loss": 0.13183438777923584, "step": 3300 }, { "epoch": 1.1007338225483656, "loss": 0.6263667345046997, "step": 3300 }, { "ce_loss": 0.23108765482902527, "epoch": 1.1007338225483656, "step": 3300 }, { "distill_loss": 0.16686061024665833, "epoch": 1.1007338225483656, "step": 3300 }, { "epoch": 1.1007338225483656, "ref_ce_loss": 0.10735461115837097, "step": 3300 }, { "epoch": 1.1007338225483656, "loss": 0.47359082102775574, "step": 3300 }, { "ce_loss": 0.14491242170333862, "epoch": 1.1007338225483656, "step": 3300 }, { "distill_loss": 0.16404592990875244, "epoch": 1.1007338225483656, "step": 3300 }, { "epoch": 1.1007338225483656, "ref_ce_loss": 0.11639886349439621, "step": 3300 }, { "epoch": 1.104069379586391, "loss": 0.6936, "step": 3310 }, { "epoch": 1.104069379586391, "grad_norm": 2.9527342319488525, "step": 3310 }, { "epoch": 1.104069379586391, "learning_rate": 0.00029091844972379036, "step": 3310 }, { "epoch": 1.104069379586391, "loss": 0.5075188279151917, "step": 3310 }, { "ce_loss": 0.18285036087036133, "epoch": 1.104069379586391, "step": 3310 }, { "distill_loss": 0.11507537215948105, "epoch": 1.104069379586391, "step": 3310 }, { "epoch": 1.104069379586391, "ref_ce_loss": 0.14507053792476654, "step": 3310 }, { "epoch": 1.104069379586391, "loss": 0.6437574028968811, "step": 3310 }, { "ce_loss": 0.3010771572589874, "epoch": 1.104069379586391, "step": 3310 }, { "distill_loss": 0.1596427708864212, "epoch": 1.104069379586391, "step": 3310 }, { "epoch": 1.104069379586391, "ref_ce_loss": 0.1829010546207428, "step": 3310 }, { "epoch": 1.104069379586391, "loss": 0.79887855052948, "step": 3310 }, { "ce_loss": 0.3642679750919342, "epoch": 1.104069379586391, "step": 3310 }, { "distill_loss": 0.13354647159576416, "epoch": 1.104069379586391, "step": 3310 }, { "epoch": 1.104069379586391, "ref_ce_loss": 0.1479581743478775, "step": 3310 }, { "epoch": 1.104069379586391, "loss": 0.6291710734367371, "step": 3310 }, { "ce_loss": 0.2528185546398163, "epoch": 1.104069379586391, "step": 3310 }, { "distill_loss": 0.1336766481399536, "epoch": 1.104069379586391, "step": 3310 }, { "epoch": 1.104069379586391, "ref_ce_loss": 0.19675657153129578, "step": 3310 }, { "epoch": 1.1074049366244163, "loss": 0.675, "step": 3320 }, { "epoch": 1.1074049366244163, "grad_norm": 2.2137176990509033, "step": 3320 }, { "epoch": 1.1074049366244163, "learning_rate": 0.00029084890980589806, "step": 3320 }, { "epoch": 1.1074049366244163, "loss": 0.4275200664997101, "step": 3320 }, { "ce_loss": 0.1844518780708313, "epoch": 1.1074049366244163, "step": 3320 }, { "distill_loss": 0.09774865210056305, "epoch": 1.1074049366244163, "step": 3320 }, { "epoch": 1.1074049366244163, "ref_ce_loss": 0.1450997292995453, "step": 3320 }, { "epoch": 1.1074049366244163, "loss": 0.6636391878128052, "step": 3320 }, { "ce_loss": 0.2548641562461853, "epoch": 1.1074049366244163, "step": 3320 }, { "distill_loss": 0.10373241454362869, "epoch": 1.1074049366244163, "step": 3320 }, { "epoch": 1.1074049366244163, "ref_ce_loss": 0.18854564428329468, "step": 3320 }, { "epoch": 1.1074049366244163, "loss": 0.4774409830570221, "step": 3320 }, { "ce_loss": 0.21203358471393585, "epoch": 1.1074049366244163, "step": 3320 }, { "distill_loss": 0.12920010089874268, "epoch": 1.1074049366244163, "step": 3320 }, { "epoch": 1.1074049366244163, "ref_ce_loss": 0.13605579733848572, "step": 3320 }, { "epoch": 1.1074049366244163, "loss": 0.35023680329322815, "step": 3320 }, { "ce_loss": 0.1279526650905609, "epoch": 1.1074049366244163, "step": 3320 }, { "distill_loss": 0.09370873868465424, "epoch": 1.1074049366244163, "step": 3320 }, { "epoch": 1.1074049366244163, "ref_ce_loss": 0.12837746739387512, "step": 3320 }, { "epoch": 1.1107404936624417, "loss": 0.7074, "step": 3330 }, { "epoch": 1.1107404936624417, "grad_norm": 4.328708171844482, "step": 3330 }, { "epoch": 1.1107404936624417, "learning_rate": 0.0002907791130351547, "step": 3330 }, { "epoch": 1.1107404936624417, "loss": 1.5637755393981934, "step": 3330 }, { "ce_loss": 0.16754838824272156, "epoch": 1.1107404936624417, "step": 3330 }, { "distill_loss": 0.2678470015525818, "epoch": 1.1107404936624417, "step": 3330 }, { "epoch": 1.1107404936624417, "ref_ce_loss": 0.12202798575162888, "step": 3330 }, { "epoch": 1.1107404936624417, "loss": 0.9462152719497681, "step": 3330 }, { "ce_loss": 0.18462923169136047, "epoch": 1.1107404936624417, "step": 3330 }, { "distill_loss": 0.27984893321990967, "epoch": 1.1107404936624417, "step": 3330 }, { "epoch": 1.1107404936624417, "ref_ce_loss": 0.158742293715477, "step": 3330 }, { "epoch": 1.1107404936624417, "loss": 0.7361347079277039, "step": 3330 }, { "ce_loss": 0.20012395083904266, "epoch": 1.1107404936624417, "step": 3330 }, { "distill_loss": 0.23684218525886536, "epoch": 1.1107404936624417, "step": 3330 }, { "epoch": 1.1107404936624417, "ref_ce_loss": 0.18918201327323914, "step": 3330 }, { "epoch": 1.1107404936624417, "loss": 0.7664051055908203, "step": 3330 }, { "ce_loss": 0.28589898347854614, "epoch": 1.1107404936624417, "step": 3330 }, { "distill_loss": 0.3149957060813904, "epoch": 1.1107404936624417, "step": 3330 }, { "epoch": 1.1107404936624417, "ref_ce_loss": 0.1653788983821869, "step": 3330 }, { "epoch": 1.114076050700467, "loss": 0.766, "step": 3340 }, { "epoch": 1.114076050700467, "grad_norm": 3.208272933959961, "step": 3340 }, { "epoch": 1.114076050700467, "learning_rate": 0.0002907090595388419, "step": 3340 }, { "epoch": 1.114076050700467, "loss": 0.55037921667099, "step": 3340 }, { "ce_loss": 0.19427846372127533, "epoch": 1.114076050700467, "step": 3340 }, { "distill_loss": 0.19319778680801392, "epoch": 1.114076050700467, "step": 3340 }, { "epoch": 1.114076050700467, "ref_ce_loss": 0.16233539581298828, "step": 3340 }, { "epoch": 1.114076050700467, "loss": 1.021225929260254, "step": 3340 }, { "ce_loss": 0.23068369925022125, "epoch": 1.114076050700467, "step": 3340 }, { "distill_loss": 0.2348114550113678, "epoch": 1.114076050700467, "step": 3340 }, { "epoch": 1.114076050700467, "ref_ce_loss": 0.1501733660697937, "step": 3340 }, { "epoch": 1.114076050700467, "loss": 0.6855276823043823, "step": 3340 }, { "ce_loss": 0.2387414425611496, "epoch": 1.114076050700467, "step": 3340 }, { "distill_loss": 0.2071400135755539, "epoch": 1.114076050700467, "step": 3340 }, { "epoch": 1.114076050700467, "ref_ce_loss": 0.17829790711402893, "step": 3340 }, { "epoch": 1.114076050700467, "loss": 1.3309962749481201, "step": 3340 }, { "ce_loss": 0.26556396484375, "epoch": 1.114076050700467, "step": 3340 }, { "distill_loss": 0.2766760289669037, "epoch": 1.114076050700467, "step": 3340 }, { "epoch": 1.114076050700467, "ref_ce_loss": 0.11858808249235153, "step": 3340 }, { "epoch": 1.1174116077384924, "loss": 0.722, "step": 3350 }, { "epoch": 1.1174116077384924, "grad_norm": 2.8225748538970947, "step": 3350 }, { "epoch": 1.1174116077384924, "learning_rate": 0.00029063874944470976, "step": 3350 }, { "epoch": 1.1174116077384924, "loss": 0.6001484990119934, "step": 3350 }, { "ce_loss": 0.16739514470100403, "epoch": 1.1174116077384924, "step": 3350 }, { "distill_loss": 0.12116310000419617, "epoch": 1.1174116077384924, "step": 3350 }, { "epoch": 1.1174116077384924, "ref_ce_loss": 0.15140965580940247, "step": 3350 }, { "epoch": 1.1174116077384924, "loss": 0.6058046221733093, "step": 3350 }, { "ce_loss": 0.24347113072872162, "epoch": 1.1174116077384924, "step": 3350 }, { "distill_loss": 0.14185252785682678, "epoch": 1.1174116077384924, "step": 3350 }, { "epoch": 1.1174116077384924, "ref_ce_loss": 0.15476146340370178, "step": 3350 }, { "epoch": 1.1174116077384924, "loss": 0.5068092346191406, "step": 3350 }, { "ce_loss": 0.20585039258003235, "epoch": 1.1174116077384924, "step": 3350 }, { "distill_loss": 0.13100898265838623, "epoch": 1.1174116077384924, "step": 3350 }, { "epoch": 1.1174116077384924, "ref_ce_loss": 0.11952287703752518, "step": 3350 }, { "epoch": 1.1174116077384924, "loss": 0.6194139122962952, "step": 3350 }, { "ce_loss": 0.24892067909240723, "epoch": 1.1174116077384924, "step": 3350 }, { "distill_loss": 0.1232369989156723, "epoch": 1.1174116077384924, "step": 3350 }, { "epoch": 1.1174116077384924, "ref_ce_loss": 0.10848834365606308, "step": 3350 }, { "epoch": 1.1207471647765177, "loss": 0.7006, "step": 3360 }, { "epoch": 1.1207471647765177, "grad_norm": 3.294583797454834, "step": 3360 }, { "epoch": 1.1207471647765177, "learning_rate": 0.00029056818288097604, "step": 3360 }, { "epoch": 1.1207471647765177, "loss": 0.3698619306087494, "step": 3360 }, { "ce_loss": 0.13874569535255432, "epoch": 1.1207471647765177, "step": 3360 }, { "distill_loss": 0.10796372592449188, "epoch": 1.1207471647765177, "step": 3360 }, { "epoch": 1.1207471647765177, "ref_ce_loss": 0.12302211672067642, "step": 3360 }, { "epoch": 1.1207471647765177, "loss": 0.7495459318161011, "step": 3360 }, { "ce_loss": 0.24759386479854584, "epoch": 1.1207471647765177, "step": 3360 }, { "distill_loss": 0.1263791173696518, "epoch": 1.1207471647765177, "step": 3360 }, { "epoch": 1.1207471647765177, "ref_ce_loss": 0.14187084138393402, "step": 3360 }, { "epoch": 1.1207471647765177, "loss": 0.5314804911613464, "step": 3360 }, { "ce_loss": 0.1609436273574829, "epoch": 1.1207471647765177, "step": 3360 }, { "distill_loss": 0.12793494760990143, "epoch": 1.1207471647765177, "step": 3360 }, { "epoch": 1.1207471647765177, "ref_ce_loss": 0.11565695703029633, "step": 3360 }, { "epoch": 1.1207471647765177, "loss": 0.6234934329986572, "step": 3360 }, { "ce_loss": 0.25368544459342957, "epoch": 1.1207471647765177, "step": 3360 }, { "distill_loss": 0.1242745965719223, "epoch": 1.1207471647765177, "step": 3360 }, { "epoch": 1.1207471647765177, "ref_ce_loss": 0.12844549119472504, "step": 3360 }, { "epoch": 1.124082721814543, "loss": 0.6384, "step": 3370 }, { "epoch": 1.124082721814543, "grad_norm": 2.149684190750122, "step": 3370 }, { "epoch": 1.124082721814543, "learning_rate": 0.0002904973599763264, "step": 3370 }, { "epoch": 1.124082721814543, "loss": 0.8979804515838623, "step": 3370 }, { "ce_loss": 0.3260359764099121, "epoch": 1.124082721814543, "step": 3370 }, { "distill_loss": 0.12811465561389923, "epoch": 1.124082721814543, "step": 3370 }, { "epoch": 1.124082721814543, "ref_ce_loss": 0.1349736899137497, "step": 3370 }, { "epoch": 1.124082721814543, "loss": 0.7722697257995605, "step": 3370 }, { "ce_loss": 0.16676156222820282, "epoch": 1.124082721814543, "step": 3370 }, { "distill_loss": 0.11455924808979034, "epoch": 1.124082721814543, "step": 3370 }, { "epoch": 1.124082721814543, "ref_ce_loss": 0.14846940338611603, "step": 3370 }, { "epoch": 1.124082721814543, "loss": 0.7938320636749268, "step": 3370 }, { "ce_loss": 0.19437064230442047, "epoch": 1.124082721814543, "step": 3370 }, { "distill_loss": 0.10303813964128494, "epoch": 1.124082721814543, "step": 3370 }, { "epoch": 1.124082721814543, "ref_ce_loss": 0.1043003499507904, "step": 3370 }, { "epoch": 1.124082721814543, "loss": 0.591460108757019, "step": 3370 }, { "ce_loss": 0.2839815318584442, "epoch": 1.124082721814543, "step": 3370 }, { "distill_loss": 0.12387672066688538, "epoch": 1.124082721814543, "step": 3370 }, { "epoch": 1.124082721814543, "ref_ce_loss": 0.13258396089076996, "step": 3370 }, { "epoch": 1.1274182788525684, "loss": 0.7276, "step": 3380 }, { "epoch": 1.1274182788525684, "grad_norm": 3.415515899658203, "step": 3380 }, { "epoch": 1.1274182788525684, "learning_rate": 0.0002904262808599138, "step": 3380 }, { "epoch": 1.1274182788525684, "loss": 0.6177815794944763, "step": 3380 }, { "ce_loss": 0.1780986338853836, "epoch": 1.1274182788525684, "step": 3380 }, { "distill_loss": 0.09284936636686325, "epoch": 1.1274182788525684, "step": 3380 }, { "epoch": 1.1274182788525684, "ref_ce_loss": 0.15803013741970062, "step": 3380 }, { "epoch": 1.1274182788525684, "loss": 0.5742705464363098, "step": 3380 }, { "ce_loss": 0.22747960686683655, "epoch": 1.1274182788525684, "step": 3380 }, { "distill_loss": 0.08297803997993469, "epoch": 1.1274182788525684, "step": 3380 }, { "epoch": 1.1274182788525684, "ref_ce_loss": 0.17674367129802704, "step": 3380 }, { "epoch": 1.1274182788525684, "loss": 0.27375152707099915, "step": 3380 }, { "ce_loss": 0.09724527597427368, "epoch": 1.1274182788525684, "step": 3380 }, { "distill_loss": 0.06766033172607422, "epoch": 1.1274182788525684, "step": 3380 }, { "epoch": 1.1274182788525684, "ref_ce_loss": 0.1087183803319931, "step": 3380 }, { "epoch": 1.1274182788525684, "loss": 0.6211159229278564, "step": 3380 }, { "ce_loss": 0.27215591073036194, "epoch": 1.1274182788525684, "step": 3380 }, { "distill_loss": 0.08365193009376526, "epoch": 1.1274182788525684, "step": 3380 }, { "epoch": 1.1274182788525684, "ref_ce_loss": 0.19366669654846191, "step": 3380 }, { "epoch": 1.1307538358905938, "loss": 0.5849, "step": 3390 }, { "epoch": 1.1307538358905938, "grad_norm": 2.1435344219207764, "step": 3390 }, { "epoch": 1.1307538358905938, "learning_rate": 0.0002903549456613586, "step": 3390 }, { "epoch": 1.1307538358905938, "loss": 0.570225179195404, "step": 3390 }, { "ce_loss": 0.24740993976593018, "epoch": 1.1307538358905938, "step": 3390 }, { "distill_loss": 0.12046276032924652, "epoch": 1.1307538358905938, "step": 3390 }, { "epoch": 1.1307538358905938, "ref_ce_loss": 0.2011656016111374, "step": 3390 }, { "epoch": 1.1307538358905938, "loss": 0.645056962966919, "step": 3390 }, { "ce_loss": 0.2715945839881897, "epoch": 1.1307538358905938, "step": 3390 }, { "distill_loss": 0.14380735158920288, "epoch": 1.1307538358905938, "step": 3390 }, { "epoch": 1.1307538358905938, "ref_ce_loss": 0.1408105045557022, "step": 3390 }, { "epoch": 1.1307538358905938, "loss": 0.6090614795684814, "step": 3390 }, { "ce_loss": 0.2522716224193573, "epoch": 1.1307538358905938, "step": 3390 }, { "distill_loss": 0.13276003301143646, "epoch": 1.1307538358905938, "step": 3390 }, { "epoch": 1.1307538358905938, "ref_ce_loss": 0.16461263597011566, "step": 3390 }, { "epoch": 1.1307538358905938, "loss": 0.644291877746582, "step": 3390 }, { "ce_loss": 0.2850654423236847, "epoch": 1.1307538358905938, "step": 3390 }, { "distill_loss": 0.1500633805990219, "epoch": 1.1307538358905938, "step": 3390 }, { "epoch": 1.1307538358905938, "ref_ce_loss": 0.1433248519897461, "step": 3390 }, { "epoch": 1.134089392928619, "loss": 0.6568, "step": 3400 }, { "epoch": 1.134089392928619, "grad_norm": 3.319453477859497, "step": 3400 }, { "epoch": 1.134089392928619, "learning_rate": 0.000290283354510748, "step": 3400 }, { "epoch": 1.134089392928619, "loss": 0.6292673945426941, "step": 3400 }, { "ce_loss": 0.2306334227323532, "epoch": 1.134089392928619, "step": 3400 }, { "distill_loss": 0.11475005745887756, "epoch": 1.134089392928619, "step": 3400 }, { "epoch": 1.134089392928619, "ref_ce_loss": 0.10790764540433884, "step": 3400 }, { "epoch": 1.134089392928619, "loss": 0.3763951063156128, "step": 3400 }, { "ce_loss": 0.11027084290981293, "epoch": 1.134089392928619, "step": 3400 }, { "distill_loss": 0.13586261868476868, "epoch": 1.134089392928619, "step": 3400 }, { "epoch": 1.134089392928619, "ref_ce_loss": 0.10516718029975891, "step": 3400 }, { "epoch": 1.134089392928619, "loss": 0.5288718938827515, "step": 3400 }, { "ce_loss": 0.20292605459690094, "epoch": 1.134089392928619, "step": 3400 }, { "distill_loss": 0.13888004422187805, "epoch": 1.134089392928619, "step": 3400 }, { "epoch": 1.134089392928619, "ref_ce_loss": 0.10799039900302887, "step": 3400 }, { "epoch": 1.134089392928619, "loss": 0.7232894897460938, "step": 3400 }, { "ce_loss": 0.24011795222759247, "epoch": 1.134089392928619, "step": 3400 }, { "distill_loss": 0.13609902560710907, "epoch": 1.134089392928619, "step": 3400 }, { "epoch": 1.134089392928619, "ref_ce_loss": 0.1771697998046875, "step": 3400 }, { "epoch": 1.1374249499666444, "loss": 0.689, "step": 3410 }, { "epoch": 1.1374249499666444, "grad_norm": 2.43742036819458, "step": 3410 }, { "epoch": 1.1374249499666444, "learning_rate": 0.00029021150753863614, "step": 3410 }, { "epoch": 1.1374249499666444, "loss": 0.47013843059539795, "step": 3410 }, { "ce_loss": 0.25350990891456604, "epoch": 1.1374249499666444, "step": 3410 }, { "distill_loss": 0.13056300580501556, "epoch": 1.1374249499666444, "step": 3410 }, { "epoch": 1.1374249499666444, "ref_ce_loss": 0.08574430644512177, "step": 3410 }, { "epoch": 1.1374249499666444, "loss": 0.8182771801948547, "step": 3410 }, { "ce_loss": 0.2465936690568924, "epoch": 1.1374249499666444, "step": 3410 }, { "distill_loss": 0.1421625316143036, "epoch": 1.1374249499666444, "step": 3410 }, { "epoch": 1.1374249499666444, "ref_ce_loss": 0.19648271799087524, "step": 3410 }, { "epoch": 1.1374249499666444, "loss": 0.429518461227417, "step": 3410 }, { "ce_loss": 0.16575780510902405, "epoch": 1.1374249499666444, "step": 3410 }, { "distill_loss": 0.11913955211639404, "epoch": 1.1374249499666444, "step": 3410 }, { "epoch": 1.1374249499666444, "ref_ce_loss": 0.08452162146568298, "step": 3410 }, { "epoch": 1.1374249499666444, "loss": 0.5666241645812988, "step": 3410 }, { "ce_loss": 0.1754610687494278, "epoch": 1.1374249499666444, "step": 3410 }, { "distill_loss": 0.1132507249712944, "epoch": 1.1374249499666444, "step": 3410 }, { "epoch": 1.1374249499666444, "ref_ce_loss": 0.18586274981498718, "step": 3410 }, { "epoch": 1.1407605070046698, "loss": 0.6657, "step": 3420 }, { "epoch": 1.1407605070046698, "grad_norm": 2.6489334106445312, "step": 3420 }, { "epoch": 1.1407605070046698, "learning_rate": 0.00029013940487604336, "step": 3420 }, { "epoch": 1.1407605070046698, "loss": 0.7012091875076294, "step": 3420 }, { "ce_loss": 0.319513738155365, "epoch": 1.1407605070046698, "step": 3420 }, { "distill_loss": 0.23633024096488953, "epoch": 1.1407605070046698, "step": 3420 }, { "epoch": 1.1407605070046698, "ref_ce_loss": 0.14517493546009064, "step": 3420 }, { "epoch": 1.1407605070046698, "loss": 1.1926076412200928, "step": 3420 }, { "ce_loss": 0.28267160058021545, "epoch": 1.1407605070046698, "step": 3420 }, { "distill_loss": 0.182927668094635, "epoch": 1.1407605070046698, "step": 3420 }, { "epoch": 1.1407605070046698, "ref_ce_loss": 0.11487416177988052, "step": 3420 }, { "epoch": 1.1407605070046698, "loss": 0.642974317073822, "step": 3420 }, { "ce_loss": 0.19311952590942383, "epoch": 1.1407605070046698, "step": 3420 }, { "distill_loss": 0.14313045144081116, "epoch": 1.1407605070046698, "step": 3420 }, { "epoch": 1.1407605070046698, "ref_ce_loss": 0.16836071014404297, "step": 3420 }, { "epoch": 1.1407605070046698, "loss": 0.6440466046333313, "step": 3420 }, { "ce_loss": 0.2331516444683075, "epoch": 1.1407605070046698, "step": 3420 }, { "distill_loss": 0.1541604846715927, "epoch": 1.1407605070046698, "step": 3420 }, { "epoch": 1.1407605070046698, "ref_ce_loss": 0.16985993087291718, "step": 3420 }, { "epoch": 1.1440960640426951, "loss": 0.8199, "step": 3430 }, { "epoch": 1.1440960640426951, "grad_norm": 2.4301552772521973, "step": 3430 }, { "epoch": 1.1440960640426951, "learning_rate": 0.00029006704665445653, "step": 3430 }, { "epoch": 1.1440960640426951, "loss": 0.7986155152320862, "step": 3430 }, { "ce_loss": 0.16914814710617065, "epoch": 1.1440960640426951, "step": 3430 }, { "distill_loss": 0.2899048626422882, "epoch": 1.1440960640426951, "step": 3430 }, { "epoch": 1.1440960640426951, "ref_ce_loss": 0.14340253174304962, "step": 3430 }, { "epoch": 1.1440960640426951, "loss": 0.8709583282470703, "step": 3430 }, { "ce_loss": 0.23835349082946777, "epoch": 1.1440960640426951, "step": 3430 }, { "distill_loss": 0.3554132282733917, "epoch": 1.1440960640426951, "step": 3430 }, { "epoch": 1.1440960640426951, "ref_ce_loss": 0.13039939105510712, "step": 3430 }, { "epoch": 1.1440960640426951, "loss": 0.6296640634536743, "step": 3430 }, { "ce_loss": 0.1245369091629982, "epoch": 1.1440960640426951, "step": 3430 }, { "distill_loss": 0.27871596813201904, "epoch": 1.1440960640426951, "step": 3430 }, { "epoch": 1.1440960640426951, "ref_ce_loss": 0.11910228431224823, "step": 3430 }, { "epoch": 1.1440960640426951, "loss": 0.9232403039932251, "step": 3430 }, { "ce_loss": 0.13957470655441284, "epoch": 1.1440960640426951, "step": 3430 }, { "distill_loss": 0.36236271262168884, "epoch": 1.1440960640426951, "step": 3430 }, { "epoch": 1.1440960640426951, "ref_ce_loss": 0.12281995266675949, "step": 3430 }, { "epoch": 1.1474316210807205, "loss": 0.7824, "step": 3440 }, { "epoch": 1.1474316210807205, "grad_norm": 2.6593689918518066, "step": 3440 }, { "epoch": 1.1474316210807205, "learning_rate": 0.00028999443300582864, "step": 3440 }, { "epoch": 1.1474316210807205, "loss": 0.6766294836997986, "step": 3440 }, { "ce_loss": 0.2637663185596466, "epoch": 1.1474316210807205, "step": 3440 }, { "distill_loss": 0.21465124189853668, "epoch": 1.1474316210807205, "step": 3440 }, { "epoch": 1.1474316210807205, "ref_ce_loss": 0.14603792130947113, "step": 3440 }, { "epoch": 1.1474316210807205, "loss": 1.1934056282043457, "step": 3440 }, { "ce_loss": 0.3012538552284241, "epoch": 1.1474316210807205, "step": 3440 }, { "distill_loss": 0.19502484798431396, "epoch": 1.1474316210807205, "step": 3440 }, { "epoch": 1.1474316210807205, "ref_ce_loss": 0.1340654343366623, "step": 3440 }, { "epoch": 1.1474316210807205, "loss": 0.9661704301834106, "step": 3440 }, { "ce_loss": 0.2947535812854767, "epoch": 1.1474316210807205, "step": 3440 }, { "distill_loss": 0.17559944093227386, "epoch": 1.1474316210807205, "step": 3440 }, { "epoch": 1.1474316210807205, "ref_ce_loss": 0.18281979858875275, "step": 3440 }, { "epoch": 1.1474316210807205, "loss": 0.6069207191467285, "step": 3440 }, { "ce_loss": 0.2645988166332245, "epoch": 1.1474316210807205, "step": 3440 }, { "distill_loss": 0.1422264575958252, "epoch": 1.1474316210807205, "step": 3440 }, { "epoch": 1.1474316210807205, "ref_ce_loss": 0.200031116604805, "step": 3440 }, { "epoch": 1.1507671781187458, "loss": 0.6981, "step": 3450 }, { "epoch": 1.1507671781187458, "grad_norm": 6.953259468078613, "step": 3450 }, { "epoch": 1.1507671781187458, "learning_rate": 0.0002899215640625782, "step": 3450 }, { "epoch": 1.1507671781187458, "loss": 0.36105650663375854, "step": 3450 }, { "ce_loss": 0.11932425200939178, "epoch": 1.1507671781187458, "step": 3450 }, { "distill_loss": 0.10572926700115204, "epoch": 1.1507671781187458, "step": 3450 }, { "epoch": 1.1507671781187458, "ref_ce_loss": 0.09935622662305832, "step": 3450 }, { "epoch": 1.1507671781187458, "loss": 0.6997066736221313, "step": 3450 }, { "ce_loss": 0.22075653076171875, "epoch": 1.1507671781187458, "step": 3450 }, { "distill_loss": 0.15865163505077362, "epoch": 1.1507671781187458, "step": 3450 }, { "epoch": 1.1507671781187458, "ref_ce_loss": 0.12915155291557312, "step": 3450 }, { "epoch": 1.1507671781187458, "loss": 0.6172825694084167, "step": 3450 }, { "ce_loss": 0.31567999720573425, "epoch": 1.1507671781187458, "step": 3450 }, { "distill_loss": 0.1347445249557495, "epoch": 1.1507671781187458, "step": 3450 }, { "epoch": 1.1507671781187458, "ref_ce_loss": 0.16673196852207184, "step": 3450 }, { "epoch": 1.1507671781187458, "loss": 0.6522892117500305, "step": 3450 }, { "ce_loss": 0.3099440634250641, "epoch": 1.1507671781187458, "step": 3450 }, { "distill_loss": 0.12644734978675842, "epoch": 1.1507671781187458, "step": 3450 }, { "epoch": 1.1507671781187458, "ref_ce_loss": 0.13789407908916473, "step": 3450 }, { "epoch": 1.1541027351567712, "loss": 0.6446, "step": 3460 }, { "epoch": 1.1541027351567712, "grad_norm": 2.391942262649536, "step": 3460 }, { "epoch": 1.1541027351567712, "learning_rate": 0.00028984843995758945, "step": 3460 }, { "epoch": 1.1541027351567712, "loss": 0.6947566270828247, "step": 3460 }, { "ce_loss": 0.23904870450496674, "epoch": 1.1541027351567712, "step": 3460 }, { "distill_loss": 0.1353689730167389, "epoch": 1.1541027351567712, "step": 3460 }, { "epoch": 1.1541027351567712, "ref_ce_loss": 0.15380215644836426, "step": 3460 }, { "epoch": 1.1541027351567712, "loss": 0.42585551738739014, "step": 3460 }, { "ce_loss": 0.15706568956375122, "epoch": 1.1541027351567712, "step": 3460 }, { "distill_loss": 0.1261441856622696, "epoch": 1.1541027351567712, "step": 3460 }, { "epoch": 1.1541027351567712, "ref_ce_loss": 0.14227433502674103, "step": 3460 }, { "epoch": 1.1541027351567712, "loss": 1.389416217803955, "step": 3460 }, { "ce_loss": 0.25617700815200806, "epoch": 1.1541027351567712, "step": 3460 }, { "distill_loss": 0.14031198620796204, "epoch": 1.1541027351567712, "step": 3460 }, { "epoch": 1.1541027351567712, "ref_ce_loss": 0.15955011546611786, "step": 3460 }, { "epoch": 1.1541027351567712, "loss": 0.38896113634109497, "step": 3460 }, { "ce_loss": 0.17260347306728363, "epoch": 1.1541027351567712, "step": 3460 }, { "distill_loss": 0.11305748671293259, "epoch": 1.1541027351567712, "step": 3460 }, { "epoch": 1.1541027351567712, "ref_ce_loss": 0.10322257876396179, "step": 3460 }, { "epoch": 1.1574382921947965, "loss": 0.616, "step": 3470 }, { "epoch": 1.1574382921947965, "grad_norm": 2.23102068901062, "step": 3470 }, { "epoch": 1.1574382921947965, "learning_rate": 0.0002897750608242119, "step": 3470 }, { "epoch": 1.1574382921947965, "loss": 0.5251259207725525, "step": 3470 }, { "ce_loss": 0.21399055421352386, "epoch": 1.1574382921947965, "step": 3470 }, { "distill_loss": 0.12265275418758392, "epoch": 1.1574382921947965, "step": 3470 }, { "epoch": 1.1574382921947965, "ref_ce_loss": 0.1231919601559639, "step": 3470 }, { "epoch": 1.1574382921947965, "loss": 1.4573123455047607, "step": 3470 }, { "ce_loss": 0.46214139461517334, "epoch": 1.1574382921947965, "step": 3470 }, { "distill_loss": 0.15091703832149506, "epoch": 1.1574382921947965, "step": 3470 }, { "epoch": 1.1574382921947965, "ref_ce_loss": 0.18401379883289337, "step": 3470 }, { "epoch": 1.1574382921947965, "loss": 0.4763915538787842, "step": 3470 }, { "ce_loss": 0.18862882256507874, "epoch": 1.1574382921947965, "step": 3470 }, { "distill_loss": 0.08673227578401566, "epoch": 1.1574382921947965, "step": 3470 }, { "epoch": 1.1574382921947965, "ref_ce_loss": 0.10439599305391312, "step": 3470 }, { "epoch": 1.1574382921947965, "loss": 1.18876314163208, "step": 3470 }, { "ce_loss": 0.3029150664806366, "epoch": 1.1574382921947965, "step": 3470 }, { "distill_loss": 0.11280804872512817, "epoch": 1.1574382921947965, "step": 3470 }, { "epoch": 1.1574382921947965, "ref_ce_loss": 0.16956168413162231, "step": 3470 }, { "epoch": 1.160773849232822, "loss": 0.678, "step": 3480 }, { "epoch": 1.160773849232822, "grad_norm": 2.679671287536621, "step": 3480 }, { "epoch": 1.160773849232822, "learning_rate": 0.00028970142679626024, "step": 3480 }, { "epoch": 1.160773849232822, "loss": 0.4567927122116089, "step": 3480 }, { "ce_loss": 0.14957164227962494, "epoch": 1.160773849232822, "step": 3480 }, { "distill_loss": 0.15318655967712402, "epoch": 1.160773849232822, "step": 3480 }, { "epoch": 1.160773849232822, "ref_ce_loss": 0.10267200320959091, "step": 3480 }, { "epoch": 1.160773849232822, "loss": 0.5889483094215393, "step": 3480 }, { "ce_loss": 0.21724718809127808, "epoch": 1.160773849232822, "step": 3480 }, { "distill_loss": 0.1829308122396469, "epoch": 1.160773849232822, "step": 3480 }, { "epoch": 1.160773849232822, "ref_ce_loss": 0.0946304053068161, "step": 3480 }, { "epoch": 1.160773849232822, "loss": 0.9133307933807373, "step": 3480 }, { "ce_loss": 0.40971285104751587, "epoch": 1.160773849232822, "step": 3480 }, { "distill_loss": 0.2387247383594513, "epoch": 1.160773849232822, "step": 3480 }, { "epoch": 1.160773849232822, "ref_ce_loss": 0.21331532299518585, "step": 3480 }, { "epoch": 1.160773849232822, "loss": 0.8961673974990845, "step": 3480 }, { "ce_loss": 0.2807924449443817, "epoch": 1.160773849232822, "step": 3480 }, { "distill_loss": 0.22617176175117493, "epoch": 1.160773849232822, "step": 3480 }, { "epoch": 1.160773849232822, "ref_ce_loss": 0.16985855996608734, "step": 3480 }, { "epoch": 1.1641094062708472, "loss": 0.63, "step": 3490 }, { "epoch": 1.1641094062708472, "grad_norm": 2.3964879512786865, "step": 3490 }, { "epoch": 1.1641094062708472, "learning_rate": 0.00028962753800801383, "step": 3490 }, { "epoch": 1.1641094062708472, "loss": 1.1051898002624512, "step": 3490 }, { "ce_loss": 0.18541871011257172, "epoch": 1.1641094062708472, "step": 3490 }, { "distill_loss": 0.09360478073358536, "epoch": 1.1641094062708472, "step": 3490 }, { "epoch": 1.1641094062708472, "ref_ce_loss": 0.08527404069900513, "step": 3490 }, { "epoch": 1.1641094062708472, "loss": 0.6198468804359436, "step": 3490 }, { "ce_loss": 0.24808526039123535, "epoch": 1.1641094062708472, "step": 3490 }, { "distill_loss": 0.10804644227027893, "epoch": 1.1641094062708472, "step": 3490 }, { "epoch": 1.1641094062708472, "ref_ce_loss": 0.1124085932970047, "step": 3490 }, { "epoch": 1.1641094062708472, "loss": 0.928071141242981, "step": 3490 }, { "ce_loss": 0.26359590888023376, "epoch": 1.1641094062708472, "step": 3490 }, { "distill_loss": 0.13496239483356476, "epoch": 1.1641094062708472, "step": 3490 }, { "epoch": 1.1641094062708472, "ref_ce_loss": 0.12686119973659515, "step": 3490 }, { "epoch": 1.1641094062708472, "loss": 0.7552123665809631, "step": 3490 }, { "ce_loss": 0.23390626907348633, "epoch": 1.1641094062708472, "step": 3490 }, { "distill_loss": 0.09438870847225189, "epoch": 1.1641094062708472, "step": 3490 }, { "epoch": 1.1641094062708472, "ref_ce_loss": 0.14667364954948425, "step": 3490 }, { "epoch": 1.1674449633088726, "loss": 0.6156, "step": 3500 }, { "epoch": 1.1674449633088726, "grad_norm": 2.5168838500976562, "step": 3500 }, { "epoch": 1.1674449633088726, "learning_rate": 0.0002895533945942166, "step": 3500 }, { "epoch": 1.1674449633088726, "loss": 0.45355695486068726, "step": 3500 }, { "ce_loss": 0.18383370339870453, "epoch": 1.1674449633088726, "step": 3500 }, { "distill_loss": 0.09112045913934708, "epoch": 1.1674449633088726, "step": 3500 }, { "epoch": 1.1674449633088726, "ref_ce_loss": 0.1282046139240265, "step": 3500 }, { "epoch": 1.1674449633088726, "loss": 0.5794343948364258, "step": 3500 }, { "ce_loss": 0.26721489429473877, "epoch": 1.1674449633088726, "step": 3500 }, { "distill_loss": 0.11259608715772629, "epoch": 1.1674449633088726, "step": 3500 }, { "epoch": 1.1674449633088726, "ref_ce_loss": 0.19929681718349457, "step": 3500 }, { "epoch": 1.1674449633088726, "loss": 1.3107562065124512, "step": 3500 }, { "ce_loss": 0.22610430419445038, "epoch": 1.1674449633088726, "step": 3500 }, { "distill_loss": 0.09464081376791, "epoch": 1.1674449633088726, "step": 3500 }, { "epoch": 1.1674449633088726, "ref_ce_loss": 0.1893109828233719, "step": 3500 }, { "epoch": 1.1674449633088726, "loss": 0.7660556435585022, "step": 3500 }, { "ce_loss": 0.2392236292362213, "epoch": 1.1674449633088726, "step": 3500 }, { "distill_loss": 0.09696606546640396, "epoch": 1.1674449633088726, "step": 3500 }, { "epoch": 1.1674449633088726, "ref_ce_loss": 0.17173665761947632, "step": 3500 }, { "epoch": 1.170780520346898, "loss": 0.641, "step": 3510 }, { "epoch": 1.170780520346898, "grad_norm": 2.7460596561431885, "step": 3510 }, { "epoch": 1.170780520346898, "learning_rate": 0.000289478996690077, "step": 3510 }, { "epoch": 1.170780520346898, "loss": 0.7444456815719604, "step": 3510 }, { "ce_loss": 0.2830856442451477, "epoch": 1.170780520346898, "step": 3510 }, { "distill_loss": 0.10403694212436676, "epoch": 1.170780520346898, "step": 3510 }, { "epoch": 1.170780520346898, "ref_ce_loss": 0.12975654006004333, "step": 3510 }, { "epoch": 1.170780520346898, "loss": 0.65533047914505, "step": 3510 }, { "ce_loss": 0.21140718460083008, "epoch": 1.170780520346898, "step": 3510 }, { "distill_loss": 0.09631803631782532, "epoch": 1.170780520346898, "step": 3510 }, { "epoch": 1.170780520346898, "ref_ce_loss": 0.13436973094940186, "step": 3510 }, { "epoch": 1.170780520346898, "loss": 1.00142240524292, "step": 3510 }, { "ce_loss": 0.24279353022575378, "epoch": 1.170780520346898, "step": 3510 }, { "distill_loss": 0.10590619593858719, "epoch": 1.170780520346898, "step": 3510 }, { "epoch": 1.170780520346898, "ref_ce_loss": 0.13143108785152435, "step": 3510 }, { "epoch": 1.170780520346898, "loss": 0.5753421783447266, "step": 3510 }, { "ce_loss": 0.21339942514896393, "epoch": 1.170780520346898, "step": 3510 }, { "distill_loss": 0.0863531157374382, "epoch": 1.170780520346898, "step": 3510 }, { "epoch": 1.170780520346898, "ref_ce_loss": 0.17893031239509583, "step": 3510 }, { "epoch": 1.1741160773849233, "loss": 0.619, "step": 3520 }, { "epoch": 1.1741160773849233, "grad_norm": 4.942493438720703, "step": 3520 }, { "epoch": 1.1741160773849233, "learning_rate": 0.00028940434443126736, "step": 3520 }, { "epoch": 1.1741160773849233, "loss": 0.44487467408180237, "step": 3520 }, { "ce_loss": 0.19608616828918457, "epoch": 1.1741160773849233, "step": 3520 }, { "distill_loss": 0.10286536067724228, "epoch": 1.1741160773849233, "step": 3520 }, { "epoch": 1.1741160773849233, "ref_ce_loss": 0.0929742306470871, "step": 3520 }, { "epoch": 1.1741160773849233, "loss": 1.1735808849334717, "step": 3520 }, { "ce_loss": 0.1759229153394699, "epoch": 1.1741160773849233, "step": 3520 }, { "distill_loss": 0.11952294409275055, "epoch": 1.1741160773849233, "step": 3520 }, { "epoch": 1.1741160773849233, "ref_ce_loss": 0.20624127984046936, "step": 3520 }, { "epoch": 1.1741160773849233, "loss": 0.6996158957481384, "step": 3520 }, { "ce_loss": 0.1959640234708786, "epoch": 1.1741160773849233, "step": 3520 }, { "distill_loss": 0.0844564139842987, "epoch": 1.1741160773849233, "step": 3520 }, { "epoch": 1.1741160773849233, "ref_ce_loss": 0.08976216614246368, "step": 3520 }, { "epoch": 1.1741160773849233, "loss": 0.4972955286502838, "step": 3520 }, { "ce_loss": 0.22896063327789307, "epoch": 1.1741160773849233, "step": 3520 }, { "distill_loss": 0.12463532388210297, "epoch": 1.1741160773849233, "step": 3520 }, { "epoch": 1.1741160773849233, "ref_ce_loss": 0.1436271220445633, "step": 3520 }, { "epoch": 1.1774516344229486, "loss": 0.7453, "step": 3530 }, { "epoch": 1.1774516344229486, "grad_norm": 3.2423369884490967, "step": 3530 }, { "epoch": 1.1774516344229486, "learning_rate": 0.00028932943795392406, "step": 3530 }, { "epoch": 1.1774516344229486, "loss": 0.9250675439834595, "step": 3530 }, { "ce_loss": 0.2386307269334793, "epoch": 1.1774516344229486, "step": 3530 }, { "distill_loss": 0.24092139303684235, "epoch": 1.1774516344229486, "step": 3530 }, { "epoch": 1.1774516344229486, "ref_ce_loss": 0.1675252616405487, "step": 3530 }, { "epoch": 1.1774516344229486, "loss": 0.5805863738059998, "step": 3530 }, { "ce_loss": 0.231970876455307, "epoch": 1.1774516344229486, "step": 3530 }, { "distill_loss": 0.23430107533931732, "epoch": 1.1774516344229486, "step": 3530 }, { "epoch": 1.1774516344229486, "ref_ce_loss": 0.11357176303863525, "step": 3530 }, { "epoch": 1.1774516344229486, "loss": 0.8791962265968323, "step": 3530 }, { "ce_loss": 0.26668962836265564, "epoch": 1.1774516344229486, "step": 3530 }, { "distill_loss": 0.26670536398887634, "epoch": 1.1774516344229486, "step": 3530 }, { "epoch": 1.1774516344229486, "ref_ce_loss": 0.15831813216209412, "step": 3530 }, { "epoch": 1.1774516344229486, "loss": 0.5398500561714172, "step": 3530 }, { "ce_loss": 0.20641468465328217, "epoch": 1.1774516344229486, "step": 3530 }, { "distill_loss": 0.1799340546131134, "epoch": 1.1774516344229486, "step": 3530 }, { "epoch": 1.1774516344229486, "ref_ce_loss": 0.1532498002052307, "step": 3530 }, { "epoch": 1.180787191460974, "loss": 0.717, "step": 3540 }, { "epoch": 1.180787191460974, "grad_norm": 2.5336835384368896, "step": 3540 }, { "epoch": 1.180787191460974, "learning_rate": 0.0002892542773946468, "step": 3540 }, { "epoch": 1.180787191460974, "loss": 0.8210991621017456, "step": 3540 }, { "ce_loss": 0.19714991748332977, "epoch": 1.180787191460974, "step": 3540 }, { "distill_loss": 0.20616674423217773, "epoch": 1.180787191460974, "step": 3540 }, { "epoch": 1.180787191460974, "ref_ce_loss": 0.13417214155197144, "step": 3540 }, { "epoch": 1.180787191460974, "loss": 1.0714623928070068, "step": 3540 }, { "ce_loss": 0.37166714668273926, "epoch": 1.180787191460974, "step": 3540 }, { "distill_loss": 0.2562217712402344, "epoch": 1.180787191460974, "step": 3540 }, { "epoch": 1.180787191460974, "ref_ce_loss": 0.17911586165428162, "step": 3540 }, { "epoch": 1.180787191460974, "loss": 0.5534677505493164, "step": 3540 }, { "ce_loss": 0.19591966271400452, "epoch": 1.180787191460974, "step": 3540 }, { "distill_loss": 0.17593654990196228, "epoch": 1.180787191460974, "step": 3540 }, { "epoch": 1.180787191460974, "ref_ce_loss": 0.18121853470802307, "step": 3540 }, { "epoch": 1.180787191460974, "loss": 0.8016934394836426, "step": 3540 }, { "ce_loss": 0.3819798231124878, "epoch": 1.180787191460974, "step": 3540 }, { "distill_loss": 0.22713986039161682, "epoch": 1.180787191460974, "step": 3540 }, { "epoch": 1.180787191460974, "ref_ce_loss": 0.19177654385566711, "step": 3540 }, { "epoch": 1.1841227484989993, "loss": 0.7635, "step": 3550 }, { "epoch": 1.1841227484989993, "grad_norm": 5.0213165283203125, "step": 3550 }, { "epoch": 1.1841227484989993, "learning_rate": 0.00028917886289049903, "step": 3550 }, { "epoch": 1.1841227484989993, "loss": 0.651298463344574, "step": 3550 }, { "ce_loss": 0.19940482079982758, "epoch": 1.1841227484989993, "step": 3550 }, { "distill_loss": 0.23429922759532928, "epoch": 1.1841227484989993, "step": 3550 }, { "epoch": 1.1841227484989993, "ref_ce_loss": 0.13923677802085876, "step": 3550 }, { "epoch": 1.1841227484989993, "loss": 0.7713093161582947, "step": 3550 }, { "ce_loss": 0.30344629287719727, "epoch": 1.1841227484989993, "step": 3550 }, { "distill_loss": 0.22498206794261932, "epoch": 1.1841227484989993, "step": 3550 }, { "epoch": 1.1841227484989993, "ref_ce_loss": 0.18815527856349945, "step": 3550 }, { "epoch": 1.1841227484989993, "loss": 0.6081314086914062, "step": 3550 }, { "ce_loss": 0.2188054770231247, "epoch": 1.1841227484989993, "step": 3550 }, { "distill_loss": 0.18806442618370056, "epoch": 1.1841227484989993, "step": 3550 }, { "epoch": 1.1841227484989993, "ref_ce_loss": 0.134329155087471, "step": 3550 }, { "epoch": 1.1841227484989993, "loss": 0.7301296591758728, "step": 3550 }, { "ce_loss": 0.284285306930542, "epoch": 1.1841227484989993, "step": 3550 }, { "distill_loss": 0.27022460103034973, "epoch": 1.1841227484989993, "step": 3550 }, { "epoch": 1.1841227484989993, "ref_ce_loss": 0.17183545231819153, "step": 3550 }, { "epoch": 1.1874583055370247, "loss": 0.779, "step": 3560 }, { "epoch": 1.1874583055370247, "grad_norm": 2.13301682472229, "step": 3560 }, { "epoch": 1.1874583055370247, "learning_rate": 0.00028910319457900685, "step": 3560 }, { "epoch": 1.1874583055370247, "loss": 0.5808578133583069, "step": 3560 }, { "ce_loss": 0.1893155872821808, "epoch": 1.1874583055370247, "step": 3560 }, { "distill_loss": 0.14360655844211578, "epoch": 1.1874583055370247, "step": 3560 }, { "epoch": 1.1874583055370247, "ref_ce_loss": 0.0961495190858841, "step": 3560 }, { "epoch": 1.1874583055370247, "loss": 0.42147135734558105, "step": 3560 }, { "ce_loss": 0.17312490940093994, "epoch": 1.1874583055370247, "step": 3560 }, { "distill_loss": 0.133303701877594, "epoch": 1.1874583055370247, "step": 3560 }, { "epoch": 1.1874583055370247, "ref_ce_loss": 0.114800825715065, "step": 3560 }, { "epoch": 1.1874583055370247, "loss": 0.6292843818664551, "step": 3560 }, { "ce_loss": 0.2685680687427521, "epoch": 1.1874583055370247, "step": 3560 }, { "distill_loss": 0.17225492000579834, "epoch": 1.1874583055370247, "step": 3560 }, { "epoch": 1.1874583055370247, "ref_ce_loss": 0.12113097310066223, "step": 3560 }, { "epoch": 1.1874583055370247, "loss": 1.1248754262924194, "step": 3560 }, { "ce_loss": 0.24754074215888977, "epoch": 1.1874583055370247, "step": 3560 }, { "distill_loss": 0.16874170303344727, "epoch": 1.1874583055370247, "step": 3560 }, { "epoch": 1.1874583055370247, "ref_ce_loss": 0.23063063621520996, "step": 3560 }, { "epoch": 1.19079386257505, "loss": 0.6905, "step": 3570 }, { "epoch": 1.19079386257505, "grad_norm": 5.3514580726623535, "step": 3570 }, { "epoch": 1.19079386257505, "learning_rate": 0.00028902727259815956, "step": 3570 }, { "epoch": 1.19079386257505, "loss": 0.6627309322357178, "step": 3570 }, { "ce_loss": 0.31900355219841003, "epoch": 1.19079386257505, "step": 3570 }, { "distill_loss": 0.12173722684383392, "epoch": 1.19079386257505, "step": 3570 }, { "epoch": 1.19079386257505, "ref_ce_loss": 0.1653222143650055, "step": 3570 }, { "epoch": 1.19079386257505, "loss": 0.687881350517273, "step": 3570 }, { "ce_loss": 0.29725533723831177, "epoch": 1.19079386257505, "step": 3570 }, { "distill_loss": 0.14126135408878326, "epoch": 1.19079386257505, "step": 3570 }, { "epoch": 1.19079386257505, "ref_ce_loss": 0.19328750669956207, "step": 3570 }, { "epoch": 1.19079386257505, "loss": 0.44334492087364197, "step": 3570 }, { "ce_loss": 0.1040029302239418, "epoch": 1.19079386257505, "step": 3570 }, { "distill_loss": 0.09804385900497437, "epoch": 1.19079386257505, "step": 3570 }, { "epoch": 1.19079386257505, "ref_ce_loss": 0.09700576961040497, "step": 3570 }, { "epoch": 1.19079386257505, "loss": 0.797677755355835, "step": 3570 }, { "ce_loss": 0.16014978289604187, "epoch": 1.19079386257505, "step": 3570 }, { "distill_loss": 0.11923857778310776, "epoch": 1.19079386257505, "step": 3570 }, { "epoch": 1.19079386257505, "ref_ce_loss": 0.08255496621131897, "step": 3570 }, { "epoch": 1.1941294196130754, "loss": 0.6399, "step": 3580 }, { "epoch": 1.1941294196130754, "grad_norm": 2.4868955612182617, "step": 3580 }, { "epoch": 1.1941294196130754, "learning_rate": 0.00028895109708640876, "step": 3580 }, { "epoch": 1.1941294196130754, "loss": 0.7460382580757141, "step": 3580 }, { "ce_loss": 0.16989870369434357, "epoch": 1.1941294196130754, "step": 3580 }, { "distill_loss": 0.09856413304805756, "epoch": 1.1941294196130754, "step": 3580 }, { "epoch": 1.1941294196130754, "ref_ce_loss": 0.09345860034227371, "step": 3580 }, { "epoch": 1.1941294196130754, "loss": 0.9039356112480164, "step": 3580 }, { "ce_loss": 0.35457083582878113, "epoch": 1.1941294196130754, "step": 3580 }, { "distill_loss": 0.15812674164772034, "epoch": 1.1941294196130754, "step": 3580 }, { "epoch": 1.1941294196130754, "ref_ce_loss": 0.18375664949417114, "step": 3580 }, { "epoch": 1.1941294196130754, "loss": 0.6958930492401123, "step": 3580 }, { "ce_loss": 0.24443304538726807, "epoch": 1.1941294196130754, "step": 3580 }, { "distill_loss": 0.11708992719650269, "epoch": 1.1941294196130754, "step": 3580 }, { "epoch": 1.1941294196130754, "ref_ce_loss": 0.14090929925441742, "step": 3580 }, { "epoch": 1.1941294196130754, "loss": 0.7182627320289612, "step": 3580 }, { "ce_loss": 0.20644690096378326, "epoch": 1.1941294196130754, "step": 3580 }, { "distill_loss": 0.12384609133005142, "epoch": 1.1941294196130754, "step": 3580 }, { "epoch": 1.1941294196130754, "ref_ce_loss": 0.1659153550863266, "step": 3580 }, { "epoch": 1.1974649766511007, "loss": 0.6628, "step": 3590 }, { "epoch": 1.1974649766511007, "grad_norm": 29.785736083984375, "step": 3590 }, { "epoch": 1.1974649766511007, "learning_rate": 0.00028887466818266865, "step": 3590 }, { "epoch": 1.1974649766511007, "loss": 0.4725848436355591, "step": 3590 }, { "ce_loss": 0.25671958923339844, "epoch": 1.1974649766511007, "step": 3590 }, { "distill_loss": 0.08832456171512604, "epoch": 1.1974649766511007, "step": 3590 }, { "epoch": 1.1974649766511007, "ref_ce_loss": 0.1264975517988205, "step": 3590 }, { "epoch": 1.1974649766511007, "loss": 1.0542960166931152, "step": 3590 }, { "ce_loss": 0.29440543055534363, "epoch": 1.1974649766511007, "step": 3590 }, { "distill_loss": 0.11251768469810486, "epoch": 1.1974649766511007, "step": 3590 }, { "epoch": 1.1974649766511007, "ref_ce_loss": 0.20507730543613434, "step": 3590 }, { "epoch": 1.1974649766511007, "loss": 0.5612708330154419, "step": 3590 }, { "ce_loss": 0.13027560710906982, "epoch": 1.1974649766511007, "step": 3590 }, { "distill_loss": 0.08713222295045853, "epoch": 1.1974649766511007, "step": 3590 }, { "epoch": 1.1974649766511007, "ref_ce_loss": 0.09830352663993835, "step": 3590 }, { "epoch": 1.1974649766511007, "loss": 0.5265268683433533, "step": 3590 }, { "ce_loss": 0.22897528111934662, "epoch": 1.1974649766511007, "step": 3590 }, { "distill_loss": 0.10633121430873871, "epoch": 1.1974649766511007, "step": 3590 }, { "epoch": 1.1974649766511007, "ref_ce_loss": 0.12552028894424438, "step": 3590 }, { "epoch": 1.200800533689126, "loss": 0.6444, "step": 3600 }, { "epoch": 1.200800533689126, "grad_norm": 2.3796160221099854, "step": 3600 }, { "epoch": 1.200800533689126, "learning_rate": 0.00028879798602631537, "step": 3600 }, { "epoch": 1.200800533689126, "loss": 0.6752573847770691, "step": 3600 }, { "ce_loss": 0.22611364722251892, "epoch": 1.200800533689126, "step": 3600 }, { "distill_loss": 0.11424873024225235, "epoch": 1.200800533689126, "step": 3600 }, { "epoch": 1.200800533689126, "ref_ce_loss": 0.15618790686130524, "step": 3600 }, { "epoch": 1.200800533689126, "loss": 0.7464491128921509, "step": 3600 }, { "ce_loss": 0.21239803731441498, "epoch": 1.200800533689126, "step": 3600 }, { "distill_loss": 0.10173000395298004, "epoch": 1.200800533689126, "step": 3600 }, { "epoch": 1.200800533689126, "ref_ce_loss": 0.11320611089468002, "step": 3600 }, { "epoch": 1.200800533689126, "loss": 0.739050030708313, "step": 3600 }, { "ce_loss": 0.23679211735725403, "epoch": 1.200800533689126, "step": 3600 }, { "distill_loss": 0.11317134648561478, "epoch": 1.200800533689126, "step": 3600 }, { "epoch": 1.200800533689126, "ref_ce_loss": 0.14366133511066437, "step": 3600 }, { "epoch": 1.200800533689126, "loss": 0.4201709032058716, "step": 3600 }, { "ce_loss": 0.1703682392835617, "epoch": 1.200800533689126, "step": 3600 }, { "distill_loss": 0.12370355427265167, "epoch": 1.200800533689126, "step": 3600 }, { "epoch": 1.200800533689126, "ref_ce_loss": 0.1257946789264679, "step": 3600 }, { "epoch": 1.2041360907271514, "loss": 0.6474, "step": 3610 }, { "epoch": 1.2041360907271514, "grad_norm": 13.404465675354004, "step": 3610 }, { "epoch": 1.2041360907271514, "learning_rate": 0.0002887210507571869, "step": 3610 }, { "epoch": 1.2041360907271514, "loss": 0.49275848269462585, "step": 3610 }, { "ce_loss": 0.24199196696281433, "epoch": 1.2041360907271514, "step": 3610 }, { "distill_loss": 0.09797633439302444, "epoch": 1.2041360907271514, "step": 3610 }, { "epoch": 1.2041360907271514, "ref_ce_loss": 0.1525280922651291, "step": 3610 }, { "epoch": 1.2041360907271514, "loss": 0.5640525221824646, "step": 3610 }, { "ce_loss": 0.19955472648143768, "epoch": 1.2041360907271514, "step": 3610 }, { "distill_loss": 0.11258351802825928, "epoch": 1.2041360907271514, "step": 3610 }, { "epoch": 1.2041360907271514, "ref_ce_loss": 0.14966405928134918, "step": 3610 }, { "epoch": 1.2041360907271514, "loss": 0.48773708939552307, "step": 3610 }, { "ce_loss": 0.2079242616891861, "epoch": 1.2041360907271514, "step": 3610 }, { "distill_loss": 0.12160438299179077, "epoch": 1.2041360907271514, "step": 3610 }, { "epoch": 1.2041360907271514, "ref_ce_loss": 0.15779396891593933, "step": 3610 }, { "epoch": 1.2041360907271514, "loss": 0.6037769317626953, "step": 3610 }, { "ce_loss": 0.28633320331573486, "epoch": 1.2041360907271514, "step": 3610 }, { "distill_loss": 0.12164897471666336, "epoch": 1.2041360907271514, "step": 3610 }, { "epoch": 1.2041360907271514, "ref_ce_loss": 0.1462378203868866, "step": 3610 }, { "epoch": 1.2074716477651768, "loss": 0.6433, "step": 3620 }, { "epoch": 1.2074716477651768, "grad_norm": 2.566305160522461, "step": 3620 }, { "epoch": 1.2074716477651768, "learning_rate": 0.0002886438625155828, "step": 3620 }, { "epoch": 1.2074716477651768, "loss": 0.9501935243606567, "step": 3620 }, { "ce_loss": 0.23689863085746765, "epoch": 1.2074716477651768, "step": 3620 }, { "distill_loss": 0.14002355933189392, "epoch": 1.2074716477651768, "step": 3620 }, { "epoch": 1.2074716477651768, "ref_ce_loss": 0.12740619480609894, "step": 3620 }, { "epoch": 1.2074716477651768, "loss": 0.5629335641860962, "step": 3620 }, { "ce_loss": 0.16847597062587738, "epoch": 1.2074716477651768, "step": 3620 }, { "distill_loss": 0.0989687517285347, "epoch": 1.2074716477651768, "step": 3620 }, { "epoch": 1.2074716477651768, "ref_ce_loss": 0.18997547030448914, "step": 3620 }, { "epoch": 1.2074716477651768, "loss": 0.47026073932647705, "step": 3620 }, { "ce_loss": 0.21434803307056427, "epoch": 1.2074716477651768, "step": 3620 }, { "distill_loss": 0.10124547779560089, "epoch": 1.2074716477651768, "step": 3620 }, { "epoch": 1.2074716477651768, "ref_ce_loss": 0.15439580380916595, "step": 3620 }, { "epoch": 1.2074716477651768, "loss": 0.9181081652641296, "step": 3620 }, { "ce_loss": 0.34848782420158386, "epoch": 1.2074716477651768, "step": 3620 }, { "distill_loss": 0.12178175896406174, "epoch": 1.2074716477651768, "step": 3620 }, { "epoch": 1.2074716477651768, "ref_ce_loss": 0.20156177878379822, "step": 3620 }, { "epoch": 1.2108072048032021, "loss": 0.5965, "step": 3630 }, { "epoch": 1.2108072048032021, "grad_norm": 3.8586368560791016, "step": 3630 }, { "epoch": 1.2108072048032021, "learning_rate": 0.0002885664214422641, "step": 3630 }, { "epoch": 1.2108072048032021, "loss": 0.8446038961410522, "step": 3630 }, { "ce_loss": 0.2831612229347229, "epoch": 1.2108072048032021, "step": 3630 }, { "distill_loss": 0.117464579641819, "epoch": 1.2108072048032021, "step": 3630 }, { "epoch": 1.2108072048032021, "ref_ce_loss": 0.15272650122642517, "step": 3630 }, { "epoch": 1.2108072048032021, "loss": 0.6270732283592224, "step": 3630 }, { "ce_loss": 0.16092394292354584, "epoch": 1.2108072048032021, "step": 3630 }, { "distill_loss": 0.07629981637001038, "epoch": 1.2108072048032021, "step": 3630 }, { "epoch": 1.2108072048032021, "ref_ce_loss": 0.19512324035167694, "step": 3630 }, { "epoch": 1.2108072048032021, "loss": 1.1022748947143555, "step": 3630 }, { "ce_loss": 0.24730761349201202, "epoch": 1.2108072048032021, "step": 3630 }, { "distill_loss": 0.11945497244596481, "epoch": 1.2108072048032021, "step": 3630 }, { "epoch": 1.2108072048032021, "ref_ce_loss": 0.13521139323711395, "step": 3630 }, { "epoch": 1.2108072048032021, "loss": 0.43340691924095154, "step": 3630 }, { "ce_loss": 0.1800413727760315, "epoch": 1.2108072048032021, "step": 3630 }, { "distill_loss": 0.0931423231959343, "epoch": 1.2108072048032021, "step": 3630 }, { "epoch": 1.2108072048032021, "ref_ce_loss": 0.15474474430084229, "step": 3630 }, { "epoch": 1.2141427618412275, "loss": 0.5921, "step": 3640 }, { "epoch": 1.2141427618412275, "grad_norm": 3.449916362762451, "step": 3640 }, { "epoch": 1.2141427618412275, "learning_rate": 0.0002884887276784526, "step": 3640 }, { "epoch": 1.2141427618412275, "loss": 0.4230595827102661, "step": 3640 }, { "ce_loss": 0.16139523684978485, "epoch": 1.2141427618412275, "step": 3640 }, { "distill_loss": 0.088740773499012, "epoch": 1.2141427618412275, "step": 3640 }, { "epoch": 1.2141427618412275, "ref_ce_loss": 0.11849240958690643, "step": 3640 }, { "epoch": 1.2141427618412275, "loss": 1.0855642557144165, "step": 3640 }, { "ce_loss": 0.2549758851528168, "epoch": 1.2141427618412275, "step": 3640 }, { "distill_loss": 0.13948114216327667, "epoch": 1.2141427618412275, "step": 3640 }, { "epoch": 1.2141427618412275, "ref_ce_loss": 0.12895409762859344, "step": 3640 }, { "epoch": 1.2141427618412275, "loss": 0.6072877645492554, "step": 3640 }, { "ce_loss": 0.19314055144786835, "epoch": 1.2141427618412275, "step": 3640 }, { "distill_loss": 0.12987002730369568, "epoch": 1.2141427618412275, "step": 3640 }, { "epoch": 1.2141427618412275, "ref_ce_loss": 0.2171475738286972, "step": 3640 }, { "epoch": 1.2141427618412275, "loss": 0.3850543797016144, "step": 3640 }, { "ce_loss": 0.15985210239887238, "epoch": 1.2141427618412275, "step": 3640 }, { "distill_loss": 0.10375294089317322, "epoch": 1.2141427618412275, "step": 3640 }, { "epoch": 1.2141427618412275, "ref_ce_loss": 0.12134460359811783, "step": 3640 }, { "epoch": 1.2174783188792528, "loss": 0.6544, "step": 3650 }, { "epoch": 1.2174783188792528, "grad_norm": 2.291261911392212, "step": 3650 }, { "epoch": 1.2174783188792528, "learning_rate": 0.0002884107813658312, "step": 3650 }, { "epoch": 1.2174783188792528, "loss": 0.8231579065322876, "step": 3650 }, { "ce_loss": 0.3279842436313629, "epoch": 1.2174783188792528, "step": 3650 }, { "distill_loss": 0.10889627039432526, "epoch": 1.2174783188792528, "step": 3650 }, { "epoch": 1.2174783188792528, "ref_ce_loss": 0.2368888109922409, "step": 3650 }, { "epoch": 1.2174783188792528, "loss": 0.570781946182251, "step": 3650 }, { "ce_loss": 0.2116650938987732, "epoch": 1.2174783188792528, "step": 3650 }, { "distill_loss": 0.12157993018627167, "epoch": 1.2174783188792528, "step": 3650 }, { "epoch": 1.2174783188792528, "ref_ce_loss": 0.15611954033374786, "step": 3650 }, { "epoch": 1.2174783188792528, "loss": 0.48231443762779236, "step": 3650 }, { "ce_loss": 0.18459239602088928, "epoch": 1.2174783188792528, "step": 3650 }, { "distill_loss": 0.0977061539888382, "epoch": 1.2174783188792528, "step": 3650 }, { "epoch": 1.2174783188792528, "ref_ce_loss": 0.15423721075057983, "step": 3650 }, { "epoch": 1.2174783188792528, "loss": 0.4680092930793762, "step": 3650 }, { "ce_loss": 0.1793879270553589, "epoch": 1.2174783188792528, "step": 3650 }, { "distill_loss": 0.1235269084572792, "epoch": 1.2174783188792528, "step": 3650 }, { "epoch": 1.2174783188792528, "ref_ce_loss": 0.0962749794125557, "step": 3650 }, { "epoch": 1.2208138759172782, "loss": 0.7043, "step": 3660 }, { "epoch": 1.2208138759172782, "grad_norm": 6.269075393676758, "step": 3660 }, { "epoch": 1.2208138759172782, "learning_rate": 0.0002883325826465432, "step": 3660 }, { "epoch": 1.2208138759172782, "loss": 0.5581228137016296, "step": 3660 }, { "ce_loss": 0.2508165240287781, "epoch": 1.2208138759172782, "step": 3660 }, { "distill_loss": 0.09761396795511246, "epoch": 1.2208138759172782, "step": 3660 }, { "epoch": 1.2208138759172782, "ref_ce_loss": 0.12982720136642456, "step": 3660 }, { "epoch": 1.2208138759172782, "loss": 0.6540179252624512, "step": 3660 }, { "ce_loss": 0.22440961003303528, "epoch": 1.2208138759172782, "step": 3660 }, { "distill_loss": 0.1306951344013214, "epoch": 1.2208138759172782, "step": 3660 }, { "epoch": 1.2208138759172782, "ref_ce_loss": 0.14553359150886536, "step": 3660 }, { "epoch": 1.2208138759172782, "loss": 1.250847339630127, "step": 3660 }, { "ce_loss": 0.27133795619010925, "epoch": 1.2208138759172782, "step": 3660 }, { "distill_loss": 0.12290221452713013, "epoch": 1.2208138759172782, "step": 3660 }, { "epoch": 1.2208138759172782, "ref_ce_loss": 0.12761349976062775, "step": 3660 }, { "epoch": 1.2208138759172782, "loss": 0.43164223432540894, "step": 3660 }, { "ce_loss": 0.13554264605045319, "epoch": 1.2208138759172782, "step": 3660 }, { "distill_loss": 0.10463345795869827, "epoch": 1.2208138759172782, "step": 3660 }, { "epoch": 1.2208138759172782, "ref_ce_loss": 0.10604510456323624, "step": 3660 }, { "epoch": 1.2241494329553035, "loss": 0.697, "step": 3670 }, { "epoch": 1.2241494329553035, "grad_norm": 3.5499510765075684, "step": 3670 }, { "epoch": 1.2241494329553035, "learning_rate": 0.00028825413166319217, "step": 3670 }, { "epoch": 1.2241494329553035, "loss": 0.5550094246864319, "step": 3670 }, { "ce_loss": 0.15072378516197205, "epoch": 1.2241494329553035, "step": 3670 }, { "distill_loss": 0.08958537876605988, "epoch": 1.2241494329553035, "step": 3670 }, { "epoch": 1.2241494329553035, "ref_ce_loss": 0.11560764163732529, "step": 3670 }, { "epoch": 1.2241494329553035, "loss": 0.6072015762329102, "step": 3670 }, { "ce_loss": 0.30362680554389954, "epoch": 1.2241494329553035, "step": 3670 }, { "distill_loss": 0.11939045041799545, "epoch": 1.2241494329553035, "step": 3670 }, { "epoch": 1.2241494329553035, "ref_ce_loss": 0.1375572234392166, "step": 3670 }, { "epoch": 1.2241494329553035, "loss": 0.503847599029541, "step": 3670 }, { "ce_loss": 0.2185611128807068, "epoch": 1.2241494329553035, "step": 3670 }, { "distill_loss": 0.10109249502420425, "epoch": 1.2241494329553035, "step": 3670 }, { "epoch": 1.2241494329553035, "ref_ce_loss": 0.13138023018836975, "step": 3670 }, { "epoch": 1.2241494329553035, "loss": 0.6444181203842163, "step": 3670 }, { "ce_loss": 0.2963375449180603, "epoch": 1.2241494329553035, "step": 3670 }, { "distill_loss": 0.17675013840198517, "epoch": 1.2241494329553035, "step": 3670 }, { "epoch": 1.2241494329553035, "ref_ce_loss": 0.17069223523139954, "step": 3670 }, { "epoch": 1.227484989993329, "loss": 0.6122, "step": 3680 }, { "epoch": 1.227484989993329, "grad_norm": 1.8238017559051514, "step": 3680 }, { "epoch": 1.227484989993329, "learning_rate": 0.0002881754285588418, "step": 3680 }, { "epoch": 1.227484989993329, "loss": 0.610914409160614, "step": 3680 }, { "ce_loss": 0.16786062717437744, "epoch": 1.227484989993329, "step": 3680 }, { "distill_loss": 0.09828974306583405, "epoch": 1.227484989993329, "step": 3680 }, { "epoch": 1.227484989993329, "ref_ce_loss": 0.13559795916080475, "step": 3680 }, { "epoch": 1.227484989993329, "loss": 1.0151851177215576, "step": 3680 }, { "ce_loss": 0.2395806908607483, "epoch": 1.227484989993329, "step": 3680 }, { "distill_loss": 0.11997424811124802, "epoch": 1.227484989993329, "step": 3680 }, { "epoch": 1.227484989993329, "ref_ce_loss": 0.1494312435388565, "step": 3680 }, { "epoch": 1.227484989993329, "loss": 0.7240346074104309, "step": 3680 }, { "ce_loss": 0.352152019739151, "epoch": 1.227484989993329, "step": 3680 }, { "distill_loss": 0.1361006647348404, "epoch": 1.227484989993329, "step": 3680 }, { "epoch": 1.227484989993329, "ref_ce_loss": 0.17786292731761932, "step": 3680 }, { "epoch": 1.227484989993329, "loss": 0.5180095434188843, "step": 3680 }, { "ce_loss": 0.2050534188747406, "epoch": 1.227484989993329, "step": 3680 }, { "distill_loss": 0.11590767651796341, "epoch": 1.227484989993329, "step": 3680 }, { "epoch": 1.227484989993329, "ref_ce_loss": 0.1965571939945221, "step": 3680 }, { "epoch": 1.2308205470313542, "loss": 0.6667, "step": 3690 }, { "epoch": 1.2308205470313542, "grad_norm": 3.9641623497009277, "step": 3690 }, { "epoch": 1.2308205470313542, "learning_rate": 0.00028809647347701546, "step": 3690 }, { "epoch": 1.2308205470313542, "loss": 1.1504862308502197, "step": 3690 }, { "ce_loss": 0.2964133024215698, "epoch": 1.2308205470313542, "step": 3690 }, { "distill_loss": 0.10304144769906998, "epoch": 1.2308205470313542, "step": 3690 }, { "epoch": 1.2308205470313542, "ref_ce_loss": 0.18209707736968994, "step": 3690 }, { "epoch": 1.2308205470313542, "loss": 0.8985886573791504, "step": 3690 }, { "ce_loss": 0.29773250222206116, "epoch": 1.2308205470313542, "step": 3690 }, { "distill_loss": 0.11341776698827744, "epoch": 1.2308205470313542, "step": 3690 }, { "epoch": 1.2308205470313542, "ref_ce_loss": 0.16153104603290558, "step": 3690 }, { "epoch": 1.2308205470313542, "loss": 0.42868906259536743, "step": 3690 }, { "ce_loss": 0.14353783428668976, "epoch": 1.2308205470313542, "step": 3690 }, { "distill_loss": 0.09262649714946747, "epoch": 1.2308205470313542, "step": 3690 }, { "epoch": 1.2308205470313542, "ref_ce_loss": 0.1247769147157669, "step": 3690 }, { "epoch": 1.2308205470313542, "loss": 0.47106653451919556, "step": 3690 }, { "ce_loss": 0.2131417840719223, "epoch": 1.2308205470313542, "step": 3690 }, { "distill_loss": 0.10240469127893448, "epoch": 1.2308205470313542, "step": 3690 }, { "epoch": 1.2308205470313542, "ref_ce_loss": 0.08614566177129745, "step": 3690 }, { "epoch": 1.2341561040693796, "loss": 0.6576, "step": 3700 }, { "epoch": 1.2341561040693796, "grad_norm": 2.76326584815979, "step": 3700 }, { "epoch": 1.2341561040693796, "learning_rate": 0.00028801726656169617, "step": 3700 }, { "epoch": 1.2341561040693796, "loss": 0.9501110315322876, "step": 3700 }, { "ce_loss": 0.25042182207107544, "epoch": 1.2341561040693796, "step": 3700 }, { "distill_loss": 0.11113158613443375, "epoch": 1.2341561040693796, "step": 3700 }, { "epoch": 1.2341561040693796, "ref_ce_loss": 0.18582938611507416, "step": 3700 }, { "epoch": 1.2341561040693796, "loss": 0.3989907503128052, "step": 3700 }, { "ce_loss": 0.1486864984035492, "epoch": 1.2341561040693796, "step": 3700 }, { "distill_loss": 0.12427866458892822, "epoch": 1.2341561040693796, "step": 3700 }, { "epoch": 1.2341561040693796, "ref_ce_loss": 0.08574719727039337, "step": 3700 }, { "epoch": 1.2341561040693796, "loss": 0.5246773958206177, "step": 3700 }, { "ce_loss": 0.18894904851913452, "epoch": 1.2341561040693796, "step": 3700 }, { "distill_loss": 0.12698686122894287, "epoch": 1.2341561040693796, "step": 3700 }, { "epoch": 1.2341561040693796, "ref_ce_loss": 0.16238951683044434, "step": 3700 }, { "epoch": 1.2341561040693796, "loss": 0.48624590039253235, "step": 3700 }, { "ce_loss": 0.15266580879688263, "epoch": 1.2341561040693796, "step": 3700 }, { "distill_loss": 0.11565395444631577, "epoch": 1.2341561040693796, "step": 3700 }, { "epoch": 1.2341561040693796, "ref_ce_loss": 0.12820059061050415, "step": 3700 }, { "epoch": 1.237491661107405, "loss": 0.6881, "step": 3710 }, { "epoch": 1.237491661107405, "grad_norm": 2.4743893146514893, "step": 3710 }, { "epoch": 1.237491661107405, "learning_rate": 0.00028793780795732603, "step": 3710 }, { "epoch": 1.237491661107405, "loss": 0.47350871562957764, "step": 3710 }, { "ce_loss": 0.2040337473154068, "epoch": 1.237491661107405, "step": 3710 }, { "distill_loss": 0.14777837693691254, "epoch": 1.237491661107405, "step": 3710 }, { "epoch": 1.237491661107405, "ref_ce_loss": 0.1179155632853508, "step": 3710 }, { "epoch": 1.237491661107405, "loss": 0.4797500967979431, "step": 3710 }, { "ce_loss": 0.1412224918603897, "epoch": 1.237491661107405, "step": 3710 }, { "distill_loss": 0.13309955596923828, "epoch": 1.237491661107405, "step": 3710 }, { "epoch": 1.237491661107405, "ref_ce_loss": 0.15007424354553223, "step": 3710 }, { "epoch": 1.237491661107405, "loss": 0.6380416750907898, "step": 3710 }, { "ce_loss": 0.11058314144611359, "epoch": 1.237491661107405, "step": 3710 }, { "distill_loss": 0.12416449189186096, "epoch": 1.237491661107405, "step": 3710 }, { "epoch": 1.237491661107405, "ref_ce_loss": 0.13631749153137207, "step": 3710 }, { "epoch": 1.237491661107405, "loss": 0.473675012588501, "step": 3710 }, { "ce_loss": 0.17507950961589813, "epoch": 1.237491661107405, "step": 3710 }, { "distill_loss": 0.1544097363948822, "epoch": 1.237491661107405, "step": 3710 }, { "epoch": 1.237491661107405, "ref_ce_loss": 0.14373698830604553, "step": 3710 }, { "epoch": 1.2408272181454303, "loss": 0.6932, "step": 3720 }, { "epoch": 1.2408272181454303, "grad_norm": 2.810713768005371, "step": 3720 }, { "epoch": 1.2408272181454303, "learning_rate": 0.0002878580978088062, "step": 3720 }, { "epoch": 1.2408272181454303, "loss": 0.4802871644496918, "step": 3720 }, { "ce_loss": 0.19279566407203674, "epoch": 1.2408272181454303, "step": 3720 }, { "distill_loss": 0.1619156002998352, "epoch": 1.2408272181454303, "step": 3720 }, { "epoch": 1.2408272181454303, "ref_ce_loss": 0.12544824182987213, "step": 3720 }, { "epoch": 1.2408272181454303, "loss": 0.7181475162506104, "step": 3720 }, { "ce_loss": 0.16339702904224396, "epoch": 1.2408272181454303, "step": 3720 }, { "distill_loss": 0.13173145055770874, "epoch": 1.2408272181454303, "step": 3720 }, { "epoch": 1.2408272181454303, "ref_ce_loss": 0.1465282142162323, "step": 3720 }, { "epoch": 1.2408272181454303, "loss": 0.46347326040267944, "step": 3720 }, { "ce_loss": 0.1841423511505127, "epoch": 1.2408272181454303, "step": 3720 }, { "distill_loss": 0.14019352197647095, "epoch": 1.2408272181454303, "step": 3720 }, { "epoch": 1.2408272181454303, "ref_ce_loss": 0.13906390964984894, "step": 3720 }, { "epoch": 1.2408272181454303, "loss": 0.7790138721466064, "step": 3720 }, { "ce_loss": 0.2417084127664566, "epoch": 1.2408272181454303, "step": 3720 }, { "distill_loss": 0.18298661708831787, "epoch": 1.2408272181454303, "step": 3720 }, { "epoch": 1.2408272181454303, "ref_ce_loss": 0.17605675756931305, "step": 3720 }, { "epoch": 1.2441627751834556, "loss": 0.6738, "step": 3730 }, { "epoch": 1.2441627751834556, "grad_norm": 2.822309732437134, "step": 3730 }, { "epoch": 1.2441627751834556, "learning_rate": 0.00028777813626149653, "step": 3730 }, { "epoch": 1.2441627751834556, "loss": 0.7054616212844849, "step": 3730 }, { "ce_loss": 0.26692935824394226, "epoch": 1.2441627751834556, "step": 3730 }, { "distill_loss": 0.17259365320205688, "epoch": 1.2441627751834556, "step": 3730 }, { "epoch": 1.2441627751834556, "ref_ce_loss": 0.13720785081386566, "step": 3730 }, { "epoch": 1.2441627751834556, "loss": 0.5728940963745117, "step": 3730 }, { "ce_loss": 0.24504394829273224, "epoch": 1.2441627751834556, "step": 3730 }, { "distill_loss": 0.15896917879581451, "epoch": 1.2441627751834556, "step": 3730 }, { "epoch": 1.2441627751834556, "ref_ce_loss": 0.16876274347305298, "step": 3730 }, { "epoch": 1.2441627751834556, "loss": 0.5301774144172668, "step": 3730 }, { "ce_loss": 0.2005605250597, "epoch": 1.2441627751834556, "step": 3730 }, { "distill_loss": 0.1347273737192154, "epoch": 1.2441627751834556, "step": 3730 }, { "epoch": 1.2441627751834556, "ref_ce_loss": 0.13233374059200287, "step": 3730 }, { "epoch": 1.2441627751834556, "loss": 0.832635760307312, "step": 3730 }, { "ce_loss": 0.285343736410141, "epoch": 1.2441627751834556, "step": 3730 }, { "distill_loss": 0.1432167887687683, "epoch": 1.2441627751834556, "step": 3730 }, { "epoch": 1.2441627751834556, "ref_ce_loss": 0.14712287485599518, "step": 3730 }, { "epoch": 1.247498332221481, "loss": 0.6895, "step": 3740 }, { "epoch": 1.247498332221481, "grad_norm": 2.4720518589019775, "step": 3740 }, { "epoch": 1.247498332221481, "learning_rate": 0.0002876979234612153, "step": 3740 }, { "epoch": 1.247498332221481, "loss": 0.4753130078315735, "step": 3740 }, { "ce_loss": 0.1841742992401123, "epoch": 1.247498332221481, "step": 3740 }, { "distill_loss": 0.13408738374710083, "epoch": 1.247498332221481, "step": 3740 }, { "epoch": 1.247498332221481, "ref_ce_loss": 0.15682215988636017, "step": 3740 }, { "epoch": 1.247498332221481, "loss": 0.5171338319778442, "step": 3740 }, { "ce_loss": 0.26918095350265503, "epoch": 1.247498332221481, "step": 3740 }, { "distill_loss": 0.13590781390666962, "epoch": 1.247498332221481, "step": 3740 }, { "epoch": 1.247498332221481, "ref_ce_loss": 0.11171242594718933, "step": 3740 }, { "epoch": 1.247498332221481, "loss": 0.34572017192840576, "step": 3740 }, { "ce_loss": 0.12438411265611649, "epoch": 1.247498332221481, "step": 3740 }, { "distill_loss": 0.10231960564851761, "epoch": 1.247498332221481, "step": 3740 }, { "epoch": 1.247498332221481, "ref_ce_loss": 0.11600019037723541, "step": 3740 }, { "epoch": 1.247498332221481, "loss": 0.4994368553161621, "step": 3740 }, { "ce_loss": 0.15896347165107727, "epoch": 1.247498332221481, "step": 3740 }, { "distill_loss": 0.1320989727973938, "epoch": 1.247498332221481, "step": 3740 }, { "epoch": 1.247498332221481, "ref_ce_loss": 0.1545882225036621, "step": 3740 }, { "epoch": 1.2508338892595063, "loss": 0.6165, "step": 3750 }, { "epoch": 1.2508338892595063, "grad_norm": 2.6057846546173096, "step": 3750 }, { "epoch": 1.2508338892595063, "learning_rate": 0.00028761745955423917, "step": 3750 }, { "epoch": 1.2508338892595063, "loss": 0.9892425537109375, "step": 3750 }, { "ce_loss": 0.2851608395576477, "epoch": 1.2508338892595063, "step": 3750 }, { "distill_loss": 0.12828153371810913, "epoch": 1.2508338892595063, "step": 3750 }, { "epoch": 1.2508338892595063, "ref_ce_loss": 0.17768558859825134, "step": 3750 }, { "epoch": 1.2508338892595063, "loss": 1.2168772220611572, "step": 3750 }, { "ce_loss": 0.2085263729095459, "epoch": 1.2508338892595063, "step": 3750 }, { "distill_loss": 0.10808371007442474, "epoch": 1.2508338892595063, "step": 3750 }, { "epoch": 1.2508338892595063, "ref_ce_loss": 0.12219865620136261, "step": 3750 }, { "epoch": 1.2508338892595063, "loss": 0.38408538699150085, "step": 3750 }, { "ce_loss": 0.17760694026947021, "epoch": 1.2508338892595063, "step": 3750 }, { "distill_loss": 0.11270315945148468, "epoch": 1.2508338892595063, "step": 3750 }, { "epoch": 1.2508338892595063, "ref_ce_loss": 0.0935731828212738, "step": 3750 }, { "epoch": 1.2508338892595063, "loss": 0.8690676093101501, "step": 3750 }, { "ce_loss": 0.3034827411174774, "epoch": 1.2508338892595063, "step": 3750 }, { "distill_loss": 0.12080539017915726, "epoch": 1.2508338892595063, "step": 3750 }, { "epoch": 1.2508338892595063, "ref_ce_loss": 0.2369447648525238, "step": 3750 }, { "epoch": 1.2541694462975317, "loss": 0.6438, "step": 3760 }, { "epoch": 1.2541694462975317, "grad_norm": 2.890160083770752, "step": 3760 }, { "epoch": 1.2541694462975317, "learning_rate": 0.00028753674468730246, "step": 3760 }, { "epoch": 1.2541694462975317, "loss": 0.5276728868484497, "step": 3760 }, { "ce_loss": 0.16678202152252197, "epoch": 1.2541694462975317, "step": 3760 }, { "distill_loss": 0.08398936688899994, "epoch": 1.2541694462975317, "step": 3760 }, { "epoch": 1.2541694462975317, "ref_ce_loss": 0.14039310812950134, "step": 3760 }, { "epoch": 1.2541694462975317, "loss": 0.43622809648513794, "step": 3760 }, { "ce_loss": 0.15578439831733704, "epoch": 1.2541694462975317, "step": 3760 }, { "distill_loss": 0.0966634452342987, "epoch": 1.2541694462975317, "step": 3760 }, { "epoch": 1.2541694462975317, "ref_ce_loss": 0.1068943589925766, "step": 3760 }, { "epoch": 1.2541694462975317, "loss": 0.7373557090759277, "step": 3760 }, { "ce_loss": 0.3678506910800934, "epoch": 1.2541694462975317, "step": 3760 }, { "distill_loss": 0.13448716700077057, "epoch": 1.2541694462975317, "step": 3760 }, { "epoch": 1.2541694462975317, "ref_ce_loss": 0.16213582456111908, "step": 3760 }, { "epoch": 1.2541694462975317, "loss": 0.3833318054676056, "step": 3760 }, { "ce_loss": 0.10126097500324249, "epoch": 1.2541694462975317, "step": 3760 }, { "distill_loss": 0.0806593969464302, "epoch": 1.2541694462975317, "step": 3760 }, { "epoch": 1.2541694462975317, "ref_ce_loss": 0.10641855001449585, "step": 3760 }, { "epoch": 1.257505003335557, "loss": 0.6276, "step": 3770 }, { "epoch": 1.257505003335557, "grad_norm": 3.256934404373169, "step": 3770 }, { "epoch": 1.257505003335557, "learning_rate": 0.00028745577900759724, "step": 3770 }, { "epoch": 1.257505003335557, "loss": 1.00767183303833, "step": 3770 }, { "ce_loss": 0.2344723790884018, "epoch": 1.257505003335557, "step": 3770 }, { "distill_loss": 0.157794788479805, "epoch": 1.257505003335557, "step": 3770 }, { "epoch": 1.257505003335557, "ref_ce_loss": 0.18107791244983673, "step": 3770 }, { "epoch": 1.257505003335557, "loss": 0.5176934599876404, "step": 3770 }, { "ce_loss": 0.21863307058811188, "epoch": 1.257505003335557, "step": 3770 }, { "distill_loss": 0.14407603442668915, "epoch": 1.257505003335557, "step": 3770 }, { "epoch": 1.257505003335557, "ref_ce_loss": 0.08990081399679184, "step": 3770 }, { "epoch": 1.257505003335557, "loss": 0.43721339106559753, "step": 3770 }, { "ce_loss": 0.17311523854732513, "epoch": 1.257505003335557, "step": 3770 }, { "distill_loss": 0.09926941245794296, "epoch": 1.257505003335557, "step": 3770 }, { "epoch": 1.257505003335557, "ref_ce_loss": 0.16452080011367798, "step": 3770 }, { "epoch": 1.257505003335557, "loss": 0.7104783058166504, "step": 3770 }, { "ce_loss": 0.18649962544441223, "epoch": 1.257505003335557, "step": 3770 }, { "distill_loss": 0.1250247359275818, "epoch": 1.257505003335557, "step": 3770 }, { "epoch": 1.257505003335557, "ref_ce_loss": 0.1523253321647644, "step": 3770 }, { "epoch": 1.2608405603735824, "loss": 0.663, "step": 3780 }, { "epoch": 1.2608405603735824, "grad_norm": 2.457021713256836, "step": 3780 }, { "epoch": 1.2608405603735824, "learning_rate": 0.000287374562662773, "step": 3780 }, { "epoch": 1.2608405603735824, "loss": 0.520882785320282, "step": 3780 }, { "ce_loss": 0.24153728783130646, "epoch": 1.2608405603735824, "step": 3780 }, { "distill_loss": 0.1284068077802658, "epoch": 1.2608405603735824, "step": 3780 }, { "epoch": 1.2608405603735824, "ref_ce_loss": 0.15089614689350128, "step": 3780 }, { "epoch": 1.2608405603735824, "loss": 0.5422986745834351, "step": 3780 }, { "ce_loss": 0.15094715356826782, "epoch": 1.2608405603735824, "step": 3780 }, { "distill_loss": 0.10879409313201904, "epoch": 1.2608405603735824, "step": 3780 }, { "epoch": 1.2608405603735824, "ref_ce_loss": 0.15774983167648315, "step": 3780 }, { "epoch": 1.2608405603735824, "loss": 0.5709801912307739, "step": 3780 }, { "ce_loss": 0.211181640625, "epoch": 1.2608405603735824, "step": 3780 }, { "distill_loss": 0.139474555850029, "epoch": 1.2608405603735824, "step": 3780 }, { "epoch": 1.2608405603735824, "ref_ce_loss": 0.1433981955051422, "step": 3780 }, { "epoch": 1.2608405603735824, "loss": 0.6591182947158813, "step": 3780 }, { "ce_loss": 0.25575539469718933, "epoch": 1.2608405603735824, "step": 3780 }, { "distill_loss": 0.13369643688201904, "epoch": 1.2608405603735824, "step": 3780 }, { "epoch": 1.2608405603735824, "ref_ce_loss": 0.17987275123596191, "step": 3780 }, { "epoch": 1.2641761174116077, "loss": 0.6722, "step": 3790 }, { "epoch": 1.2641761174116077, "grad_norm": 3.7740092277526855, "step": 3790 }, { "epoch": 1.2641761174116077, "learning_rate": 0.0002872930958009363, "step": 3790 }, { "epoch": 1.2641761174116077, "loss": 0.5643190145492554, "step": 3790 }, { "ce_loss": 0.2098139077425003, "epoch": 1.2641761174116077, "step": 3790 }, { "distill_loss": 0.11965808272361755, "epoch": 1.2641761174116077, "step": 3790 }, { "epoch": 1.2641761174116077, "ref_ce_loss": 0.18449269235134125, "step": 3790 }, { "epoch": 1.2641761174116077, "loss": 0.638534665107727, "step": 3790 }, { "ce_loss": 0.3083104193210602, "epoch": 1.2641761174116077, "step": 3790 }, { "distill_loss": 0.148521289229393, "epoch": 1.2641761174116077, "step": 3790 }, { "epoch": 1.2641761174116077, "ref_ce_loss": 0.18149085342884064, "step": 3790 }, { "epoch": 1.2641761174116077, "loss": 0.7080193758010864, "step": 3790 }, { "ce_loss": 0.20180247724056244, "epoch": 1.2641761174116077, "step": 3790 }, { "distill_loss": 0.13183492422103882, "epoch": 1.2641761174116077, "step": 3790 }, { "epoch": 1.2641761174116077, "ref_ce_loss": 0.12661731243133545, "step": 3790 }, { "epoch": 1.2641761174116077, "loss": 0.7978255748748779, "step": 3790 }, { "ce_loss": 0.3292304575443268, "epoch": 1.2641761174116077, "step": 3790 }, { "distill_loss": 0.15548944473266602, "epoch": 1.2641761174116077, "step": 3790 }, { "epoch": 1.2641761174116077, "ref_ce_loss": 0.17226652801036835, "step": 3790 }, { "epoch": 1.267511674449633, "loss": 0.6626, "step": 3800 }, { "epoch": 1.267511674449633, "grad_norm": 2.5062668323516846, "step": 3800 }, { "epoch": 1.267511674449633, "learning_rate": 0.0002872113785706506, "step": 3800 }, { "epoch": 1.267511674449633, "loss": 0.6491740942001343, "step": 3800 }, { "ce_loss": 0.23980125784873962, "epoch": 1.267511674449633, "step": 3800 }, { "distill_loss": 0.12023845314979553, "epoch": 1.267511674449633, "step": 3800 }, { "epoch": 1.267511674449633, "ref_ce_loss": 0.12148669362068176, "step": 3800 }, { "epoch": 1.267511674449633, "loss": 0.6998868584632874, "step": 3800 }, { "ce_loss": 0.24571934342384338, "epoch": 1.267511674449633, "step": 3800 }, { "distill_loss": 0.13267642259597778, "epoch": 1.267511674449633, "step": 3800 }, { "epoch": 1.267511674449633, "ref_ce_loss": 0.12511129677295685, "step": 3800 }, { "epoch": 1.267511674449633, "loss": 0.3711976110935211, "step": 3800 }, { "ce_loss": 0.16206589341163635, "epoch": 1.267511674449633, "step": 3800 }, { "distill_loss": 0.11592836678028107, "epoch": 1.267511674449633, "step": 3800 }, { "epoch": 1.267511674449633, "ref_ce_loss": 0.0925321877002716, "step": 3800 }, { "epoch": 1.267511674449633, "loss": 0.34810692071914673, "step": 3800 }, { "ce_loss": 0.14879584312438965, "epoch": 1.267511674449633, "step": 3800 }, { "distill_loss": 0.10272634029388428, "epoch": 1.267511674449633, "step": 3800 }, { "epoch": 1.267511674449633, "ref_ce_loss": 0.09614356607198715, "step": 3800 }, { "epoch": 1.2708472314876584, "loss": 0.6808, "step": 3810 }, { "epoch": 1.2708472314876584, "grad_norm": 2.2756903171539307, "step": 3810 }, { "epoch": 1.2708472314876584, "learning_rate": 0.0002871294111209358, "step": 3810 }, { "epoch": 1.2708472314876584, "loss": 0.6024037599563599, "step": 3810 }, { "ce_loss": 0.16057908535003662, "epoch": 1.2708472314876584, "step": 3810 }, { "distill_loss": 0.13251294195652008, "epoch": 1.2708472314876584, "step": 3810 }, { "epoch": 1.2708472314876584, "ref_ce_loss": 0.14679312705993652, "step": 3810 }, { "epoch": 1.2708472314876584, "loss": 1.074276328086853, "step": 3810 }, { "ce_loss": 0.2896794378757477, "epoch": 1.2708472314876584, "step": 3810 }, { "distill_loss": 0.148069828748703, "epoch": 1.2708472314876584, "step": 3810 }, { "epoch": 1.2708472314876584, "ref_ce_loss": 0.1660357266664505, "step": 3810 }, { "epoch": 1.2708472314876584, "loss": 0.6567397713661194, "step": 3810 }, { "ce_loss": 0.22404265403747559, "epoch": 1.2708472314876584, "step": 3810 }, { "distill_loss": 0.11648007482290268, "epoch": 1.2708472314876584, "step": 3810 }, { "epoch": 1.2708472314876584, "ref_ce_loss": 0.1753883957862854, "step": 3810 }, { "epoch": 1.2708472314876584, "loss": 0.8125191330909729, "step": 3810 }, { "ce_loss": 0.32670843601226807, "epoch": 1.2708472314876584, "step": 3810 }, { "distill_loss": 0.1736506074666977, "epoch": 1.2708472314876584, "step": 3810 }, { "epoch": 1.2708472314876584, "ref_ce_loss": 0.147975355386734, "step": 3810 }, { "epoch": 1.2741827885256838, "loss": 0.6292, "step": 3820 }, { "epoch": 1.2741827885256838, "grad_norm": 5.134918689727783, "step": 3820 }, { "epoch": 1.2741827885256838, "learning_rate": 0.0002870471936012683, "step": 3820 }, { "epoch": 1.2741827885256838, "loss": 0.9305306077003479, "step": 3820 }, { "ce_loss": 0.22662502527236938, "epoch": 1.2741827885256838, "step": 3820 }, { "distill_loss": 0.15132461488246918, "epoch": 1.2741827885256838, "step": 3820 }, { "epoch": 1.2741827885256838, "ref_ce_loss": 0.1143263652920723, "step": 3820 }, { "epoch": 1.2741827885256838, "loss": 0.7782340049743652, "step": 3820 }, { "ce_loss": 0.2704147398471832, "epoch": 1.2741827885256838, "step": 3820 }, { "distill_loss": 0.1646704226732254, "epoch": 1.2741827885256838, "step": 3820 }, { "epoch": 1.2741827885256838, "ref_ce_loss": 0.14147979021072388, "step": 3820 }, { "epoch": 1.2741827885256838, "loss": 0.6947014331817627, "step": 3820 }, { "ce_loss": 0.29456502199172974, "epoch": 1.2741827885256838, "step": 3820 }, { "distill_loss": 0.1552099734544754, "epoch": 1.2741827885256838, "step": 3820 }, { "epoch": 1.2741827885256838, "ref_ce_loss": 0.15763463079929352, "step": 3820 }, { "epoch": 1.2741827885256838, "loss": 0.43103697896003723, "step": 3820 }, { "ce_loss": 0.13802245259284973, "epoch": 1.2741827885256838, "step": 3820 }, { "distill_loss": 0.1441994160413742, "epoch": 1.2741827885256838, "step": 3820 }, { "epoch": 1.2741827885256838, "ref_ce_loss": 0.14861568808555603, "step": 3820 }, { "epoch": 1.2775183455637091, "loss": 0.7171, "step": 3830 }, { "epoch": 1.2775183455637091, "grad_norm": 2.4478530883789062, "step": 3830 }, { "epoch": 1.2775183455637091, "learning_rate": 0.0002869647261615803, "step": 3830 }, { "epoch": 1.2775183455637091, "loss": 0.46987560391426086, "step": 3830 }, { "ce_loss": 0.1733369082212448, "epoch": 1.2775183455637091, "step": 3830 }, { "distill_loss": 0.10979942977428436, "epoch": 1.2775183455637091, "step": 3830 }, { "epoch": 1.2775183455637091, "ref_ce_loss": 0.15175464749336243, "step": 3830 }, { "epoch": 1.2775183455637091, "loss": 1.4397883415222168, "step": 3830 }, { "ce_loss": 0.4084470570087433, "epoch": 1.2775183455637091, "step": 3830 }, { "distill_loss": 0.1578616499900818, "epoch": 1.2775183455637091, "step": 3830 }, { "epoch": 1.2775183455637091, "ref_ce_loss": 0.15876798331737518, "step": 3830 }, { "epoch": 1.2775183455637091, "loss": 0.6761387586593628, "step": 3830 }, { "ce_loss": 0.26621678471565247, "epoch": 1.2775183455637091, "step": 3830 }, { "distill_loss": 0.18939264118671417, "epoch": 1.2775183455637091, "step": 3830 }, { "epoch": 1.2775183455637091, "ref_ce_loss": 0.13820181787014008, "step": 3830 }, { "epoch": 1.2775183455637091, "loss": 0.6198230981826782, "step": 3830 }, { "ce_loss": 0.2509312927722931, "epoch": 1.2775183455637091, "step": 3830 }, { "distill_loss": 0.1535424441099167, "epoch": 1.2775183455637091, "step": 3830 }, { "epoch": 1.2775183455637091, "ref_ce_loss": 0.1641911268234253, "step": 3830 }, { "epoch": 1.2808539026017345, "loss": 0.6643, "step": 3840 }, { "epoch": 1.2808539026017345, "grad_norm": 2.382905960083008, "step": 3840 }, { "epoch": 1.2808539026017345, "learning_rate": 0.00028688200895226, "step": 3840 }, { "epoch": 1.2808539026017345, "loss": 0.6250182390213013, "step": 3840 }, { "ce_loss": 0.20252063870429993, "epoch": 1.2808539026017345, "step": 3840 }, { "distill_loss": 0.14306099712848663, "epoch": 1.2808539026017345, "step": 3840 }, { "epoch": 1.2808539026017345, "ref_ce_loss": 0.1793094277381897, "step": 3840 }, { "epoch": 1.2808539026017345, "loss": 0.47006261348724365, "step": 3840 }, { "ce_loss": 0.15936064720153809, "epoch": 1.2808539026017345, "step": 3840 }, { "distill_loss": 0.11916495114564896, "epoch": 1.2808539026017345, "step": 3840 }, { "epoch": 1.2808539026017345, "ref_ce_loss": 0.11021065711975098, "step": 3840 }, { "epoch": 1.2808539026017345, "loss": 0.47605329751968384, "step": 3840 }, { "ce_loss": 0.17115725576877594, "epoch": 1.2808539026017345, "step": 3840 }, { "distill_loss": 0.13069213926792145, "epoch": 1.2808539026017345, "step": 3840 }, { "epoch": 1.2808539026017345, "ref_ce_loss": 0.10102272033691406, "step": 3840 }, { "epoch": 1.2808539026017345, "loss": 0.6017922163009644, "step": 3840 }, { "ce_loss": 0.19884739816188812, "epoch": 1.2808539026017345, "step": 3840 }, { "distill_loss": 0.24015085399150848, "epoch": 1.2808539026017345, "step": 3840 }, { "epoch": 1.2808539026017345, "ref_ce_loss": 0.08779125660657883, "step": 3840 }, { "epoch": 1.2841894596397598, "loss": 0.6084, "step": 3850 }, { "epoch": 1.2841894596397598, "grad_norm": 2.060270071029663, "step": 3850 }, { "epoch": 1.2841894596397598, "learning_rate": 0.00028679904212415097, "step": 3850 }, { "epoch": 1.2841894596397598, "loss": 0.5458661913871765, "step": 3850 }, { "ce_loss": 0.2180897742509842, "epoch": 1.2841894596397598, "step": 3850 }, { "distill_loss": 0.10629566013813019, "epoch": 1.2841894596397598, "step": 3850 }, { "epoch": 1.2841894596397598, "ref_ce_loss": 0.15160155296325684, "step": 3850 }, { "epoch": 1.2841894596397598, "loss": 0.8067355751991272, "step": 3850 }, { "ce_loss": 0.39867109060287476, "epoch": 1.2841894596397598, "step": 3850 }, { "distill_loss": 0.14299823343753815, "epoch": 1.2841894596397598, "step": 3850 }, { "epoch": 1.2841894596397598, "ref_ce_loss": 0.2649121880531311, "step": 3850 }, { "epoch": 1.2841894596397598, "loss": 0.697068989276886, "step": 3850 }, { "ce_loss": 0.19222131371498108, "epoch": 1.2841894596397598, "step": 3850 }, { "distill_loss": 0.11827998608350754, "epoch": 1.2841894596397598, "step": 3850 }, { "epoch": 1.2841894596397598, "ref_ce_loss": 0.15390853583812714, "step": 3850 }, { "epoch": 1.2841894596397598, "loss": 0.47433117032051086, "step": 3850 }, { "ce_loss": 0.1541450470685959, "epoch": 1.2841894596397598, "step": 3850 }, { "distill_loss": 0.10531582683324814, "epoch": 1.2841894596397598, "step": 3850 }, { "epoch": 1.2841894596397598, "ref_ce_loss": 0.13863231241703033, "step": 3850 }, { "epoch": 1.2875250166777852, "loss": 0.6328, "step": 3860 }, { "epoch": 1.2875250166777852, "grad_norm": 2.1689889430999756, "step": 3860 }, { "epoch": 1.2875250166777852, "learning_rate": 0.00028671582582855186, "step": 3860 }, { "epoch": 1.2875250166777852, "loss": 0.5351623296737671, "step": 3860 }, { "ce_loss": 0.11756846308708191, "epoch": 1.2875250166777852, "step": 3860 }, { "distill_loss": 0.1291414052248001, "epoch": 1.2875250166777852, "step": 3860 }, { "epoch": 1.2875250166777852, "ref_ce_loss": 0.10097652673721313, "step": 3860 }, { "epoch": 1.2875250166777852, "loss": 0.988185703754425, "step": 3860 }, { "ce_loss": 0.2319861650466919, "epoch": 1.2875250166777852, "step": 3860 }, { "distill_loss": 0.1574300080537796, "epoch": 1.2875250166777852, "step": 3860 }, { "epoch": 1.2875250166777852, "ref_ce_loss": 0.18644386529922485, "step": 3860 }, { "epoch": 1.2875250166777852, "loss": 0.729952335357666, "step": 3860 }, { "ce_loss": 0.30251288414001465, "epoch": 1.2875250166777852, "step": 3860 }, { "distill_loss": 0.16945448517799377, "epoch": 1.2875250166777852, "step": 3860 }, { "epoch": 1.2875250166777852, "ref_ce_loss": 0.2575605809688568, "step": 3860 }, { "epoch": 1.2875250166777852, "loss": 0.4808621108531952, "step": 3860 }, { "ce_loss": 0.17800524830818176, "epoch": 1.2875250166777852, "step": 3860 }, { "distill_loss": 0.14971697330474854, "epoch": 1.2875250166777852, "step": 3860 }, { "epoch": 1.2875250166777852, "ref_ce_loss": 0.10496334731578827, "step": 3860 }, { "epoch": 1.2908605737158105, "loss": 0.6641, "step": 3870 }, { "epoch": 1.2908605737158105, "grad_norm": 2.5105247497558594, "step": 3870 }, { "epoch": 1.2908605737158105, "learning_rate": 0.00028663236021721645, "step": 3870 }, { "epoch": 1.2908605737158105, "loss": 0.4470581114292145, "step": 3870 }, { "ce_loss": 0.18677105009555817, "epoch": 1.2908605737158105, "step": 3870 }, { "distill_loss": 0.08762294054031372, "epoch": 1.2908605737158105, "step": 3870 }, { "epoch": 1.2908605737158105, "ref_ce_loss": 0.1723141223192215, "step": 3870 }, { "epoch": 1.2908605737158105, "loss": 0.6644189357757568, "step": 3870 }, { "ce_loss": 0.35353413224220276, "epoch": 1.2908605737158105, "step": 3870 }, { "distill_loss": 0.14174126088619232, "epoch": 1.2908605737158105, "step": 3870 }, { "epoch": 1.2908605737158105, "ref_ce_loss": 0.1683140993118286, "step": 3870 }, { "epoch": 1.2908605737158105, "loss": 0.5526446104049683, "step": 3870 }, { "ce_loss": 0.16454748809337616, "epoch": 1.2908605737158105, "step": 3870 }, { "distill_loss": 0.11832288652658463, "epoch": 1.2908605737158105, "step": 3870 }, { "epoch": 1.2908605737158105, "ref_ce_loss": 0.13812904059886932, "step": 3870 }, { "epoch": 1.2908605737158105, "loss": 1.190387487411499, "step": 3870 }, { "ce_loss": 0.24219068884849548, "epoch": 1.2908605737158105, "step": 3870 }, { "distill_loss": 0.12967538833618164, "epoch": 1.2908605737158105, "step": 3870 }, { "epoch": 1.2908605737158105, "ref_ce_loss": 0.19826941192150116, "step": 3870 }, { "epoch": 1.2941961307538359, "loss": 0.6134, "step": 3880 }, { "epoch": 1.2941961307538359, "grad_norm": 2.6475019454956055, "step": 3880 }, { "epoch": 1.2941961307538359, "learning_rate": 0.00028654864544235307, "step": 3880 }, { "epoch": 1.2941961307538359, "loss": 0.4416970908641815, "step": 3880 }, { "ce_loss": 0.1246199682354927, "epoch": 1.2941961307538359, "step": 3880 }, { "distill_loss": 0.12249705195426941, "epoch": 1.2941961307538359, "step": 3880 }, { "epoch": 1.2941961307538359, "ref_ce_loss": 0.12978720664978027, "step": 3880 }, { "epoch": 1.2941961307538359, "loss": 0.48672401905059814, "step": 3880 }, { "ce_loss": 0.16806533932685852, "epoch": 1.2941961307538359, "step": 3880 }, { "distill_loss": 0.13379566371440887, "epoch": 1.2941961307538359, "step": 3880 }, { "epoch": 1.2941961307538359, "ref_ce_loss": 0.1297888159751892, "step": 3880 }, { "epoch": 1.2941961307538359, "loss": 0.7681386470794678, "step": 3880 }, { "ce_loss": 0.1779935210943222, "epoch": 1.2941961307538359, "step": 3880 }, { "distill_loss": 0.18026070296764374, "epoch": 1.2941961307538359, "step": 3880 }, { "epoch": 1.2941961307538359, "ref_ce_loss": 0.12299671024084091, "step": 3880 }, { "epoch": 1.2941961307538359, "loss": 1.157135009765625, "step": 3880 }, { "ce_loss": 0.31688815355300903, "epoch": 1.2941961307538359, "step": 3880 }, { "distill_loss": 0.1663898229598999, "epoch": 1.2941961307538359, "step": 3880 }, { "epoch": 1.2941961307538359, "ref_ce_loss": 0.14574475586414337, "step": 3880 }, { "epoch": 1.2975316877918612, "loss": 0.6617, "step": 3890 }, { "epoch": 1.2975316877918612, "grad_norm": 2.3238894939422607, "step": 3890 }, { "epoch": 1.2975316877918612, "learning_rate": 0.00028646468165662443, "step": 3890 }, { "epoch": 1.2975316877918612, "loss": 0.45971766114234924, "step": 3890 }, { "ce_loss": 0.20718860626220703, "epoch": 1.2975316877918612, "step": 3890 }, { "distill_loss": 0.10672736167907715, "epoch": 1.2975316877918612, "step": 3890 }, { "epoch": 1.2975316877918612, "ref_ce_loss": 0.14458438754081726, "step": 3890 }, { "epoch": 1.2975316877918612, "loss": 0.4675193130970001, "step": 3890 }, { "ce_loss": 0.17066477239131927, "epoch": 1.2975316877918612, "step": 3890 }, { "distill_loss": 0.10979650914669037, "epoch": 1.2975316877918612, "step": 3890 }, { "epoch": 1.2975316877918612, "ref_ce_loss": 0.13548187911510468, "step": 3890 }, { "epoch": 1.2975316877918612, "loss": 0.8037819862365723, "step": 3890 }, { "ce_loss": 0.26504796743392944, "epoch": 1.2975316877918612, "step": 3890 }, { "distill_loss": 0.14809170365333557, "epoch": 1.2975316877918612, "step": 3890 }, { "epoch": 1.2975316877918612, "ref_ce_loss": 0.16031919419765472, "step": 3890 }, { "epoch": 1.2975316877918612, "loss": 0.4144549071788788, "step": 3890 }, { "ce_loss": 0.18542304635047913, "epoch": 1.2975316877918612, "step": 3890 }, { "distill_loss": 0.11288897693157196, "epoch": 1.2975316877918612, "step": 3890 }, { "epoch": 1.2975316877918612, "ref_ce_loss": 0.11544924974441528, "step": 3890 }, { "epoch": 1.3008672448298866, "loss": 0.6172, "step": 3900 }, { "epoch": 1.3008672448298866, "grad_norm": 2.562744379043579, "step": 3900 }, { "epoch": 1.3008672448298866, "learning_rate": 0.0002863804690131474, "step": 3900 }, { "epoch": 1.3008672448298866, "loss": 0.6487498879432678, "step": 3900 }, { "ce_loss": 0.23641221225261688, "epoch": 1.3008672448298866, "step": 3900 }, { "distill_loss": 0.13633979856967926, "epoch": 1.3008672448298866, "step": 3900 }, { "epoch": 1.3008672448298866, "ref_ce_loss": 0.21698123216629028, "step": 3900 }, { "epoch": 1.3008672448298866, "loss": 0.7199804782867432, "step": 3900 }, { "ce_loss": 0.22160635888576508, "epoch": 1.3008672448298866, "step": 3900 }, { "distill_loss": 0.14556357264518738, "epoch": 1.3008672448298866, "step": 3900 }, { "epoch": 1.3008672448298866, "ref_ce_loss": 0.16044962406158447, "step": 3900 }, { "epoch": 1.3008672448298866, "loss": 0.3134487271308899, "step": 3900 }, { "ce_loss": 0.10460028797388077, "epoch": 1.3008672448298866, "step": 3900 }, { "distill_loss": 0.124151811003685, "epoch": 1.3008672448298866, "step": 3900 }, { "epoch": 1.3008672448298866, "ref_ce_loss": 0.08378157019615173, "step": 3900 }, { "epoch": 1.3008672448298866, "loss": 1.0430158376693726, "step": 3900 }, { "ce_loss": 0.20616380870342255, "epoch": 1.3008672448298866, "step": 3900 }, { "distill_loss": 0.10245034098625183, "epoch": 1.3008672448298866, "step": 3900 }, { "epoch": 1.3008672448298866, "ref_ce_loss": 0.13835114240646362, "step": 3900 }, { "epoch": 1.304202801867912, "loss": 0.6747, "step": 3910 }, { "epoch": 1.304202801867912, "grad_norm": 3.7162861824035645, "step": 3910 }, { "epoch": 1.304202801867912, "learning_rate": 0.00028629600766549266, "step": 3910 }, { "epoch": 1.304202801867912, "loss": 0.5505048036575317, "step": 3910 }, { "ce_loss": 0.2205231487751007, "epoch": 1.304202801867912, "step": 3910 }, { "distill_loss": 0.13492049276828766, "epoch": 1.304202801867912, "step": 3910 }, { "epoch": 1.304202801867912, "ref_ce_loss": 0.130891352891922, "step": 3910 }, { "epoch": 1.304202801867912, "loss": 0.622995138168335, "step": 3910 }, { "ce_loss": 0.23485012352466583, "epoch": 1.304202801867912, "step": 3910 }, { "distill_loss": 0.16180278360843658, "epoch": 1.304202801867912, "step": 3910 }, { "epoch": 1.304202801867912, "ref_ce_loss": 0.22551412880420685, "step": 3910 }, { "epoch": 1.304202801867912, "loss": 0.4668801426887512, "step": 3910 }, { "ce_loss": 0.19062143564224243, "epoch": 1.304202801867912, "step": 3910 }, { "distill_loss": 0.12799620628356934, "epoch": 1.304202801867912, "step": 3910 }, { "epoch": 1.304202801867912, "ref_ce_loss": 0.1471155434846878, "step": 3910 }, { "epoch": 1.304202801867912, "loss": 0.8399578332901001, "step": 3910 }, { "ce_loss": 0.16672414541244507, "epoch": 1.304202801867912, "step": 3910 }, { "distill_loss": 0.13798189163208008, "epoch": 1.304202801867912, "step": 3910 }, { "epoch": 1.304202801867912, "ref_ce_loss": 0.1093071699142456, "step": 3910 }, { "epoch": 1.3075383589059373, "loss": 0.6499, "step": 3920 }, { "epoch": 1.3075383589059373, "grad_norm": 4.548922538757324, "step": 3920 }, { "epoch": 1.3075383589059373, "learning_rate": 0.00028621129776768424, "step": 3920 }, { "epoch": 1.3075383589059373, "loss": 0.6969051361083984, "step": 3920 }, { "ce_loss": 0.2939198613166809, "epoch": 1.3075383589059373, "step": 3920 }, { "distill_loss": 0.1349041759967804, "epoch": 1.3075383589059373, "step": 3920 }, { "epoch": 1.3075383589059373, "ref_ce_loss": 0.1603991836309433, "step": 3920 }, { "epoch": 1.3075383589059373, "loss": 0.665176510810852, "step": 3920 }, { "ce_loss": 0.14564870297908783, "epoch": 1.3075383589059373, "step": 3920 }, { "distill_loss": 0.09071095287799835, "epoch": 1.3075383589059373, "step": 3920 }, { "epoch": 1.3075383589059373, "ref_ce_loss": 0.12003415077924728, "step": 3920 }, { "epoch": 1.3075383589059373, "loss": 0.5591284036636353, "step": 3920 }, { "ce_loss": 0.2728961706161499, "epoch": 1.3075383589059373, "step": 3920 }, { "distill_loss": 0.11606838554143906, "epoch": 1.3075383589059373, "step": 3920 }, { "epoch": 1.3075383589059373, "ref_ce_loss": 0.12034892290830612, "step": 3920 }, { "epoch": 1.3075383589059373, "loss": 0.6922830939292908, "step": 3920 }, { "ce_loss": 0.17354288697242737, "epoch": 1.3075383589059373, "step": 3920 }, { "distill_loss": 0.10280630737543106, "epoch": 1.3075383589059373, "step": 3920 }, { "epoch": 1.3075383589059373, "ref_ce_loss": 0.1029202789068222, "step": 3920 }, { "epoch": 1.3108739159439626, "loss": 0.5879, "step": 3930 }, { "epoch": 1.3108739159439626, "grad_norm": 1.9159053564071655, "step": 3930 }, { "epoch": 1.3108739159439626, "learning_rate": 0.0002861263394741996, "step": 3930 }, { "epoch": 1.3108739159439626, "loss": 0.5317827463150024, "step": 3930 }, { "ce_loss": 0.22766584157943726, "epoch": 1.3108739159439626, "step": 3930 }, { "distill_loss": 0.09339602291584015, "epoch": 1.3108739159439626, "step": 3930 }, { "epoch": 1.3108739159439626, "ref_ce_loss": 0.12791801989078522, "step": 3930 }, { "epoch": 1.3108739159439626, "loss": 0.35215145349502563, "step": 3930 }, { "ce_loss": 0.09111224114894867, "epoch": 1.3108739159439626, "step": 3930 }, { "distill_loss": 0.09845034033060074, "epoch": 1.3108739159439626, "step": 3930 }, { "epoch": 1.3108739159439626, "ref_ce_loss": 0.11704370379447937, "step": 3930 }, { "epoch": 1.3108739159439626, "loss": 0.7495036125183105, "step": 3930 }, { "ce_loss": 0.11601284891366959, "epoch": 1.3108739159439626, "step": 3930 }, { "distill_loss": 0.14619024097919464, "epoch": 1.3108739159439626, "step": 3930 }, { "epoch": 1.3108739159439626, "ref_ce_loss": 0.09743039309978485, "step": 3930 }, { "epoch": 1.3108739159439626, "loss": 0.6388373374938965, "step": 3930 }, { "ce_loss": 0.14739064872264862, "epoch": 1.3108739159439626, "step": 3930 }, { "distill_loss": 0.11072908341884613, "epoch": 1.3108739159439626, "step": 3930 }, { "epoch": 1.3108739159439626, "ref_ce_loss": 0.13757041096687317, "step": 3930 }, { "epoch": 1.314209472981988, "loss": 0.6759, "step": 3940 }, { "epoch": 1.314209472981988, "grad_norm": 2.8044283390045166, "step": 3940 }, { "epoch": 1.314209472981988, "learning_rate": 0.00028604113293996937, "step": 3940 }, { "epoch": 1.314209472981988, "loss": 0.3935573399066925, "step": 3940 }, { "ce_loss": 0.1724274754524231, "epoch": 1.314209472981988, "step": 3940 }, { "distill_loss": 0.10461033880710602, "epoch": 1.314209472981988, "step": 3940 }, { "epoch": 1.314209472981988, "ref_ce_loss": 0.11499915271997452, "step": 3940 }, { "epoch": 1.314209472981988, "loss": 0.5409625172615051, "step": 3940 }, { "ce_loss": 0.19719979166984558, "epoch": 1.314209472981988, "step": 3940 }, { "distill_loss": 0.12273137271404266, "epoch": 1.314209472981988, "step": 3940 }, { "epoch": 1.314209472981988, "ref_ce_loss": 0.11761221289634705, "step": 3940 }, { "epoch": 1.314209472981988, "loss": 0.6423748731613159, "step": 3940 }, { "ce_loss": 0.18682481348514557, "epoch": 1.314209472981988, "step": 3940 }, { "distill_loss": 0.11395197361707687, "epoch": 1.314209472981988, "step": 3940 }, { "epoch": 1.314209472981988, "ref_ce_loss": 0.09778361767530441, "step": 3940 }, { "epoch": 1.314209472981988, "loss": 0.6339865922927856, "step": 3940 }, { "ce_loss": 0.15020598471164703, "epoch": 1.314209472981988, "step": 3940 }, { "distill_loss": 0.11978481709957123, "epoch": 1.314209472981988, "step": 3940 }, { "epoch": 1.314209472981988, "ref_ce_loss": 0.16673775017261505, "step": 3940 }, { "epoch": 1.3175450300200133, "loss": 0.6849, "step": 3950 }, { "epoch": 1.3175450300200133, "grad_norm": 7.299792766571045, "step": 3950 }, { "epoch": 1.3175450300200133, "learning_rate": 0.0002859556783203764, "step": 3950 }, { "epoch": 1.3175450300200133, "loss": 0.6421574950218201, "step": 3950 }, { "ce_loss": 0.2580649256706238, "epoch": 1.3175450300200133, "step": 3950 }, { "distill_loss": 0.15140002965927124, "epoch": 1.3175450300200133, "step": 3950 }, { "epoch": 1.3175450300200133, "ref_ce_loss": 0.1874542534351349, "step": 3950 }, { "epoch": 1.3175450300200133, "loss": 0.6344290375709534, "step": 3950 }, { "ce_loss": 0.22702381014823914, "epoch": 1.3175450300200133, "step": 3950 }, { "distill_loss": 0.17865008115768433, "epoch": 1.3175450300200133, "step": 3950 }, { "epoch": 1.3175450300200133, "ref_ce_loss": 0.08265173435211182, "step": 3950 }, { "epoch": 1.3175450300200133, "loss": 0.9704930782318115, "step": 3950 }, { "ce_loss": 0.24519315361976624, "epoch": 1.3175450300200133, "step": 3950 }, { "distill_loss": 0.10518443584442139, "epoch": 1.3175450300200133, "step": 3950 }, { "epoch": 1.3175450300200133, "ref_ce_loss": 0.15663664042949677, "step": 3950 }, { "epoch": 1.3175450300200133, "loss": 0.48585355281829834, "step": 3950 }, { "ce_loss": 0.1515163630247116, "epoch": 1.3175450300200133, "step": 3950 }, { "distill_loss": 0.17025715112686157, "epoch": 1.3175450300200133, "step": 3950 }, { "epoch": 1.3175450300200133, "ref_ce_loss": 0.11852209270000458, "step": 3950 }, { "epoch": 1.3208805870580387, "loss": 0.7196, "step": 3960 }, { "epoch": 1.3208805870580387, "grad_norm": 2.706219434738159, "step": 3960 }, { "epoch": 1.3208805870580387, "learning_rate": 0.00028586997577125634, "step": 3960 }, { "epoch": 1.3208805870580387, "loss": 0.3871728777885437, "step": 3960 }, { "ce_loss": 0.11327257752418518, "epoch": 1.3208805870580387, "step": 3960 }, { "distill_loss": 0.1323593109846115, "epoch": 1.3208805870580387, "step": 3960 }, { "epoch": 1.3208805870580387, "ref_ce_loss": 0.07896266877651215, "step": 3960 }, { "epoch": 1.3208805870580387, "loss": 0.49185916781425476, "step": 3960 }, { "ce_loss": 0.20270958542823792, "epoch": 1.3208805870580387, "step": 3960 }, { "distill_loss": 0.12782743573188782, "epoch": 1.3208805870580387, "step": 3960 }, { "epoch": 1.3208805870580387, "ref_ce_loss": 0.09821378439664841, "step": 3960 }, { "epoch": 1.3208805870580387, "loss": 0.8527625799179077, "step": 3960 }, { "ce_loss": 0.28227877616882324, "epoch": 1.3208805870580387, "step": 3960 }, { "distill_loss": 0.1517748087644577, "epoch": 1.3208805870580387, "step": 3960 }, { "epoch": 1.3208805870580387, "ref_ce_loss": 0.14076222479343414, "step": 3960 }, { "epoch": 1.3208805870580387, "loss": 0.45873206853866577, "step": 3960 }, { "ce_loss": 0.1606334000825882, "epoch": 1.3208805870580387, "step": 3960 }, { "distill_loss": 0.13742315769195557, "epoch": 1.3208805870580387, "step": 3960 }, { "epoch": 1.3208805870580387, "ref_ce_loss": 0.10351046919822693, "step": 3960 }, { "epoch": 1.324216144096064, "loss": 0.639, "step": 3970 }, { "epoch": 1.324216144096064, "grad_norm": 1.9281312227249146, "step": 3970 }, { "epoch": 1.324216144096064, "learning_rate": 0.0002857840254488968, "step": 3970 }, { "epoch": 1.324216144096064, "loss": 0.3973920941352844, "step": 3970 }, { "ce_loss": 0.12656432390213013, "epoch": 1.324216144096064, "step": 3970 }, { "distill_loss": 0.1339094638824463, "epoch": 1.324216144096064, "step": 3970 }, { "epoch": 1.324216144096064, "ref_ce_loss": 0.1367185264825821, "step": 3970 }, { "epoch": 1.324216144096064, "loss": 0.6172012090682983, "step": 3970 }, { "ce_loss": 0.10915721207857132, "epoch": 1.324216144096064, "step": 3970 }, { "distill_loss": 0.12282588332891464, "epoch": 1.324216144096064, "step": 3970 }, { "epoch": 1.324216144096064, "ref_ce_loss": 0.12575297057628632, "step": 3970 }, { "epoch": 1.324216144096064, "loss": 0.7371445894241333, "step": 3970 }, { "ce_loss": 0.1586148589849472, "epoch": 1.324216144096064, "step": 3970 }, { "distill_loss": 0.13597846031188965, "epoch": 1.324216144096064, "step": 3970 }, { "epoch": 1.324216144096064, "ref_ce_loss": 0.18562176823616028, "step": 3970 }, { "epoch": 1.324216144096064, "loss": 0.5300966501235962, "step": 3970 }, { "ce_loss": 0.14332100749015808, "epoch": 1.324216144096064, "step": 3970 }, { "distill_loss": 0.11754573881626129, "epoch": 1.324216144096064, "step": 3970 }, { "epoch": 1.324216144096064, "ref_ce_loss": 0.10700936615467072, "step": 3970 }, { "epoch": 1.3275517011340894, "loss": 0.6286, "step": 3980 }, { "epoch": 1.3275517011340894, "grad_norm": 3.6821515560150146, "step": 3980 }, { "epoch": 1.3275517011340894, "learning_rate": 0.0002856978275100373, "step": 3980 }, { "epoch": 1.3275517011340894, "loss": 0.6550862789154053, "step": 3980 }, { "ce_loss": 0.28518232703208923, "epoch": 1.3275517011340894, "step": 3980 }, { "distill_loss": 0.15741953253746033, "epoch": 1.3275517011340894, "step": 3980 }, { "epoch": 1.3275517011340894, "ref_ce_loss": 0.13941185176372528, "step": 3980 }, { "epoch": 1.3275517011340894, "loss": 0.5626774430274963, "step": 3980 }, { "ce_loss": 0.22388912737369537, "epoch": 1.3275517011340894, "step": 3980 }, { "distill_loss": 0.16154928505420685, "epoch": 1.3275517011340894, "step": 3980 }, { "epoch": 1.3275517011340894, "ref_ce_loss": 0.11383243650197983, "step": 3980 }, { "epoch": 1.3275517011340894, "loss": 0.7328876852989197, "step": 3980 }, { "ce_loss": 0.21470485627651215, "epoch": 1.3275517011340894, "step": 3980 }, { "distill_loss": 0.15145975351333618, "epoch": 1.3275517011340894, "step": 3980 }, { "epoch": 1.3275517011340894, "ref_ce_loss": 0.1325717270374298, "step": 3980 }, { "epoch": 1.3275517011340894, "loss": 0.8754336833953857, "step": 3980 }, { "ce_loss": 0.36667221784591675, "epoch": 1.3275517011340894, "step": 3980 }, { "distill_loss": 0.20364922285079956, "epoch": 1.3275517011340894, "step": 3980 }, { "epoch": 1.3275517011340894, "ref_ce_loss": 0.14337895810604095, "step": 3980 }, { "epoch": 1.3308872581721147, "loss": 0.6202, "step": 3990 }, { "epoch": 1.3308872581721147, "grad_norm": 2.8255584239959717, "step": 3990 }, { "epoch": 1.3308872581721147, "learning_rate": 0.0002856113821118688, "step": 3990 }, { "epoch": 1.3308872581721147, "loss": 0.9397168159484863, "step": 3990 }, { "ce_loss": 0.26779523491859436, "epoch": 1.3308872581721147, "step": 3990 }, { "distill_loss": 0.1263412982225418, "epoch": 1.3308872581721147, "step": 3990 }, { "epoch": 1.3308872581721147, "ref_ce_loss": 0.12941111624240875, "step": 3990 }, { "epoch": 1.3308872581721147, "loss": 1.023170828819275, "step": 3990 }, { "ce_loss": 0.368866503238678, "epoch": 1.3308872581721147, "step": 3990 }, { "distill_loss": 0.13925743103027344, "epoch": 1.3308872581721147, "step": 3990 }, { "epoch": 1.3308872581721147, "ref_ce_loss": 0.2336951643228531, "step": 3990 }, { "epoch": 1.3308872581721147, "loss": 0.539009153842926, "step": 3990 }, { "ce_loss": 0.19066128134727478, "epoch": 1.3308872581721147, "step": 3990 }, { "distill_loss": 0.13218340277671814, "epoch": 1.3308872581721147, "step": 3990 }, { "epoch": 1.3308872581721147, "ref_ce_loss": 0.21576425433158875, "step": 3990 }, { "epoch": 1.3308872581721147, "loss": 0.4518980085849762, "step": 3990 }, { "ce_loss": 0.17046760022640228, "epoch": 1.3308872581721147, "step": 3990 }, { "distill_loss": 0.11611610651016235, "epoch": 1.3308872581721147, "step": 3990 }, { "epoch": 1.3308872581721147, "ref_ce_loss": 0.16487550735473633, "step": 3990 }, { "epoch": 1.33422281521014, "loss": 0.6956, "step": 4000 }, { "epoch": 1.33422281521014, "grad_norm": 5.867918968200684, "step": 4000 }, { "epoch": 1.33422281521014, "learning_rate": 0.00028552468941203364, "step": 4000 }, { "epoch": 1.33422281521014, "loss": 0.5843743085861206, "step": 4000 }, { "ce_loss": 0.18848250806331635, "epoch": 1.33422281521014, "step": 4000 }, { "distill_loss": 0.11338036507368088, "epoch": 1.33422281521014, "step": 4000 }, { "epoch": 1.33422281521014, "ref_ce_loss": 0.17525966465473175, "step": 4000 }, { "epoch": 1.33422281521014, "loss": 1.891869068145752, "step": 4000 }, { "ce_loss": 0.308139830827713, "epoch": 1.33422281521014, "step": 4000 }, { "distill_loss": 0.1472199261188507, "epoch": 1.33422281521014, "step": 4000 }, { "epoch": 1.33422281521014, "ref_ce_loss": 0.19832171499729156, "step": 4000 }, { "epoch": 1.33422281521014, "loss": 0.421846479177475, "step": 4000 }, { "ce_loss": 0.18127351999282837, "epoch": 1.33422281521014, "step": 4000 }, { "distill_loss": 0.11779454350471497, "epoch": 1.33422281521014, "step": 4000 }, { "epoch": 1.33422281521014, "ref_ce_loss": 0.12023581564426422, "step": 4000 }, { "epoch": 1.33422281521014, "loss": 0.7354398369789124, "step": 4000 }, { "ce_loss": 0.24078884720802307, "epoch": 1.33422281521014, "step": 4000 }, { "distill_loss": 0.16594208776950836, "epoch": 1.33422281521014, "step": 4000 }, { "epoch": 1.33422281521014, "ref_ce_loss": 0.15056820213794708, "step": 4000 }, { "epoch": 1.3375583722481654, "loss": 0.7304, "step": 4010 }, { "epoch": 1.3375583722481654, "grad_norm": 2.075334072113037, "step": 4010 }, { "epoch": 1.3375583722481654, "learning_rate": 0.0002854377495686252, "step": 4010 }, { "epoch": 1.3375583722481654, "loss": 0.5401085615158081, "step": 4010 }, { "ce_loss": 0.24888886511325836, "epoch": 1.3375583722481654, "step": 4010 }, { "distill_loss": 0.10953701287508011, "epoch": 1.3375583722481654, "step": 4010 }, { "epoch": 1.3375583722481654, "ref_ce_loss": 0.10295696556568146, "step": 4010 }, { "epoch": 1.3375583722481654, "loss": 0.48950207233428955, "step": 4010 }, { "ce_loss": 0.2334887534379959, "epoch": 1.3375583722481654, "step": 4010 }, { "distill_loss": 0.1012965738773346, "epoch": 1.3375583722481654, "step": 4010 }, { "epoch": 1.3375583722481654, "ref_ce_loss": 0.11197902262210846, "step": 4010 }, { "epoch": 1.3375583722481654, "loss": 0.5430691242218018, "step": 4010 }, { "ce_loss": 0.19202624261379242, "epoch": 1.3375583722481654, "step": 4010 }, { "distill_loss": 0.11316399276256561, "epoch": 1.3375583722481654, "step": 4010 }, { "epoch": 1.3375583722481654, "ref_ce_loss": 0.13813838362693787, "step": 4010 }, { "epoch": 1.3375583722481654, "loss": 0.5464562177658081, "step": 4010 }, { "ce_loss": 0.2877531051635742, "epoch": 1.3375583722481654, "step": 4010 }, { "distill_loss": 0.1209205761551857, "epoch": 1.3375583722481654, "step": 4010 }, { "epoch": 1.3375583722481654, "ref_ce_loss": 0.1376003623008728, "step": 4010 }, { "epoch": 1.3408939292861908, "loss": 0.5972, "step": 4020 }, { "epoch": 1.3408939292861908, "grad_norm": 2.555873155593872, "step": 4020 }, { "epoch": 1.3408939292861908, "learning_rate": 0.0002853505627401873, "step": 4020 }, { "epoch": 1.3408939292861908, "loss": 0.9355064630508423, "step": 4020 }, { "ce_loss": 0.26096606254577637, "epoch": 1.3408939292861908, "step": 4020 }, { "distill_loss": 0.17161086201667786, "epoch": 1.3408939292861908, "step": 4020 }, { "epoch": 1.3408939292861908, "ref_ce_loss": 0.23946815729141235, "step": 4020 }, { "epoch": 1.3408939292861908, "loss": 1.841139316558838, "step": 4020 }, { "ce_loss": 0.791628360748291, "epoch": 1.3408939292861908, "step": 4020 }, { "distill_loss": 0.15540112555027008, "epoch": 1.3408939292861908, "step": 4020 }, { "epoch": 1.3408939292861908, "ref_ce_loss": 0.3392656743526459, "step": 4020 }, { "epoch": 1.3408939292861908, "loss": 1.5659822225570679, "step": 4020 }, { "ce_loss": 0.5201265811920166, "epoch": 1.3408939292861908, "step": 4020 }, { "distill_loss": 0.11186160892248154, "epoch": 1.3408939292861908, "step": 4020 }, { "epoch": 1.3408939292861908, "ref_ce_loss": 0.272470086812973, "step": 4020 }, { "epoch": 1.3408939292861908, "loss": 0.9584152698516846, "step": 4020 }, { "ce_loss": 0.46013176441192627, "epoch": 1.3408939292861908, "step": 4020 }, { "distill_loss": 0.15966302156448364, "epoch": 1.3408939292861908, "step": 4020 }, { "epoch": 1.3408939292861908, "ref_ce_loss": 0.25039541721343994, "step": 4020 }, { "epoch": 1.3442294863242161, "loss": 0.7497, "step": 4030 }, { "epoch": 1.3442294863242161, "grad_norm": 12.790151596069336, "step": 4030 }, { "epoch": 1.3442294863242161, "learning_rate": 0.00028526312908571446, "step": 4030 }, { "epoch": 1.3442294863242161, "loss": 0.656446099281311, "step": 4030 }, { "ce_loss": 0.15182964503765106, "epoch": 1.3442294863242161, "step": 4030 }, { "distill_loss": 0.1116434782743454, "epoch": 1.3442294863242161, "step": 4030 }, { "epoch": 1.3442294863242161, "ref_ce_loss": 0.12290992587804794, "step": 4030 }, { "epoch": 1.3442294863242161, "loss": 0.40624871850013733, "step": 4030 }, { "ce_loss": 0.1445799022912979, "epoch": 1.3442294863242161, "step": 4030 }, { "distill_loss": 0.0997803807258606, "epoch": 1.3442294863242161, "step": 4030 }, { "epoch": 1.3442294863242161, "ref_ce_loss": 0.11327410489320755, "step": 4030 }, { "epoch": 1.3442294863242161, "loss": 0.6003686189651489, "step": 4030 }, { "ce_loss": 0.1528758853673935, "epoch": 1.3442294863242161, "step": 4030 }, { "distill_loss": 0.11801660060882568, "epoch": 1.3442294863242161, "step": 4030 }, { "epoch": 1.3442294863242161, "ref_ce_loss": 0.1294703334569931, "step": 4030 }, { "epoch": 1.3442294863242161, "loss": 0.6489979028701782, "step": 4030 }, { "ce_loss": 0.2071543037891388, "epoch": 1.3442294863242161, "step": 4030 }, { "distill_loss": 0.139252707362175, "epoch": 1.3442294863242161, "step": 4030 }, { "epoch": 1.3442294863242161, "ref_ce_loss": 0.1136828362941742, "step": 4030 }, { "epoch": 1.3475650433622415, "loss": 0.6939, "step": 4040 }, { "epoch": 1.3475650433622415, "grad_norm": 3.448014259338379, "step": 4040 }, { "epoch": 1.3475650433622415, "learning_rate": 0.00028517544876465107, "step": 4040 }, { "epoch": 1.3475650433622415, "loss": 0.5830159783363342, "step": 4040 }, { "ce_loss": 0.22778114676475525, "epoch": 1.3475650433622415, "step": 4040 }, { "distill_loss": 0.13242411613464355, "epoch": 1.3475650433622415, "step": 4040 }, { "epoch": 1.3475650433622415, "ref_ce_loss": 0.13737395405769348, "step": 4040 }, { "epoch": 1.3475650433622415, "loss": 0.3476133346557617, "step": 4040 }, { "ce_loss": 0.129276305437088, "epoch": 1.3475650433622415, "step": 4040 }, { "distill_loss": 0.1174379512667656, "epoch": 1.3475650433622415, "step": 4040 }, { "epoch": 1.3475650433622415, "ref_ce_loss": 0.0901045873761177, "step": 4040 }, { "epoch": 1.3475650433622415, "loss": 0.6042814254760742, "step": 4040 }, { "ce_loss": 0.25413164496421814, "epoch": 1.3475650433622415, "step": 4040 }, { "distill_loss": 0.16901132464408875, "epoch": 1.3475650433622415, "step": 4040 }, { "epoch": 1.3475650433622415, "ref_ce_loss": 0.1391698718070984, "step": 4040 }, { "epoch": 1.3475650433622415, "loss": 0.3692128360271454, "step": 4040 }, { "ce_loss": 0.11316437274217606, "epoch": 1.3475650433622415, "step": 4040 }, { "distill_loss": 0.11189061403274536, "epoch": 1.3475650433622415, "step": 4040 }, { "epoch": 1.3475650433622415, "ref_ce_loss": 0.08131895214319229, "step": 4040 }, { "epoch": 1.3509006004002668, "loss": 0.6413, "step": 4050 }, { "epoch": 1.3509006004002668, "grad_norm": 2.262253761291504, "step": 4050 }, { "epoch": 1.3509006004002668, "learning_rate": 0.00028508752193689155, "step": 4050 }, { "epoch": 1.3509006004002668, "loss": 0.4915662407875061, "step": 4050 }, { "ce_loss": 0.22006672620773315, "epoch": 1.3509006004002668, "step": 4050 }, { "distill_loss": 0.14548489451408386, "epoch": 1.3509006004002668, "step": 4050 }, { "epoch": 1.3509006004002668, "ref_ce_loss": 0.12591445446014404, "step": 4050 }, { "epoch": 1.3509006004002668, "loss": 0.8049219846725464, "step": 4050 }, { "ce_loss": 0.16947396099567413, "epoch": 1.3509006004002668, "step": 4050 }, { "distill_loss": 0.1450093388557434, "epoch": 1.3509006004002668, "step": 4050 }, { "epoch": 1.3509006004002668, "ref_ce_loss": 0.10676059871912003, "step": 4050 }, { "epoch": 1.3509006004002668, "loss": 0.47339928150177, "step": 4050 }, { "ce_loss": 0.1769871562719345, "epoch": 1.3509006004002668, "step": 4050 }, { "distill_loss": 0.14859752357006073, "epoch": 1.3509006004002668, "step": 4050 }, { "epoch": 1.3509006004002668, "ref_ce_loss": 0.14764495193958282, "step": 4050 }, { "epoch": 1.3509006004002668, "loss": 0.47751790285110474, "step": 4050 }, { "ce_loss": 0.1551894247531891, "epoch": 1.3509006004002668, "step": 4050 }, { "distill_loss": 0.13194677233695984, "epoch": 1.3509006004002668, "step": 4050 }, { "epoch": 1.3509006004002668, "ref_ce_loss": 0.10842660814523697, "step": 4050 }, { "epoch": 1.3542361574382922, "loss": 0.6347, "step": 4060 }, { "epoch": 1.3542361574382922, "grad_norm": 2.2766051292419434, "step": 4060 }, { "epoch": 1.3542361574382922, "learning_rate": 0.0002849993487627797, "step": 4060 }, { "epoch": 1.3542361574382922, "loss": 0.8118981122970581, "step": 4060 }, { "ce_loss": 0.1651322841644287, "epoch": 1.3542361574382922, "step": 4060 }, { "distill_loss": 0.10374452918767929, "epoch": 1.3542361574382922, "step": 4060 }, { "epoch": 1.3542361574382922, "ref_ce_loss": 0.11030875891447067, "step": 4060 }, { "epoch": 1.3542361574382922, "loss": 0.5269935727119446, "step": 4060 }, { "ce_loss": 0.25401464104652405, "epoch": 1.3542361574382922, "step": 4060 }, { "distill_loss": 0.11832722276449203, "epoch": 1.3542361574382922, "step": 4060 }, { "epoch": 1.3542361574382922, "ref_ce_loss": 0.15434080362319946, "step": 4060 }, { "epoch": 1.3542361574382922, "loss": 0.4596205949783325, "step": 4060 }, { "ce_loss": 0.19093306362628937, "epoch": 1.3542361574382922, "step": 4060 }, { "distill_loss": 0.11900342255830765, "epoch": 1.3542361574382922, "step": 4060 }, { "epoch": 1.3542361574382922, "ref_ce_loss": 0.1062200739979744, "step": 4060 }, { "epoch": 1.3542361574382922, "loss": 0.43231284618377686, "step": 4060 }, { "ce_loss": 0.1506810486316681, "epoch": 1.3542361574382922, "step": 4060 }, { "distill_loss": 0.10709468275308609, "epoch": 1.3542361574382922, "step": 4060 }, { "epoch": 1.3542361574382922, "ref_ce_loss": 0.10686808824539185, "step": 4060 }, { "epoch": 1.3575717144763175, "loss": 0.632, "step": 4070 }, { "epoch": 1.3575717144763175, "grad_norm": 2.651055335998535, "step": 4070 }, { "epoch": 1.3575717144763175, "learning_rate": 0.0002849109294031085, "step": 4070 }, { "epoch": 1.3575717144763175, "loss": 0.4504760205745697, "step": 4070 }, { "ce_loss": 0.1254345029592514, "epoch": 1.3575717144763175, "step": 4070 }, { "distill_loss": 0.09903445094823837, "epoch": 1.3575717144763175, "step": 4070 }, { "epoch": 1.3575717144763175, "ref_ce_loss": 0.1227196678519249, "step": 4070 }, { "epoch": 1.3575717144763175, "loss": 0.8664339780807495, "step": 4070 }, { "ce_loss": 0.265777051448822, "epoch": 1.3575717144763175, "step": 4070 }, { "distill_loss": 0.131103977560997, "epoch": 1.3575717144763175, "step": 4070 }, { "epoch": 1.3575717144763175, "ref_ce_loss": 0.14062733948230743, "step": 4070 }, { "epoch": 1.3575717144763175, "loss": 0.5157327651977539, "step": 4070 }, { "ce_loss": 0.14958372712135315, "epoch": 1.3575717144763175, "step": 4070 }, { "distill_loss": 0.10253413021564484, "epoch": 1.3575717144763175, "step": 4070 }, { "epoch": 1.3575717144763175, "ref_ce_loss": 0.1118377149105072, "step": 4070 }, { "epoch": 1.3575717144763175, "loss": 0.3796268105506897, "step": 4070 }, { "ce_loss": 0.16047704219818115, "epoch": 1.3575717144763175, "step": 4070 }, { "distill_loss": 0.10889697074890137, "epoch": 1.3575717144763175, "step": 4070 }, { "epoch": 1.3575717144763175, "ref_ce_loss": 0.11008761078119278, "step": 4070 }, { "epoch": 1.3609072715143429, "loss": 0.7058, "step": 4080 }, { "epoch": 1.3609072715143429, "grad_norm": 3.449279308319092, "step": 4080 }, { "epoch": 1.3609072715143429, "learning_rate": 0.00028482226401912016, "step": 4080 }, { "epoch": 1.3609072715143429, "loss": 0.780163049697876, "step": 4080 }, { "ce_loss": 0.33349332213401794, "epoch": 1.3609072715143429, "step": 4080 }, { "distill_loss": 0.1183605045080185, "epoch": 1.3609072715143429, "step": 4080 }, { "epoch": 1.3609072715143429, "ref_ce_loss": 0.161131352186203, "step": 4080 }, { "epoch": 1.3609072715143429, "loss": 0.6341320276260376, "step": 4080 }, { "ce_loss": 0.17753848433494568, "epoch": 1.3609072715143429, "step": 4080 }, { "distill_loss": 0.10119305551052094, "epoch": 1.3609072715143429, "step": 4080 }, { "epoch": 1.3609072715143429, "ref_ce_loss": 0.1781836748123169, "step": 4080 }, { "epoch": 1.3609072715143429, "loss": 0.8396756052970886, "step": 4080 }, { "ce_loss": 0.21004533767700195, "epoch": 1.3609072715143429, "step": 4080 }, { "distill_loss": 0.1059829443693161, "epoch": 1.3609072715143429, "step": 4080 }, { "epoch": 1.3609072715143429, "ref_ce_loss": 0.18439234793186188, "step": 4080 }, { "epoch": 1.3609072715143429, "loss": 0.5295935869216919, "step": 4080 }, { "ce_loss": 0.2418661117553711, "epoch": 1.3609072715143429, "step": 4080 }, { "distill_loss": 0.11171921342611313, "epoch": 1.3609072715143429, "step": 4080 }, { "epoch": 1.3609072715143429, "ref_ce_loss": 0.11636696010828018, "step": 4080 }, { "epoch": 1.3642428285523682, "loss": 0.6155, "step": 4090 }, { "epoch": 1.3642428285523682, "grad_norm": 2.1769909858703613, "step": 4090 }, { "epoch": 1.3642428285523682, "learning_rate": 0.00028473335277250534, "step": 4090 }, { "epoch": 1.3642428285523682, "loss": 0.4903789162635803, "step": 4090 }, { "ce_loss": 0.1835421472787857, "epoch": 1.3642428285523682, "step": 4090 }, { "distill_loss": 0.07726556807756424, "epoch": 1.3642428285523682, "step": 4090 }, { "epoch": 1.3642428285523682, "ref_ce_loss": 0.13395462930202484, "step": 4090 }, { "epoch": 1.3642428285523682, "loss": 0.5141618251800537, "step": 4090 }, { "ce_loss": 0.21861040592193604, "epoch": 1.3642428285523682, "step": 4090 }, { "distill_loss": 0.09555073082447052, "epoch": 1.3642428285523682, "step": 4090 }, { "epoch": 1.3642428285523682, "ref_ce_loss": 0.13811346888542175, "step": 4090 }, { "epoch": 1.3642428285523682, "loss": 0.7884995341300964, "step": 4090 }, { "ce_loss": 0.4218871593475342, "epoch": 1.3642428285523682, "step": 4090 }, { "distill_loss": 0.1013653576374054, "epoch": 1.3642428285523682, "step": 4090 }, { "epoch": 1.3642428285523682, "ref_ce_loss": 0.19294792413711548, "step": 4090 }, { "epoch": 1.3642428285523682, "loss": 0.6917357444763184, "step": 4090 }, { "ce_loss": 0.2670474946498871, "epoch": 1.3642428285523682, "step": 4090 }, { "distill_loss": 0.11097948253154755, "epoch": 1.3642428285523682, "step": 4090 }, { "epoch": 1.3642428285523682, "ref_ce_loss": 0.12580280005931854, "step": 4090 }, { "epoch": 1.3675783855903936, "loss": 0.6481, "step": 4100 }, { "epoch": 1.3675783855903936, "grad_norm": 2.9323387145996094, "step": 4100 }, { "epoch": 1.3675783855903936, "learning_rate": 0.00028464419582540295, "step": 4100 }, { "epoch": 1.3675783855903936, "loss": 0.6545873880386353, "step": 4100 }, { "ce_loss": 0.3142815828323364, "epoch": 1.3675783855903936, "step": 4100 }, { "distill_loss": 0.13354846835136414, "epoch": 1.3675783855903936, "step": 4100 }, { "epoch": 1.3675783855903936, "ref_ce_loss": 0.17563565075397491, "step": 4100 }, { "epoch": 1.3675783855903936, "loss": 0.8540979623794556, "step": 4100 }, { "ce_loss": 0.21789871156215668, "epoch": 1.3675783855903936, "step": 4100 }, { "distill_loss": 0.12937171757221222, "epoch": 1.3675783855903936, "step": 4100 }, { "epoch": 1.3675783855903936, "ref_ce_loss": 0.15378691256046295, "step": 4100 }, { "epoch": 1.3675783855903936, "loss": 0.7798890471458435, "step": 4100 }, { "ce_loss": 0.3511600196361542, "epoch": 1.3675783855903936, "step": 4100 }, { "distill_loss": 0.14735189080238342, "epoch": 1.3675783855903936, "step": 4100 }, { "epoch": 1.3675783855903936, "ref_ce_loss": 0.15318696200847626, "step": 4100 }, { "epoch": 1.3675783855903936, "loss": 0.5215632915496826, "step": 4100 }, { "ce_loss": 0.15824168920516968, "epoch": 1.3675783855903936, "step": 4100 }, { "distill_loss": 0.10788905620574951, "epoch": 1.3675783855903936, "step": 4100 }, { "epoch": 1.3675783855903936, "ref_ce_loss": 0.11126714199781418, "step": 4100 }, { "epoch": 1.370913942628419, "loss": 0.6594, "step": 4110 }, { "epoch": 1.370913942628419, "grad_norm": 1.8464733362197876, "step": 4110 }, { "epoch": 1.370913942628419, "learning_rate": 0.0002845547933404002, "step": 4110 }, { "epoch": 1.370913942628419, "loss": 0.34922516345977783, "step": 4110 }, { "ce_loss": 0.15206386148929596, "epoch": 1.370913942628419, "step": 4110 }, { "distill_loss": 0.08681447803974152, "epoch": 1.370913942628419, "step": 4110 }, { "epoch": 1.370913942628419, "ref_ce_loss": 0.10820655524730682, "step": 4110 }, { "epoch": 1.370913942628419, "loss": 0.3788524568080902, "step": 4110 }, { "ce_loss": 0.15653900802135468, "epoch": 1.370913942628419, "step": 4110 }, { "distill_loss": 0.09988351911306381, "epoch": 1.370913942628419, "step": 4110 }, { "epoch": 1.370913942628419, "ref_ce_loss": 0.12210514396429062, "step": 4110 }, { "epoch": 1.370913942628419, "loss": 0.6386069655418396, "step": 4110 }, { "ce_loss": 0.16010144352912903, "epoch": 1.370913942628419, "step": 4110 }, { "distill_loss": 0.13131670653820038, "epoch": 1.370913942628419, "step": 4110 }, { "epoch": 1.370913942628419, "ref_ce_loss": 0.12684138119220734, "step": 4110 }, { "epoch": 1.370913942628419, "loss": 0.5312026143074036, "step": 4110 }, { "ce_loss": 0.2253277450799942, "epoch": 1.370913942628419, "step": 4110 }, { "distill_loss": 0.10457585752010345, "epoch": 1.370913942628419, "step": 4110 }, { "epoch": 1.370913942628419, "ref_ce_loss": 0.1474202573299408, "step": 4110 }, { "epoch": 1.3742494996664443, "loss": 0.5724, "step": 4120 }, { "epoch": 1.3742494996664443, "grad_norm": 2.848768949508667, "step": 4120 }, { "epoch": 1.3742494996664443, "learning_rate": 0.00028446514548053194, "step": 4120 }, { "epoch": 1.3742494996664443, "loss": 0.35788092017173767, "step": 4120 }, { "ce_loss": 0.08504143357276917, "epoch": 1.3742494996664443, "step": 4120 }, { "distill_loss": 0.08752532303333282, "epoch": 1.3742494996664443, "step": 4120 }, { "epoch": 1.3742494996664443, "ref_ce_loss": 0.0958314761519432, "step": 4120 }, { "epoch": 1.3742494996664443, "loss": 1.4491925239562988, "step": 4120 }, { "ce_loss": 0.3018847107887268, "epoch": 1.3742494996664443, "step": 4120 }, { "distill_loss": 0.14247466623783112, "epoch": 1.3742494996664443, "step": 4120 }, { "epoch": 1.3742494996664443, "ref_ce_loss": 0.14462648332118988, "step": 4120 }, { "epoch": 1.3742494996664443, "loss": 0.3580704927444458, "step": 4120 }, { "ce_loss": 0.12576836347579956, "epoch": 1.3742494996664443, "step": 4120 }, { "distill_loss": 0.11499390751123428, "epoch": 1.3742494996664443, "step": 4120 }, { "epoch": 1.3742494996664443, "ref_ce_loss": 0.1170811876654625, "step": 4120 }, { "epoch": 1.3742494996664443, "loss": 0.6562156081199646, "step": 4120 }, { "ce_loss": 0.2465030401945114, "epoch": 1.3742494996664443, "step": 4120 }, { "distill_loss": 0.12957395613193512, "epoch": 1.3742494996664443, "step": 4120 }, { "epoch": 1.3742494996664443, "ref_ce_loss": 0.13839758932590485, "step": 4120 }, { "epoch": 1.3775850567044696, "loss": 0.6316, "step": 4130 }, { "epoch": 1.3775850567044696, "grad_norm": 2.6617259979248047, "step": 4130 }, { "epoch": 1.3775850567044696, "learning_rate": 0.0002843752524092805, "step": 4130 }, { "epoch": 1.3775850567044696, "loss": 0.5867189764976501, "step": 4130 }, { "ce_loss": 0.22917450964450836, "epoch": 1.3775850567044696, "step": 4130 }, { "distill_loss": 0.10425989329814911, "epoch": 1.3775850567044696, "step": 4130 }, { "epoch": 1.3775850567044696, "ref_ce_loss": 0.21919582784175873, "step": 4130 }, { "epoch": 1.3775850567044696, "loss": 0.7949258685112, "step": 4130 }, { "ce_loss": 0.29066002368927, "epoch": 1.3775850567044696, "step": 4130 }, { "distill_loss": 0.10276904702186584, "epoch": 1.3775850567044696, "step": 4130 }, { "epoch": 1.3775850567044696, "ref_ce_loss": 0.1894405633211136, "step": 4130 }, { "epoch": 1.3775850567044696, "loss": 0.5634397268295288, "step": 4130 }, { "ce_loss": 0.204814150929451, "epoch": 1.3775850567044696, "step": 4130 }, { "distill_loss": 0.09537995606660843, "epoch": 1.3775850567044696, "step": 4130 }, { "epoch": 1.3775850567044696, "ref_ce_loss": 0.13756123185157776, "step": 4130 }, { "epoch": 1.3775850567044696, "loss": 0.7808976769447327, "step": 4130 }, { "ce_loss": 0.25047364830970764, "epoch": 1.3775850567044696, "step": 4130 }, { "distill_loss": 0.12359421700239182, "epoch": 1.3775850567044696, "step": 4130 }, { "epoch": 1.3775850567044696, "ref_ce_loss": 0.22865727543830872, "step": 4130 }, { "epoch": 1.380920613742495, "loss": 0.6169, "step": 4140 }, { "epoch": 1.380920613742495, "grad_norm": 2.3741250038146973, "step": 4140 }, { "epoch": 1.380920613742495, "learning_rate": 0.0002842851142905754, "step": 4140 }, { "epoch": 1.380920613742495, "loss": 0.8009947538375854, "step": 4140 }, { "ce_loss": 0.3151392638683319, "epoch": 1.380920613742495, "step": 4140 }, { "distill_loss": 0.13409563899040222, "epoch": 1.380920613742495, "step": 4140 }, { "epoch": 1.380920613742495, "ref_ce_loss": 0.17703871428966522, "step": 4140 }, { "epoch": 1.380920613742495, "loss": 0.525648295879364, "step": 4140 }, { "ce_loss": 0.203111931681633, "epoch": 1.380920613742495, "step": 4140 }, { "distill_loss": 0.09106729179620743, "epoch": 1.380920613742495, "step": 4140 }, { "epoch": 1.380920613742495, "ref_ce_loss": 0.1332019865512848, "step": 4140 }, { "epoch": 1.380920613742495, "loss": 0.4340347349643707, "step": 4140 }, { "ce_loss": 0.14689216017723083, "epoch": 1.380920613742495, "step": 4140 }, { "distill_loss": 0.1072496846318245, "epoch": 1.380920613742495, "step": 4140 }, { "epoch": 1.380920613742495, "ref_ce_loss": 0.12199627608060837, "step": 4140 }, { "epoch": 1.380920613742495, "loss": 0.6135590076446533, "step": 4140 }, { "ce_loss": 0.24815963208675385, "epoch": 1.380920613742495, "step": 4140 }, { "distill_loss": 0.09719424694776535, "epoch": 1.380920613742495, "step": 4140 }, { "epoch": 1.380920613742495, "ref_ce_loss": 0.2031925618648529, "step": 4140 }, { "epoch": 1.3842561707805203, "loss": 0.6132, "step": 4150 }, { "epoch": 1.3842561707805203, "grad_norm": 2.4850473403930664, "step": 4150 }, { "epoch": 1.3842561707805203, "learning_rate": 0.0002841947312887929, "step": 4150 }, { "epoch": 1.3842561707805203, "loss": 0.6122986078262329, "step": 4150 }, { "ce_loss": 0.19411823153495789, "epoch": 1.3842561707805203, "step": 4150 }, { "distill_loss": 0.09509175270795822, "epoch": 1.3842561707805203, "step": 4150 }, { "epoch": 1.3842561707805203, "ref_ce_loss": 0.12494823336601257, "step": 4150 }, { "epoch": 1.3842561707805203, "loss": 0.2789407968521118, "step": 4150 }, { "ce_loss": 0.11748460680246353, "epoch": 1.3842561707805203, "step": 4150 }, { "distill_loss": 0.09389721602201462, "epoch": 1.3842561707805203, "step": 4150 }, { "epoch": 1.3842561707805203, "ref_ce_loss": 0.0674012303352356, "step": 4150 }, { "epoch": 1.3842561707805203, "loss": 0.4519701600074768, "step": 4150 }, { "ce_loss": 0.22965377569198608, "epoch": 1.3842561707805203, "step": 4150 }, { "distill_loss": 0.07461632788181305, "epoch": 1.3842561707805203, "step": 4150 }, { "epoch": 1.3842561707805203, "ref_ce_loss": 0.14758454263210297, "step": 4150 }, { "epoch": 1.3842561707805203, "loss": 0.3582833409309387, "step": 4150 }, { "ce_loss": 0.0969460979104042, "epoch": 1.3842561707805203, "step": 4150 }, { "distill_loss": 0.07818230241537094, "epoch": 1.3842561707805203, "step": 4150 }, { "epoch": 1.3842561707805203, "ref_ce_loss": 0.1007329449057579, "step": 4150 }, { "epoch": 1.3875917278185457, "loss": 0.6136, "step": 4160 }, { "epoch": 1.3875917278185457, "grad_norm": 2.14587664604187, "step": 4160 }, { "epoch": 1.3875917278185457, "learning_rate": 0.00028410410356875614, "step": 4160 }, { "epoch": 1.3875917278185457, "loss": 0.6464195251464844, "step": 4160 }, { "ce_loss": 0.2074149250984192, "epoch": 1.3875917278185457, "step": 4160 }, { "distill_loss": 0.10325966775417328, "epoch": 1.3875917278185457, "step": 4160 }, { "epoch": 1.3875917278185457, "ref_ce_loss": 0.12344758957624435, "step": 4160 }, { "epoch": 1.3875917278185457, "loss": 0.7069451212882996, "step": 4160 }, { "ce_loss": 0.1398283839225769, "epoch": 1.3875917278185457, "step": 4160 }, { "distill_loss": 0.10599468648433685, "epoch": 1.3875917278185457, "step": 4160 }, { "epoch": 1.3875917278185457, "ref_ce_loss": 0.15541532635688782, "step": 4160 }, { "epoch": 1.3875917278185457, "loss": 0.52448970079422, "step": 4160 }, { "ce_loss": 0.18425686657428741, "epoch": 1.3875917278185457, "step": 4160 }, { "distill_loss": 0.10802476853132248, "epoch": 1.3875917278185457, "step": 4160 }, { "epoch": 1.3875917278185457, "ref_ce_loss": 0.1491173356771469, "step": 4160 }, { "epoch": 1.3875917278185457, "loss": 0.5847429037094116, "step": 4160 }, { "ce_loss": 0.18347282707691193, "epoch": 1.3875917278185457, "step": 4160 }, { "distill_loss": 0.09988974034786224, "epoch": 1.3875917278185457, "step": 4160 }, { "epoch": 1.3875917278185457, "ref_ce_loss": 0.19402439892292023, "step": 4160 }, { "epoch": 1.390927284856571, "loss": 0.651, "step": 4170 }, { "epoch": 1.390927284856571, "grad_norm": 4.222642421722412, "step": 4170 }, { "epoch": 1.390927284856571, "learning_rate": 0.00028401323129573415, "step": 4170 }, { "epoch": 1.390927284856571, "loss": 0.5438768863677979, "step": 4170 }, { "ce_loss": 0.24094192683696747, "epoch": 1.390927284856571, "step": 4170 }, { "distill_loss": 0.10817679762840271, "epoch": 1.390927284856571, "step": 4170 }, { "epoch": 1.390927284856571, "ref_ce_loss": 0.15798398852348328, "step": 4170 }, { "epoch": 1.390927284856571, "loss": 0.7662238478660583, "step": 4170 }, { "ce_loss": 0.28996947407722473, "epoch": 1.390927284856571, "step": 4170 }, { "distill_loss": 0.12934328615665436, "epoch": 1.390927284856571, "step": 4170 }, { "epoch": 1.390927284856571, "ref_ce_loss": 0.1425507664680481, "step": 4170 }, { "epoch": 1.390927284856571, "loss": 0.790197491645813, "step": 4170 }, { "ce_loss": 0.18555812537670135, "epoch": 1.390927284856571, "step": 4170 }, { "distill_loss": 0.08629944920539856, "epoch": 1.390927284856571, "step": 4170 }, { "epoch": 1.390927284856571, "ref_ce_loss": 0.12687258422374725, "step": 4170 }, { "epoch": 1.390927284856571, "loss": 0.6215685606002808, "step": 4170 }, { "ce_loss": 0.21749264001846313, "epoch": 1.390927284856571, "step": 4170 }, { "distill_loss": 0.09972754865884781, "epoch": 1.390927284856571, "step": 4170 }, { "epoch": 1.390927284856571, "ref_ce_loss": 0.10281194746494293, "step": 4170 }, { "epoch": 1.3942628418945964, "loss": 0.5929, "step": 4180 }, { "epoch": 1.3942628418945964, "grad_norm": 3.127805709838867, "step": 4180 }, { "epoch": 1.3942628418945964, "learning_rate": 0.00028392211463544224, "step": 4180 }, { "epoch": 1.3942628418945964, "loss": 1.1103284358978271, "step": 4180 }, { "ce_loss": 0.14541105926036835, "epoch": 1.3942628418945964, "step": 4180 }, { "distill_loss": 0.08611320704221725, "epoch": 1.3942628418945964, "step": 4180 }, { "epoch": 1.3942628418945964, "ref_ce_loss": 0.1386411339044571, "step": 4180 }, { "epoch": 1.3942628418945964, "loss": 0.7187755107879639, "step": 4180 }, { "ce_loss": 0.12636052072048187, "epoch": 1.3942628418945964, "step": 4180 }, { "distill_loss": 0.08641035109758377, "epoch": 1.3942628418945964, "step": 4180 }, { "epoch": 1.3942628418945964, "ref_ce_loss": 0.09005328267812729, "step": 4180 }, { "epoch": 1.3942628418945964, "loss": 0.580119252204895, "step": 4180 }, { "ce_loss": 0.16198816895484924, "epoch": 1.3942628418945964, "step": 4180 }, { "distill_loss": 0.07411843538284302, "epoch": 1.3942628418945964, "step": 4180 }, { "epoch": 1.3942628418945964, "ref_ce_loss": 0.13253140449523926, "step": 4180 }, { "epoch": 1.3942628418945964, "loss": 0.5698190927505493, "step": 4180 }, { "ce_loss": 0.2511466443538666, "epoch": 1.3942628418945964, "step": 4180 }, { "distill_loss": 0.11713631451129913, "epoch": 1.3942628418945964, "step": 4180 }, { "epoch": 1.3942628418945964, "ref_ce_loss": 0.08613384515047073, "step": 4180 }, { "epoch": 1.3975983989326217, "loss": 0.6207, "step": 4190 }, { "epoch": 1.3975983989326217, "grad_norm": 2.769015312194824, "step": 4190 }, { "epoch": 1.3975983989326217, "learning_rate": 0.0002838307537540411, "step": 4190 }, { "epoch": 1.3975983989326217, "loss": 0.7650259733200073, "step": 4190 }, { "ce_loss": 0.2717476189136505, "epoch": 1.3975983989326217, "step": 4190 }, { "distill_loss": 0.0811992660164833, "epoch": 1.3975983989326217, "step": 4190 }, { "epoch": 1.3975983989326217, "ref_ce_loss": 0.21618877351284027, "step": 4190 }, { "epoch": 1.3975983989326217, "loss": 0.9913129806518555, "step": 4190 }, { "ce_loss": 0.22515885531902313, "epoch": 1.3975983989326217, "step": 4190 }, { "distill_loss": 0.10443403571844101, "epoch": 1.3975983989326217, "step": 4190 }, { "epoch": 1.3975983989326217, "ref_ce_loss": 0.1330164223909378, "step": 4190 }, { "epoch": 1.3975983989326217, "loss": 0.458392858505249, "step": 4190 }, { "ce_loss": 0.2054453045129776, "epoch": 1.3975983989326217, "step": 4190 }, { "distill_loss": 0.08475649356842041, "epoch": 1.3975983989326217, "step": 4190 }, { "epoch": 1.3975983989326217, "ref_ce_loss": 0.11416659504175186, "step": 4190 }, { "epoch": 1.3975983989326217, "loss": 0.9844976663589478, "step": 4190 }, { "ce_loss": 0.2999069392681122, "epoch": 1.3975983989326217, "step": 4190 }, { "distill_loss": 0.10008185356855392, "epoch": 1.3975983989326217, "step": 4190 }, { "epoch": 1.3975983989326217, "ref_ce_loss": 0.1291126161813736, "step": 4190 }, { "epoch": 1.400933955970647, "loss": 0.6316, "step": 4200 }, { "epoch": 1.400933955970647, "grad_norm": 1.8868513107299805, "step": 4200 }, { "epoch": 1.400933955970647, "learning_rate": 0.00028373914881813715, "step": 4200 }, { "epoch": 1.400933955970647, "loss": 0.910756528377533, "step": 4200 }, { "ce_loss": 0.16571709513664246, "epoch": 1.400933955970647, "step": 4200 }, { "distill_loss": 0.09719116985797882, "epoch": 1.400933955970647, "step": 4200 }, { "epoch": 1.400933955970647, "ref_ce_loss": 0.1444815844297409, "step": 4200 }, { "epoch": 1.400933955970647, "loss": 0.41164451837539673, "step": 4200 }, { "ce_loss": 0.12055205553770065, "epoch": 1.400933955970647, "step": 4200 }, { "distill_loss": 0.0802890807390213, "epoch": 1.400933955970647, "step": 4200 }, { "epoch": 1.400933955970647, "ref_ce_loss": 0.07115507870912552, "step": 4200 }, { "epoch": 1.400933955970647, "loss": 0.6578470468521118, "step": 4200 }, { "ce_loss": 0.11765425652265549, "epoch": 1.400933955970647, "step": 4200 }, { "distill_loss": 0.11294358223676682, "epoch": 1.400933955970647, "step": 4200 }, { "epoch": 1.400933955970647, "ref_ce_loss": 0.16021910309791565, "step": 4200 }, { "epoch": 1.400933955970647, "loss": 0.5982992649078369, "step": 4200 }, { "ce_loss": 0.2084062397480011, "epoch": 1.400933955970647, "step": 4200 }, { "distill_loss": 0.1283988654613495, "epoch": 1.400933955970647, "step": 4200 }, { "epoch": 1.400933955970647, "ref_ce_loss": 0.18639861047267914, "step": 4200 }, { "epoch": 1.4042695130086724, "loss": 0.6715, "step": 4210 }, { "epoch": 1.4042695130086724, "grad_norm": 2.1879544258117676, "step": 4210 }, { "epoch": 1.4042695130086724, "learning_rate": 0.00028364729999478145, "step": 4210 }, { "epoch": 1.4042695130086724, "loss": 0.6384764909744263, "step": 4210 }, { "ce_loss": 0.2525661289691925, "epoch": 1.4042695130086724, "step": 4210 }, { "distill_loss": 0.09707111120223999, "epoch": 1.4042695130086724, "step": 4210 }, { "epoch": 1.4042695130086724, "ref_ce_loss": 0.18559350073337555, "step": 4210 }, { "epoch": 1.4042695130086724, "loss": 0.5137649178504944, "step": 4210 }, { "ce_loss": 0.20408010482788086, "epoch": 1.4042695130086724, "step": 4210 }, { "distill_loss": 0.12385561317205429, "epoch": 1.4042695130086724, "step": 4210 }, { "epoch": 1.4042695130086724, "ref_ce_loss": 0.1269007921218872, "step": 4210 }, { "epoch": 1.4042695130086724, "loss": 0.5365128517150879, "step": 4210 }, { "ce_loss": 0.2322646677494049, "epoch": 1.4042695130086724, "step": 4210 }, { "distill_loss": 0.1266290843486786, "epoch": 1.4042695130086724, "step": 4210 }, { "epoch": 1.4042695130086724, "ref_ce_loss": 0.12365609407424927, "step": 4210 }, { "epoch": 1.4042695130086724, "loss": 0.30896055698394775, "step": 4210 }, { "ce_loss": 0.10330818593502045, "epoch": 1.4042695130086724, "step": 4210 }, { "distill_loss": 0.0924607366323471, "epoch": 1.4042695130086724, "step": 4210 }, { "epoch": 1.4042695130086724, "ref_ce_loss": 0.11242125183343887, "step": 4210 }, { "epoch": 1.4076050700466978, "loss": 0.6023, "step": 4220 }, { "epoch": 1.4076050700466978, "grad_norm": 3.4202616214752197, "step": 4220 }, { "epoch": 1.4076050700466978, "learning_rate": 0.0002835552074514702, "step": 4220 }, { "epoch": 1.4076050700466978, "loss": 0.3378927707672119, "step": 4220 }, { "ce_loss": 0.07960563153028488, "epoch": 1.4076050700466978, "step": 4220 }, { "distill_loss": 0.0758289247751236, "epoch": 1.4076050700466978, "step": 4220 }, { "epoch": 1.4076050700466978, "ref_ce_loss": 0.06120488420128822, "step": 4220 }, { "epoch": 1.4076050700466978, "loss": 0.4977499842643738, "step": 4220 }, { "ce_loss": 0.19765213131904602, "epoch": 1.4076050700466978, "step": 4220 }, { "distill_loss": 0.10244348645210266, "epoch": 1.4076050700466978, "step": 4220 }, { "epoch": 1.4076050700466978, "ref_ce_loss": 0.15283609926700592, "step": 4220 }, { "epoch": 1.4076050700466978, "loss": 0.6151387691497803, "step": 4220 }, { "ce_loss": 0.2646377980709076, "epoch": 1.4076050700466978, "step": 4220 }, { "distill_loss": 0.1459384262561798, "epoch": 1.4076050700466978, "step": 4220 }, { "epoch": 1.4076050700466978, "ref_ce_loss": 0.1401008814573288, "step": 4220 }, { "epoch": 1.4076050700466978, "loss": 0.5750916004180908, "step": 4220 }, { "ce_loss": 0.2228243350982666, "epoch": 1.4076050700466978, "step": 4220 }, { "distill_loss": 0.09344765543937683, "epoch": 1.4076050700466978, "step": 4220 }, { "epoch": 1.4076050700466978, "ref_ce_loss": 0.1685912162065506, "step": 4220 }, { "epoch": 1.4109406270847231, "loss": 0.6674, "step": 4230 }, { "epoch": 1.4109406270847231, "grad_norm": 2.043844223022461, "step": 4230 }, { "epoch": 1.4109406270847231, "learning_rate": 0.00028346287135614376, "step": 4230 }, { "epoch": 1.4109406270847231, "loss": 0.5440961122512817, "step": 4230 }, { "ce_loss": 0.21927763521671295, "epoch": 1.4109406270847231, "step": 4230 }, { "distill_loss": 0.09879957139492035, "epoch": 1.4109406270847231, "step": 4230 }, { "epoch": 1.4109406270847231, "ref_ce_loss": 0.11110997945070267, "step": 4230 }, { "epoch": 1.4109406270847231, "loss": 0.6085052490234375, "step": 4230 }, { "ce_loss": 0.2495298832654953, "epoch": 1.4109406270847231, "step": 4230 }, { "distill_loss": 0.12600085139274597, "epoch": 1.4109406270847231, "step": 4230 }, { "epoch": 1.4109406270847231, "ref_ce_loss": 0.1036033183336258, "step": 4230 }, { "epoch": 1.4109406270847231, "loss": 0.7630167007446289, "step": 4230 }, { "ce_loss": 0.21309255063533783, "epoch": 1.4109406270847231, "step": 4230 }, { "distill_loss": 0.11631793528795242, "epoch": 1.4109406270847231, "step": 4230 }, { "epoch": 1.4109406270847231, "ref_ce_loss": 0.14728353917598724, "step": 4230 }, { "epoch": 1.4109406270847231, "loss": 0.3512745201587677, "step": 4230 }, { "ce_loss": 0.15760624408721924, "epoch": 1.4109406270847231, "step": 4230 }, { "distill_loss": 0.08606228977441788, "epoch": 1.4109406270847231, "step": 4230 }, { "epoch": 1.4109406270847231, "ref_ce_loss": 0.10728107392787933, "step": 4230 }, { "epoch": 1.4142761841227485, "loss": 0.6071, "step": 4240 }, { "epoch": 1.4142761841227485, "grad_norm": 3.563107967376709, "step": 4240 }, { "epoch": 1.4142761841227485, "learning_rate": 0.0002833702918771868, "step": 4240 }, { "epoch": 1.4142761841227485, "loss": 0.6661812663078308, "step": 4240 }, { "ce_loss": 0.2621847689151764, "epoch": 1.4142761841227485, "step": 4240 }, { "distill_loss": 0.12339521944522858, "epoch": 1.4142761841227485, "step": 4240 }, { "epoch": 1.4142761841227485, "ref_ce_loss": 0.11897028237581253, "step": 4240 }, { "epoch": 1.4142761841227485, "loss": 0.3537221848964691, "step": 4240 }, { "ce_loss": 0.12370649725198746, "epoch": 1.4142761841227485, "step": 4240 }, { "distill_loss": 0.08683275431394577, "epoch": 1.4142761841227485, "step": 4240 }, { "epoch": 1.4142761841227485, "ref_ce_loss": 0.08618742972612381, "step": 4240 }, { "epoch": 1.4142761841227485, "loss": 1.1888618469238281, "step": 4240 }, { "ce_loss": 0.35263460874557495, "epoch": 1.4142761841227485, "step": 4240 }, { "distill_loss": 0.15241080522537231, "epoch": 1.4142761841227485, "step": 4240 }, { "epoch": 1.4142761841227485, "ref_ce_loss": 0.19804182648658752, "step": 4240 }, { "epoch": 1.4142761841227485, "loss": 0.6669833064079285, "step": 4240 }, { "ce_loss": 0.19273287057876587, "epoch": 1.4142761841227485, "step": 4240 }, { "distill_loss": 0.11122193187475204, "epoch": 1.4142761841227485, "step": 4240 }, { "epoch": 1.4142761841227485, "ref_ce_loss": 0.159165620803833, "step": 4240 }, { "epoch": 1.4176117411607738, "loss": 0.596, "step": 4250 }, { "epoch": 1.4176117411607738, "grad_norm": 3.0415971279144287, "step": 4250 }, { "epoch": 1.4176117411607738, "learning_rate": 0.00028327746918342764, "step": 4250 }, { "epoch": 1.4176117411607738, "loss": 0.6574098467826843, "step": 4250 }, { "ce_loss": 0.35821470618247986, "epoch": 1.4176117411607738, "step": 4250 }, { "distill_loss": 0.10992508381605148, "epoch": 1.4176117411607738, "step": 4250 }, { "epoch": 1.4176117411607738, "ref_ce_loss": 0.14264172315597534, "step": 4250 }, { "epoch": 1.4176117411607738, "loss": 0.7249197959899902, "step": 4250 }, { "ce_loss": 0.32912251353263855, "epoch": 1.4176117411607738, "step": 4250 }, { "distill_loss": 0.11514975875616074, "epoch": 1.4176117411607738, "step": 4250 }, { "epoch": 1.4176117411607738, "ref_ce_loss": 0.1938469558954239, "step": 4250 }, { "epoch": 1.4176117411607738, "loss": 0.790244460105896, "step": 4250 }, { "ce_loss": 0.29465973377227783, "epoch": 1.4176117411607738, "step": 4250 }, { "distill_loss": 0.09840358048677444, "epoch": 1.4176117411607738, "step": 4250 }, { "epoch": 1.4176117411607738, "ref_ce_loss": 0.1359245926141739, "step": 4250 }, { "epoch": 1.4176117411607738, "loss": 0.7141227722167969, "step": 4250 }, { "ce_loss": 0.3344563841819763, "epoch": 1.4176117411607738, "step": 4250 }, { "distill_loss": 0.11084728688001633, "epoch": 1.4176117411607738, "step": 4250 }, { "epoch": 1.4176117411607738, "ref_ce_loss": 0.17706482112407684, "step": 4250 }, { "epoch": 1.4209472981987992, "loss": 0.6059, "step": 4260 }, { "epoch": 1.4209472981987992, "grad_norm": 2.2450828552246094, "step": 4260 }, { "epoch": 1.4209472981987992, "learning_rate": 0.0002831844034441384, "step": 4260 }, { "epoch": 1.4209472981987992, "loss": 0.878659725189209, "step": 4260 }, { "ce_loss": 0.29933464527130127, "epoch": 1.4209472981987992, "step": 4260 }, { "distill_loss": 0.1180991381406784, "epoch": 1.4209472981987992, "step": 4260 }, { "epoch": 1.4209472981987992, "ref_ce_loss": 0.1688729077577591, "step": 4260 }, { "epoch": 1.4209472981987992, "loss": 0.820519745349884, "step": 4260 }, { "ce_loss": 0.38678011298179626, "epoch": 1.4209472981987992, "step": 4260 }, { "distill_loss": 0.12214456498622894, "epoch": 1.4209472981987992, "step": 4260 }, { "epoch": 1.4209472981987992, "ref_ce_loss": 0.1592690348625183, "step": 4260 }, { "epoch": 1.4209472981987992, "loss": 0.5947272777557373, "step": 4260 }, { "ce_loss": 0.23920579254627228, "epoch": 1.4209472981987992, "step": 4260 }, { "distill_loss": 0.09341529756784439, "epoch": 1.4209472981987992, "step": 4260 }, { "epoch": 1.4209472981987992, "ref_ce_loss": 0.15410996973514557, "step": 4260 }, { "epoch": 1.4209472981987992, "loss": 0.5958114862442017, "step": 4260 }, { "ce_loss": 0.22531208395957947, "epoch": 1.4209472981987992, "step": 4260 }, { "distill_loss": 0.10374058037996292, "epoch": 1.4209472981987992, "step": 4260 }, { "epoch": 1.4209472981987992, "ref_ce_loss": 0.16780097782611847, "step": 4260 }, { "epoch": 1.4242828552368245, "loss": 0.591, "step": 4270 }, { "epoch": 1.4242828552368245, "grad_norm": 2.1468281745910645, "step": 4270 }, { "epoch": 1.4242828552368245, "learning_rate": 0.0002830910948290343, "step": 4270 }, { "epoch": 1.4242828552368245, "loss": 0.4912702739238739, "step": 4270 }, { "ce_loss": 0.15433534979820251, "epoch": 1.4242828552368245, "step": 4270 }, { "distill_loss": 0.09245024621486664, "epoch": 1.4242828552368245, "step": 4270 }, { "epoch": 1.4242828552368245, "ref_ce_loss": 0.10943099856376648, "step": 4270 }, { "epoch": 1.4242828552368245, "loss": 0.5627715587615967, "step": 4270 }, { "ce_loss": 0.20639285445213318, "epoch": 1.4242828552368245, "step": 4270 }, { "distill_loss": 0.10644049197435379, "epoch": 1.4242828552368245, "step": 4270 }, { "epoch": 1.4242828552368245, "ref_ce_loss": 0.12591655552387238, "step": 4270 }, { "epoch": 1.4242828552368245, "loss": 0.43795809149742126, "step": 4270 }, { "ce_loss": 0.21394628286361694, "epoch": 1.4242828552368245, "step": 4270 }, { "distill_loss": 0.10047397017478943, "epoch": 1.4242828552368245, "step": 4270 }, { "epoch": 1.4242828552368245, "ref_ce_loss": 0.12295599281787872, "step": 4270 }, { "epoch": 1.4242828552368245, "loss": 0.5432189702987671, "step": 4270 }, { "ce_loss": 0.16253937780857086, "epoch": 1.4242828552368245, "step": 4270 }, { "distill_loss": 0.11227938532829285, "epoch": 1.4242828552368245, "step": 4270 }, { "epoch": 1.4242828552368245, "ref_ce_loss": 0.14385241270065308, "step": 4270 }, { "epoch": 1.4276184122748499, "loss": 0.9214, "step": 4280 }, { "epoch": 1.4276184122748499, "grad_norm": 4.369274616241455, "step": 4280 }, { "epoch": 1.4276184122748499, "learning_rate": 0.00028299754350827333, "step": 4280 }, { "epoch": 1.4276184122748499, "loss": 1.351196527481079, "step": 4280 }, { "ce_loss": 0.18473805487155914, "epoch": 1.4276184122748499, "step": 4280 }, { "distill_loss": 0.4738437831401825, "epoch": 1.4276184122748499, "step": 4280 }, { "epoch": 1.4276184122748499, "ref_ce_loss": 0.16030266880989075, "step": 4280 }, { "epoch": 1.4276184122748499, "loss": 1.2438302040100098, "step": 4280 }, { "ce_loss": 0.26313990354537964, "epoch": 1.4276184122748499, "step": 4280 }, { "distill_loss": 0.6745567321777344, "epoch": 1.4276184122748499, "step": 4280 }, { "epoch": 1.4276184122748499, "ref_ce_loss": 0.19730761647224426, "step": 4280 }, { "epoch": 1.4276184122748499, "loss": 2.1819138526916504, "step": 4280 }, { "ce_loss": 0.30457887053489685, "epoch": 1.4276184122748499, "step": 4280 }, { "distill_loss": 0.563275933265686, "epoch": 1.4276184122748499, "step": 4280 }, { "epoch": 1.4276184122748499, "ref_ce_loss": 0.16263462603092194, "step": 4280 }, { "epoch": 1.4276184122748499, "loss": 0.6994156837463379, "step": 4280 }, { "ce_loss": 0.14273491501808167, "epoch": 1.4276184122748499, "step": 4280 }, { "distill_loss": 0.3719584345817566, "epoch": 1.4276184122748499, "step": 4280 }, { "epoch": 1.4276184122748499, "ref_ce_loss": 0.13406719267368317, "step": 4280 }, { "epoch": 1.4309539693128752, "loss": 0.9138, "step": 4290 }, { "epoch": 1.4309539693128752, "grad_norm": 3.0709683895111084, "step": 4290 }, { "epoch": 1.4309539693128752, "learning_rate": 0.00028290374965245625, "step": 4290 }, { "epoch": 1.4309539693128752, "loss": 0.8565329909324646, "step": 4290 }, { "ce_loss": 0.26970455050468445, "epoch": 1.4309539693128752, "step": 4290 }, { "distill_loss": 0.3865727484226227, "epoch": 1.4309539693128752, "step": 4290 }, { "epoch": 1.4309539693128752, "ref_ce_loss": 0.15481248497962952, "step": 4290 }, { "epoch": 1.4309539693128752, "loss": 0.8524794578552246, "step": 4290 }, { "ce_loss": 0.3532913625240326, "epoch": 1.4309539693128752, "step": 4290 }, { "distill_loss": 0.355815589427948, "epoch": 1.4309539693128752, "step": 4290 }, { "epoch": 1.4309539693128752, "ref_ce_loss": 0.14321820437908173, "step": 4290 }, { "epoch": 1.4309539693128752, "loss": 0.5584589242935181, "step": 4290 }, { "ce_loss": 0.18174344301223755, "epoch": 1.4309539693128752, "step": 4290 }, { "distill_loss": 0.24146713316440582, "epoch": 1.4309539693128752, "step": 4290 }, { "epoch": 1.4309539693128752, "ref_ce_loss": 0.13489829003810883, "step": 4290 }, { "epoch": 1.4309539693128752, "loss": 0.9188706874847412, "step": 4290 }, { "ce_loss": 0.2061683088541031, "epoch": 1.4309539693128752, "step": 4290 }, { "distill_loss": 0.24299702048301697, "epoch": 1.4309539693128752, "step": 4290 }, { "epoch": 1.4309539693128752, "ref_ce_loss": 0.1520266830921173, "step": 4290 }, { "epoch": 1.4342895263509006, "loss": 0.7849, "step": 4300 }, { "epoch": 1.4342895263509006, "grad_norm": 3.8154234886169434, "step": 4300 }, { "epoch": 1.4342895263509006, "learning_rate": 0.0002828097134326261, "step": 4300 }, { "epoch": 1.4342895263509006, "loss": 0.8924685716629028, "step": 4300 }, { "ce_loss": 0.22914117574691772, "epoch": 1.4342895263509006, "step": 4300 }, { "distill_loss": 0.32076436281204224, "epoch": 1.4342895263509006, "step": 4300 }, { "epoch": 1.4342895263509006, "ref_ce_loss": 0.2145729959011078, "step": 4300 }, { "epoch": 1.4342895263509006, "loss": 0.9288495779037476, "step": 4300 }, { "ce_loss": 0.28972870111465454, "epoch": 1.4342895263509006, "step": 4300 }, { "distill_loss": 0.4693968892097473, "epoch": 1.4342895263509006, "step": 4300 }, { "epoch": 1.4342895263509006, "ref_ce_loss": 0.11189575493335724, "step": 4300 }, { "epoch": 1.4342895263509006, "loss": 0.7833391427993774, "step": 4300 }, { "ce_loss": 0.21527405083179474, "epoch": 1.4342895263509006, "step": 4300 }, { "distill_loss": 0.2836889922618866, "epoch": 1.4342895263509006, "step": 4300 }, { "epoch": 1.4342895263509006, "ref_ce_loss": 0.1465625911951065, "step": 4300 }, { "epoch": 1.4342895263509006, "loss": 0.5358696579933167, "step": 4300 }, { "ce_loss": 0.1487964242696762, "epoch": 1.4342895263509006, "step": 4300 }, { "distill_loss": 0.2489507794380188, "epoch": 1.4342895263509006, "step": 4300 }, { "epoch": 1.4342895263509006, "ref_ce_loss": 0.09456692636013031, "step": 4300 }, { "epoch": 1.437625083388926, "loss": 0.8125, "step": 4310 }, { "epoch": 1.437625083388926, "grad_norm": 3.7421693801879883, "step": 4310 }, { "epoch": 1.437625083388926, "learning_rate": 0.0002827154350202678, "step": 4310 }, { "epoch": 1.437625083388926, "loss": 0.7215092778205872, "step": 4310 }, { "ce_loss": 0.19476166367530823, "epoch": 1.437625083388926, "step": 4310 }, { "distill_loss": 0.2617197632789612, "epoch": 1.437625083388926, "step": 4310 }, { "epoch": 1.437625083388926, "ref_ce_loss": 0.1460052728652954, "step": 4310 }, { "epoch": 1.437625083388926, "loss": 0.44765257835388184, "step": 4310 }, { "ce_loss": 0.0965539962053299, "epoch": 1.437625083388926, "step": 4310 }, { "distill_loss": 0.24878545105457306, "epoch": 1.437625083388926, "step": 4310 }, { "epoch": 1.437625083388926, "ref_ce_loss": 0.1019732877612114, "step": 4310 }, { "epoch": 1.437625083388926, "loss": 0.609957754611969, "step": 4310 }, { "ce_loss": 0.18919765949249268, "epoch": 1.437625083388926, "step": 4310 }, { "distill_loss": 0.27180278301239014, "epoch": 1.437625083388926, "step": 4310 }, { "epoch": 1.437625083388926, "ref_ce_loss": 0.11347243934869766, "step": 4310 }, { "epoch": 1.437625083388926, "loss": 0.624266505241394, "step": 4310 }, { "ce_loss": 0.2113901674747467, "epoch": 1.437625083388926, "step": 4310 }, { "distill_loss": 0.22982464730739594, "epoch": 1.437625083388926, "step": 4310 }, { "epoch": 1.437625083388926, "ref_ce_loss": 0.11574660986661911, "step": 4310 }, { "epoch": 1.4409606404269513, "loss": 0.7445, "step": 4320 }, { "epoch": 1.4409606404269513, "grad_norm": 3.571347713470459, "step": 4320 }, { "epoch": 1.4409606404269513, "learning_rate": 0.000282620914587308, "step": 4320 }, { "epoch": 1.4409606404269513, "loss": 0.48458024859428406, "step": 4320 }, { "ce_loss": 0.1750793755054474, "epoch": 1.4409606404269513, "step": 4320 }, { "distill_loss": 0.19086973369121552, "epoch": 1.4409606404269513, "step": 4320 }, { "epoch": 1.4409606404269513, "ref_ce_loss": 0.11848335713148117, "step": 4320 }, { "epoch": 1.4409606404269513, "loss": 0.6750709414482117, "step": 4320 }, { "ce_loss": 0.23150256276130676, "epoch": 1.4409606404269513, "step": 4320 }, { "distill_loss": 0.22192597389221191, "epoch": 1.4409606404269513, "step": 4320 }, { "epoch": 1.4409606404269513, "ref_ce_loss": 0.1723523885011673, "step": 4320 }, { "epoch": 1.4409606404269513, "loss": 0.7323600053787231, "step": 4320 }, { "ce_loss": 0.3054851293563843, "epoch": 1.4409606404269513, "step": 4320 }, { "distill_loss": 0.23578448593616486, "epoch": 1.4409606404269513, "step": 4320 }, { "epoch": 1.4409606404269513, "ref_ce_loss": 0.19094038009643555, "step": 4320 }, { "epoch": 1.4409606404269513, "loss": 0.7869250178337097, "step": 4320 }, { "ce_loss": 0.27868741750717163, "epoch": 1.4409606404269513, "step": 4320 }, { "distill_loss": 0.2345450520515442, "epoch": 1.4409606404269513, "step": 4320 }, { "epoch": 1.4409606404269513, "ref_ce_loss": 0.18001769483089447, "step": 4320 }, { "epoch": 1.4442961974649766, "loss": 0.6926, "step": 4330 }, { "epoch": 1.4442961974649766, "grad_norm": 2.8783340454101562, "step": 4330 }, { "epoch": 1.4442961974649766, "learning_rate": 0.0002825261523061146, "step": 4330 }, { "epoch": 1.4442961974649766, "loss": 1.5428657531738281, "step": 4330 }, { "ce_loss": 0.20685748755931854, "epoch": 1.4442961974649766, "step": 4330 }, { "distill_loss": 0.15463702380657196, "epoch": 1.4442961974649766, "step": 4330 }, { "epoch": 1.4442961974649766, "ref_ce_loss": 0.14646972715854645, "step": 4330 }, { "epoch": 1.4442961974649766, "loss": 1.6755146980285645, "step": 4330 }, { "ce_loss": 0.1802901327610016, "epoch": 1.4442961974649766, "step": 4330 }, { "distill_loss": 0.1890738308429718, "epoch": 1.4442961974649766, "step": 4330 }, { "epoch": 1.4442961974649766, "ref_ce_loss": 0.12323080003261566, "step": 4330 }, { "epoch": 1.4442961974649766, "loss": 0.6056669354438782, "step": 4330 }, { "ce_loss": 0.13862594962120056, "epoch": 1.4442961974649766, "step": 4330 }, { "distill_loss": 0.15863680839538574, "epoch": 1.4442961974649766, "step": 4330 }, { "epoch": 1.4442961974649766, "ref_ce_loss": 0.20575878024101257, "step": 4330 }, { "epoch": 1.4442961974649766, "loss": 0.531682014465332, "step": 4330 }, { "ce_loss": 0.20502084493637085, "epoch": 1.4442961974649766, "step": 4330 }, { "distill_loss": 0.17332421243190765, "epoch": 1.4442961974649766, "step": 4330 }, { "epoch": 1.4442961974649766, "ref_ce_loss": 0.1205807700753212, "step": 4330 }, { "epoch": 1.447631754503002, "loss": 0.6714, "step": 4340 }, { "epoch": 1.447631754503002, "grad_norm": 3.1532366275787354, "step": 4340 }, { "epoch": 1.447631754503002, "learning_rate": 0.00028243114834949673, "step": 4340 }, { "epoch": 1.447631754503002, "loss": 0.8360211849212646, "step": 4340 }, { "ce_loss": 0.3114469051361084, "epoch": 1.447631754503002, "step": 4340 }, { "distill_loss": 0.14694617688655853, "epoch": 1.447631754503002, "step": 4340 }, { "epoch": 1.447631754503002, "ref_ce_loss": 0.15568295121192932, "step": 4340 }, { "epoch": 1.447631754503002, "loss": 0.6691051721572876, "step": 4340 }, { "ce_loss": 0.3203543722629547, "epoch": 1.447631754503002, "step": 4340 }, { "distill_loss": 0.13674457371234894, "epoch": 1.447631754503002, "step": 4340 }, { "epoch": 1.447631754503002, "ref_ce_loss": 0.15188376605510712, "step": 4340 }, { "epoch": 1.447631754503002, "loss": 0.6616179347038269, "step": 4340 }, { "ce_loss": 0.26779434084892273, "epoch": 1.447631754503002, "step": 4340 }, { "distill_loss": 0.11278953403234482, "epoch": 1.447631754503002, "step": 4340 }, { "epoch": 1.447631754503002, "ref_ce_loss": 0.21746957302093506, "step": 4340 }, { "epoch": 1.447631754503002, "loss": 0.48690155148506165, "step": 4340 }, { "ce_loss": 0.1967121809720993, "epoch": 1.447631754503002, "step": 4340 }, { "distill_loss": 0.10658518970012665, "epoch": 1.447631754503002, "step": 4340 }, { "epoch": 1.447631754503002, "ref_ce_loss": 0.12141291052103043, "step": 4340 }, { "epoch": 1.4509673115410273, "loss": 0.6241, "step": 4350 }, { "epoch": 1.4509673115410273, "grad_norm": 2.640625476837158, "step": 4350 }, { "epoch": 1.4509673115410273, "learning_rate": 0.0002823359028907041, "step": 4350 }, { "epoch": 1.4509673115410273, "loss": 0.7299114465713501, "step": 4350 }, { "ce_loss": 0.22083674371242523, "epoch": 1.4509673115410273, "step": 4350 }, { "distill_loss": 0.19083622097969055, "epoch": 1.4509673115410273, "step": 4350 }, { "epoch": 1.4509673115410273, "ref_ce_loss": 0.12957577407360077, "step": 4350 }, { "epoch": 1.4509673115410273, "loss": 0.5755342841148376, "step": 4350 }, { "ce_loss": 0.25721633434295654, "epoch": 1.4509673115410273, "step": 4350 }, { "distill_loss": 0.1757301241159439, "epoch": 1.4509673115410273, "step": 4350 }, { "epoch": 1.4509673115410273, "ref_ce_loss": 0.14243139326572418, "step": 4350 }, { "epoch": 1.4509673115410273, "loss": 0.7849111557006836, "step": 4350 }, { "ce_loss": 0.2349243015050888, "epoch": 1.4509673115410273, "step": 4350 }, { "distill_loss": 0.17022936046123505, "epoch": 1.4509673115410273, "step": 4350 }, { "epoch": 1.4509673115410273, "ref_ce_loss": 0.15478086471557617, "step": 4350 }, { "epoch": 1.4509673115410273, "loss": 0.49518242478370667, "step": 4350 }, { "ce_loss": 0.12407956272363663, "epoch": 1.4509673115410273, "step": 4350 }, { "distill_loss": 0.18435871601104736, "epoch": 1.4509673115410273, "step": 4350 }, { "epoch": 1.4509673115410273, "ref_ce_loss": 0.08474914729595184, "step": 4350 }, { "epoch": 1.4543028685790527, "loss": 0.6729, "step": 4360 }, { "epoch": 1.4543028685790527, "grad_norm": 3.134436845779419, "step": 4360 }, { "epoch": 1.4543028685790527, "learning_rate": 0.00028224041610342684, "step": 4360 }, { "epoch": 1.4543028685790527, "loss": 0.6206045746803284, "step": 4360 }, { "ce_loss": 0.19397678971290588, "epoch": 1.4543028685790527, "step": 4360 }, { "distill_loss": 0.17355449497699738, "epoch": 1.4543028685790527, "step": 4360 }, { "epoch": 1.4543028685790527, "ref_ce_loss": 0.1407630443572998, "step": 4360 }, { "epoch": 1.4543028685790527, "loss": 0.5650532245635986, "step": 4360 }, { "ce_loss": 0.1520107090473175, "epoch": 1.4543028685790527, "step": 4360 }, { "distill_loss": 0.20400208234786987, "epoch": 1.4543028685790527, "step": 4360 }, { "epoch": 1.4543028685790527, "ref_ce_loss": 0.12605783343315125, "step": 4360 }, { "epoch": 1.4543028685790527, "loss": 0.7112383246421814, "step": 4360 }, { "ce_loss": 0.2116408795118332, "epoch": 1.4543028685790527, "step": 4360 }, { "distill_loss": 0.253677099943161, "epoch": 1.4543028685790527, "step": 4360 }, { "epoch": 1.4543028685790527, "ref_ce_loss": 0.15583555400371552, "step": 4360 }, { "epoch": 1.4543028685790527, "loss": 0.557046115398407, "step": 4360 }, { "ce_loss": 0.21234719455242157, "epoch": 1.4543028685790527, "step": 4360 }, { "distill_loss": 0.2124667763710022, "epoch": 1.4543028685790527, "step": 4360 }, { "epoch": 1.4543028685790527, "ref_ce_loss": 0.1319587528705597, "step": 4360 }, { "epoch": 1.457638425617078, "loss": 0.7405, "step": 4370 }, { "epoch": 1.457638425617078, "grad_norm": 2.821734666824341, "step": 4370 }, { "epoch": 1.457638425617078, "learning_rate": 0.0002821446881617952, "step": 4370 }, { "epoch": 1.457638425617078, "loss": 0.4984533488750458, "step": 4370 }, { "ce_loss": 0.16452477872371674, "epoch": 1.457638425617078, "step": 4370 }, { "distill_loss": 0.17812155187129974, "epoch": 1.457638425617078, "step": 4370 }, { "epoch": 1.457638425617078, "ref_ce_loss": 0.12596623599529266, "step": 4370 }, { "epoch": 1.457638425617078, "loss": 0.5950712561607361, "step": 4370 }, { "ce_loss": 0.1918494552373886, "epoch": 1.457638425617078, "step": 4370 }, { "distill_loss": 0.24175810813903809, "epoch": 1.457638425617078, "step": 4370 }, { "epoch": 1.457638425617078, "ref_ce_loss": 0.1162458136677742, "step": 4370 }, { "epoch": 1.457638425617078, "loss": 0.5311948657035828, "step": 4370 }, { "ce_loss": 0.1166113018989563, "epoch": 1.457638425617078, "step": 4370 }, { "distill_loss": 0.1816932111978531, "epoch": 1.457638425617078, "step": 4370 }, { "epoch": 1.457638425617078, "ref_ce_loss": 0.117859847843647, "step": 4370 }, { "epoch": 1.457638425617078, "loss": 1.321441650390625, "step": 4370 }, { "ce_loss": 0.1763760894536972, "epoch": 1.457638425617078, "step": 4370 }, { "distill_loss": 0.16680777072906494, "epoch": 1.457638425617078, "step": 4370 }, { "epoch": 1.457638425617078, "ref_ce_loss": 0.1586695909500122, "step": 4370 }, { "epoch": 1.4609739826551034, "loss": 0.6801, "step": 4380 }, { "epoch": 1.4609739826551034, "grad_norm": 2.483330726623535, "step": 4380 }, { "epoch": 1.4609739826551034, "learning_rate": 0.0002820487192403792, "step": 4380 }, { "epoch": 1.4609739826551034, "loss": 0.8458122611045837, "step": 4380 }, { "ce_loss": 0.18738530576229095, "epoch": 1.4609739826551034, "step": 4380 }, { "distill_loss": 0.14879602193832397, "epoch": 1.4609739826551034, "step": 4380 }, { "epoch": 1.4609739826551034, "ref_ce_loss": 0.1313273310661316, "step": 4380 }, { "epoch": 1.4609739826551034, "loss": 0.9360214471817017, "step": 4380 }, { "ce_loss": 0.09423506259918213, "epoch": 1.4609739826551034, "step": 4380 }, { "distill_loss": 0.1366136372089386, "epoch": 1.4609739826551034, "step": 4380 }, { "epoch": 1.4609739826551034, "ref_ce_loss": 0.11418969184160233, "step": 4380 }, { "epoch": 1.4609739826551034, "loss": 0.4507545232772827, "step": 4380 }, { "ce_loss": 0.16262701153755188, "epoch": 1.4609739826551034, "step": 4380 }, { "distill_loss": 0.13440194725990295, "epoch": 1.4609739826551034, "step": 4380 }, { "epoch": 1.4609739826551034, "ref_ce_loss": 0.1535230129957199, "step": 4380 }, { "epoch": 1.4609739826551034, "loss": 0.8710918426513672, "step": 4380 }, { "ce_loss": 0.18411631882190704, "epoch": 1.4609739826551034, "step": 4380 }, { "distill_loss": 0.14756955206394196, "epoch": 1.4609739826551034, "step": 4380 }, { "epoch": 1.4609739826551034, "ref_ce_loss": 0.15467248857021332, "step": 4380 }, { "epoch": 1.4643095396931287, "loss": 0.6259, "step": 4390 }, { "epoch": 1.4643095396931287, "grad_norm": 2.173874855041504, "step": 4390 }, { "epoch": 1.4643095396931287, "learning_rate": 0.0002819525095141883, "step": 4390 }, { "epoch": 1.4643095396931287, "loss": 0.42656779289245605, "step": 4390 }, { "ce_loss": 0.12446770817041397, "epoch": 1.4643095396931287, "step": 4390 }, { "distill_loss": 0.1477583944797516, "epoch": 1.4643095396931287, "step": 4390 }, { "epoch": 1.4643095396931287, "ref_ce_loss": 0.08694154024124146, "step": 4390 }, { "epoch": 1.4643095396931287, "loss": 0.5260968804359436, "step": 4390 }, { "ce_loss": 0.18589136004447937, "epoch": 1.4643095396931287, "step": 4390 }, { "distill_loss": 0.16689875721931458, "epoch": 1.4643095396931287, "step": 4390 }, { "epoch": 1.4643095396931287, "ref_ce_loss": 0.17246730625629425, "step": 4390 }, { "epoch": 1.4643095396931287, "loss": 0.5359268188476562, "step": 4390 }, { "ce_loss": 0.11834832280874252, "epoch": 1.4643095396931287, "step": 4390 }, { "distill_loss": 0.1549883782863617, "epoch": 1.4643095396931287, "step": 4390 }, { "epoch": 1.4643095396931287, "ref_ce_loss": 0.1455782651901245, "step": 4390 }, { "epoch": 1.4643095396931287, "loss": 0.4305720925331116, "step": 4390 }, { "ce_loss": 0.16029563546180725, "epoch": 1.4643095396931287, "step": 4390 }, { "distill_loss": 0.15057703852653503, "epoch": 1.4643095396931287, "step": 4390 }, { "epoch": 1.4643095396931287, "ref_ce_loss": 0.1186305582523346, "step": 4390 }, { "epoch": 1.467645096731154, "loss": 0.6841, "step": 4400 }, { "epoch": 1.467645096731154, "grad_norm": 4.759480953216553, "step": 4400 }, { "epoch": 1.467645096731154, "learning_rate": 0.000281856059158671, "step": 4400 }, { "epoch": 1.467645096731154, "loss": 0.7277843952178955, "step": 4400 }, { "ce_loss": 0.26591551303863525, "epoch": 1.467645096731154, "step": 4400 }, { "distill_loss": 0.1408987045288086, "epoch": 1.467645096731154, "step": 4400 }, { "epoch": 1.467645096731154, "ref_ce_loss": 0.15926964581012726, "step": 4400 }, { "epoch": 1.467645096731154, "loss": 0.5168325901031494, "step": 4400 }, { "ce_loss": 0.23409435153007507, "epoch": 1.467645096731154, "step": 4400 }, { "distill_loss": 0.10548853874206543, "epoch": 1.467645096731154, "step": 4400 }, { "epoch": 1.467645096731154, "ref_ce_loss": 0.175302654504776, "step": 4400 }, { "epoch": 1.467645096731154, "loss": 0.7794098854064941, "step": 4400 }, { "ce_loss": 0.19064731895923615, "epoch": 1.467645096731154, "step": 4400 }, { "distill_loss": 0.1546446979045868, "epoch": 1.467645096731154, "step": 4400 }, { "epoch": 1.467645096731154, "ref_ce_loss": 0.1138349249958992, "step": 4400 }, { "epoch": 1.467645096731154, "loss": 0.7018890380859375, "step": 4400 }, { "ce_loss": 0.205464169383049, "epoch": 1.467645096731154, "step": 4400 }, { "distill_loss": 0.14657650887966156, "epoch": 1.467645096731154, "step": 4400 }, { "epoch": 1.467645096731154, "ref_ce_loss": 0.1572255939245224, "step": 4400 }, { "epoch": 1.4709806537691794, "loss": 0.6051, "step": 4410 }, { "epoch": 1.4709806537691794, "grad_norm": 2.3284032344818115, "step": 4410 }, { "epoch": 1.4709806537691794, "learning_rate": 0.0002817593683497148, "step": 4410 }, { "epoch": 1.4709806537691794, "loss": 0.4930034279823303, "step": 4410 }, { "ce_loss": 0.15108412504196167, "epoch": 1.4709806537691794, "step": 4410 }, { "distill_loss": 0.10944245755672455, "epoch": 1.4709806537691794, "step": 4410 }, { "epoch": 1.4709806537691794, "ref_ce_loss": 0.14975978434085846, "step": 4410 }, { "epoch": 1.4709806537691794, "loss": 0.7254599928855896, "step": 4410 }, { "ce_loss": 0.21982063353061676, "epoch": 1.4709806537691794, "step": 4410 }, { "distill_loss": 0.10534757375717163, "epoch": 1.4709806537691794, "step": 4410 }, { "epoch": 1.4709806537691794, "ref_ce_loss": 0.20826774835586548, "step": 4410 }, { "epoch": 1.4709806537691794, "loss": 0.5473403930664062, "step": 4410 }, { "ce_loss": 0.1677827686071396, "epoch": 1.4709806537691794, "step": 4410 }, { "distill_loss": 0.12926937639713287, "epoch": 1.4709806537691794, "step": 4410 }, { "epoch": 1.4709806537691794, "ref_ce_loss": 0.07822735607624054, "step": 4410 }, { "epoch": 1.4709806537691794, "loss": 1.0237935781478882, "step": 4410 }, { "ce_loss": 0.198973149061203, "epoch": 1.4709806537691794, "step": 4410 }, { "distill_loss": 0.12356914579868317, "epoch": 1.4709806537691794, "step": 4410 }, { "epoch": 1.4709806537691794, "ref_ce_loss": 0.1262451410293579, "step": 4410 }, { "epoch": 1.4743162108072048, "loss": 0.6375, "step": 4420 }, { "epoch": 1.4743162108072048, "grad_norm": 5.017458915710449, "step": 4420 }, { "epoch": 1.4743162108072048, "learning_rate": 0.00028166243726364555, "step": 4420 }, { "epoch": 1.4743162108072048, "loss": 0.5311963558197021, "step": 4420 }, { "ce_loss": 0.1883728802204132, "epoch": 1.4743162108072048, "step": 4420 }, { "distill_loss": 0.10420016944408417, "epoch": 1.4743162108072048, "step": 4420 }, { "epoch": 1.4743162108072048, "ref_ce_loss": 0.09754882007837296, "step": 4420 }, { "epoch": 1.4743162108072048, "loss": 0.5212543606758118, "step": 4420 }, { "ce_loss": 0.2502850592136383, "epoch": 1.4743162108072048, "step": 4420 }, { "distill_loss": 0.13868878781795502, "epoch": 1.4743162108072048, "step": 4420 }, { "epoch": 1.4743162108072048, "ref_ce_loss": 0.1322166621685028, "step": 4420 }, { "epoch": 1.4743162108072048, "loss": 0.7725924849510193, "step": 4420 }, { "ce_loss": 0.211654394865036, "epoch": 1.4743162108072048, "step": 4420 }, { "distill_loss": 0.12484170496463776, "epoch": 1.4743162108072048, "step": 4420 }, { "epoch": 1.4743162108072048, "ref_ce_loss": 0.10742399841547012, "step": 4420 }, { "epoch": 1.4743162108072048, "loss": 0.449934184551239, "step": 4420 }, { "ce_loss": 0.21506257355213165, "epoch": 1.4743162108072048, "step": 4420 }, { "distill_loss": 0.11620468646287918, "epoch": 1.4743162108072048, "step": 4420 }, { "epoch": 1.4743162108072048, "ref_ce_loss": 0.09754442423582077, "step": 4420 }, { "epoch": 1.4776517678452301, "loss": 0.6099, "step": 4430 }, { "epoch": 1.4776517678452301, "grad_norm": 2.544800281524658, "step": 4430 }, { "epoch": 1.4776517678452301, "learning_rate": 0.0002815652660772273, "step": 4430 }, { "epoch": 1.4776517678452301, "loss": 0.5477819442749023, "step": 4430 }, { "ce_loss": 0.24007096886634827, "epoch": 1.4776517678452301, "step": 4430 }, { "distill_loss": 0.08831313997507095, "epoch": 1.4776517678452301, "step": 4430 }, { "epoch": 1.4776517678452301, "ref_ce_loss": 0.11866340786218643, "step": 4430 }, { "epoch": 1.4776517678452301, "loss": 0.46992701292037964, "step": 4430 }, { "ce_loss": 0.19611972570419312, "epoch": 1.4776517678452301, "step": 4430 }, { "distill_loss": 0.09721183776855469, "epoch": 1.4776517678452301, "step": 4430 }, { "epoch": 1.4776517678452301, "ref_ce_loss": 0.11860304325819016, "step": 4430 }, { "epoch": 1.4776517678452301, "loss": 0.48709186911582947, "step": 4430 }, { "ce_loss": 0.22293464839458466, "epoch": 1.4776517678452301, "step": 4430 }, { "distill_loss": 0.07582428306341171, "epoch": 1.4776517678452301, "step": 4430 }, { "epoch": 1.4776517678452301, "ref_ce_loss": 0.11847430467605591, "step": 4430 }, { "epoch": 1.4776517678452301, "loss": 0.9923147559165955, "step": 4430 }, { "ce_loss": 0.3206259310245514, "epoch": 1.4776517678452301, "step": 4430 }, { "distill_loss": 0.10310949385166168, "epoch": 1.4776517678452301, "step": 4430 }, { "epoch": 1.4776517678452301, "ref_ce_loss": 0.25434812903404236, "step": 4430 }, { "epoch": 1.4809873248832555, "loss": 0.6815, "step": 4440 }, { "epoch": 1.4809873248832555, "grad_norm": 3.2926695346832275, "step": 4440 }, { "epoch": 1.4809873248832555, "learning_rate": 0.000281467854967662, "step": 4440 }, { "epoch": 1.4809873248832555, "loss": 0.7626287937164307, "step": 4440 }, { "ce_loss": 0.09362614899873734, "epoch": 1.4809873248832555, "step": 4440 }, { "distill_loss": 0.06907413899898529, "epoch": 1.4809873248832555, "step": 4440 }, { "epoch": 1.4809873248832555, "ref_ce_loss": 0.0958833321928978, "step": 4440 }, { "epoch": 1.4809873248832555, "loss": 0.49843913316726685, "step": 4440 }, { "ce_loss": 0.2558717429637909, "epoch": 1.4809873248832555, "step": 4440 }, { "distill_loss": 0.09227704256772995, "epoch": 1.4809873248832555, "step": 4440 }, { "epoch": 1.4809873248832555, "ref_ce_loss": 0.15015363693237305, "step": 4440 }, { "epoch": 1.4809873248832555, "loss": 0.6889381408691406, "step": 4440 }, { "ce_loss": 0.18226727843284607, "epoch": 1.4809873248832555, "step": 4440 }, { "distill_loss": 0.09716913849115372, "epoch": 1.4809873248832555, "step": 4440 }, { "epoch": 1.4809873248832555, "ref_ce_loss": 0.1393125206232071, "step": 4440 }, { "epoch": 1.4809873248832555, "loss": 0.5315539240837097, "step": 4440 }, { "ce_loss": 0.2396669238805771, "epoch": 1.4809873248832555, "step": 4440 }, { "distill_loss": 0.09958633780479431, "epoch": 1.4809873248832555, "step": 4440 }, { "epoch": 1.4809873248832555, "ref_ce_loss": 0.12504830956459045, "step": 4440 }, { "epoch": 1.4843228819212808, "loss": 0.5759, "step": 4450 }, { "epoch": 1.4843228819212808, "grad_norm": 1.7213834524154663, "step": 4450 }, { "epoch": 1.4843228819212808, "learning_rate": 0.0002813702041125891, "step": 4450 }, { "epoch": 1.4843228819212808, "loss": 0.5091803073883057, "step": 4450 }, { "ce_loss": 0.20656217634677887, "epoch": 1.4843228819212808, "step": 4450 }, { "distill_loss": 0.09335896372795105, "epoch": 1.4843228819212808, "step": 4450 }, { "epoch": 1.4843228819212808, "ref_ce_loss": 0.13938568532466888, "step": 4450 }, { "epoch": 1.4843228819212808, "loss": 0.5872694253921509, "step": 4450 }, { "ce_loss": 0.2452130764722824, "epoch": 1.4843228819212808, "step": 4450 }, { "distill_loss": 0.10319434106349945, "epoch": 1.4843228819212808, "step": 4450 }, { "epoch": 1.4843228819212808, "ref_ce_loss": 0.1681223064661026, "step": 4450 }, { "epoch": 1.4843228819212808, "loss": 0.6491241455078125, "step": 4450 }, { "ce_loss": 0.2033555805683136, "epoch": 1.4843228819212808, "step": 4450 }, { "distill_loss": 0.10134235769510269, "epoch": 1.4843228819212808, "step": 4450 }, { "epoch": 1.4843228819212808, "ref_ce_loss": 0.09184518456459045, "step": 4450 }, { "epoch": 1.4843228819212808, "loss": 1.2535475492477417, "step": 4450 }, { "ce_loss": 0.2783401310443878, "epoch": 1.4843228819212808, "step": 4450 }, { "distill_loss": 0.09875951707363129, "epoch": 1.4843228819212808, "step": 4450 }, { "epoch": 1.4843228819212808, "ref_ce_loss": 0.16019049286842346, "step": 4450 }, { "epoch": 1.4876584389593062, "loss": 0.6338, "step": 4460 }, { "epoch": 1.4876584389593062, "grad_norm": 2.5184757709503174, "step": 4460 }, { "epoch": 1.4876584389593062, "learning_rate": 0.00028127231369008525, "step": 4460 }, { "epoch": 1.4876584389593062, "loss": 0.8658751845359802, "step": 4460 }, { "ce_loss": 0.23773770034313202, "epoch": 1.4876584389593062, "step": 4460 }, { "distill_loss": 0.11125155538320541, "epoch": 1.4876584389593062, "step": 4460 }, { "epoch": 1.4876584389593062, "ref_ce_loss": 0.12172891944646835, "step": 4460 }, { "epoch": 1.4876584389593062, "loss": 1.2557917833328247, "step": 4460 }, { "ce_loss": 0.18311569094657898, "epoch": 1.4876584389593062, "step": 4460 }, { "distill_loss": 0.0814177542924881, "epoch": 1.4876584389593062, "step": 4460 }, { "epoch": 1.4876584389593062, "ref_ce_loss": 0.2201717346906662, "step": 4460 }, { "epoch": 1.4876584389593062, "loss": 0.4614761173725128, "step": 4460 }, { "ce_loss": 0.2173597365617752, "epoch": 1.4876584389593062, "step": 4460 }, { "distill_loss": 0.08315160125494003, "epoch": 1.4876584389593062, "step": 4460 }, { "epoch": 1.4876584389593062, "ref_ce_loss": 0.1606503278017044, "step": 4460 }, { "epoch": 1.4876584389593062, "loss": 0.82685387134552, "step": 4460 }, { "ce_loss": 0.33986949920654297, "epoch": 1.4876584389593062, "step": 4460 }, { "distill_loss": 0.12730133533477783, "epoch": 1.4876584389593062, "step": 4460 }, { "epoch": 1.4876584389593062, "ref_ce_loss": 0.1949513703584671, "step": 4460 }, { "epoch": 1.4909939959973315, "loss": 0.6531, "step": 4470 }, { "epoch": 1.4909939959973315, "grad_norm": 4.018879413604736, "step": 4470 }, { "epoch": 1.4909939959973315, "learning_rate": 0.00028117418387866384, "step": 4470 }, { "epoch": 1.4909939959973315, "loss": 0.4994458556175232, "step": 4470 }, { "ce_loss": 0.1709452122449875, "epoch": 1.4909939959973315, "step": 4470 }, { "distill_loss": 0.0967760682106018, "epoch": 1.4909939959973315, "step": 4470 }, { "epoch": 1.4909939959973315, "ref_ce_loss": 0.1585451066493988, "step": 4470 }, { "epoch": 1.4909939959973315, "loss": 0.7291620969772339, "step": 4470 }, { "ce_loss": 0.1918286234140396, "epoch": 1.4909939959973315, "step": 4470 }, { "distill_loss": 0.13846111297607422, "epoch": 1.4909939959973315, "step": 4470 }, { "epoch": 1.4909939959973315, "ref_ce_loss": 0.15093106031417847, "step": 4470 }, { "epoch": 1.4909939959973315, "loss": 0.5087245106697083, "step": 4470 }, { "ce_loss": 0.2672545909881592, "epoch": 1.4909939959973315, "step": 4470 }, { "distill_loss": 0.1381414383649826, "epoch": 1.4909939959973315, "step": 4470 }, { "epoch": 1.4909939959973315, "ref_ce_loss": 0.10308218747377396, "step": 4470 }, { "epoch": 1.4909939959973315, "loss": 0.6515514850616455, "step": 4470 }, { "ce_loss": 0.15585504472255707, "epoch": 1.4909939959973315, "step": 4470 }, { "distill_loss": 0.10814927518367767, "epoch": 1.4909939959973315, "step": 4470 }, { "epoch": 1.4909939959973315, "ref_ce_loss": 0.12409359216690063, "step": 4470 }, { "epoch": 1.4943295530353569, "loss": 0.5677, "step": 4480 }, { "epoch": 1.4943295530353569, "grad_norm": 2.1926746368408203, "step": 4480 }, { "epoch": 1.4943295530353569, "learning_rate": 0.00028107581485727507, "step": 4480 }, { "epoch": 1.4943295530353569, "loss": 0.43809303641319275, "step": 4480 }, { "ce_loss": 0.16458266973495483, "epoch": 1.4943295530353569, "step": 4480 }, { "distill_loss": 0.1329808533191681, "epoch": 1.4943295530353569, "step": 4480 }, { "epoch": 1.4943295530353569, "ref_ce_loss": 0.14041352272033691, "step": 4480 }, { "epoch": 1.4943295530353569, "loss": 0.442251980304718, "step": 4480 }, { "ce_loss": 0.20063066482543945, "epoch": 1.4943295530353569, "step": 4480 }, { "distill_loss": 0.12663978338241577, "epoch": 1.4943295530353569, "step": 4480 }, { "epoch": 1.4943295530353569, "ref_ce_loss": 0.11457744985818863, "step": 4480 }, { "epoch": 1.4943295530353569, "loss": 0.544518768787384, "step": 4480 }, { "ce_loss": 0.20446360111236572, "epoch": 1.4943295530353569, "step": 4480 }, { "distill_loss": 0.12162137031555176, "epoch": 1.4943295530353569, "step": 4480 }, { "epoch": 1.4943295530353569, "ref_ce_loss": 0.15851354598999023, "step": 4480 }, { "epoch": 1.4943295530353569, "loss": 1.210282802581787, "step": 4480 }, { "ce_loss": 0.15788322687149048, "epoch": 1.4943295530353569, "step": 4480 }, { "distill_loss": 0.12018802762031555, "epoch": 1.4943295530353569, "step": 4480 }, { "epoch": 1.4943295530353569, "ref_ce_loss": 0.13062524795532227, "step": 4480 }, { "epoch": 1.4976651100733822, "loss": 0.6311, "step": 4490 }, { "epoch": 1.4976651100733822, "grad_norm": 3.151811122894287, "step": 4490 }, { "epoch": 1.4976651100733822, "learning_rate": 0.0002809772068053052, "step": 4490 }, { "epoch": 1.4976651100733822, "loss": 1.0704823732376099, "step": 4490 }, { "ce_loss": 0.2743171453475952, "epoch": 1.4976651100733822, "step": 4490 }, { "distill_loss": 0.13374269008636475, "epoch": 1.4976651100733822, "step": 4490 }, { "epoch": 1.4976651100733822, "ref_ce_loss": 0.09331765025854111, "step": 4490 }, { "epoch": 1.4976651100733822, "loss": 0.5207926630973816, "step": 4490 }, { "ce_loss": 0.23340195417404175, "epoch": 1.4976651100733822, "step": 4490 }, { "distill_loss": 0.11091122031211853, "epoch": 1.4976651100733822, "step": 4490 }, { "epoch": 1.4976651100733822, "ref_ce_loss": 0.11333482712507248, "step": 4490 }, { "epoch": 1.4976651100733822, "loss": 0.5735177397727966, "step": 4490 }, { "ce_loss": 0.15200015902519226, "epoch": 1.4976651100733822, "step": 4490 }, { "distill_loss": 0.10044776648283005, "epoch": 1.4976651100733822, "step": 4490 }, { "epoch": 1.4976651100733822, "ref_ce_loss": 0.11417778581380844, "step": 4490 }, { "epoch": 1.4976651100733822, "loss": 0.44036537408828735, "step": 4490 }, { "ce_loss": 0.18106389045715332, "epoch": 1.4976651100733822, "step": 4490 }, { "distill_loss": 0.09861503541469574, "epoch": 1.4976651100733822, "step": 4490 }, { "epoch": 1.4976651100733822, "ref_ce_loss": 0.1276373416185379, "step": 4490 }, { "epoch": 1.5010006671114076, "loss": 0.6131, "step": 4500 }, { "epoch": 1.5010006671114076, "grad_norm": 2.3492748737335205, "step": 4500 }, { "epoch": 1.5010006671114076, "learning_rate": 0.0002808783599025764, "step": 4500 }, { "epoch": 1.5010006671114076, "loss": 0.5812758803367615, "step": 4500 }, { "ce_loss": 0.2145005613565445, "epoch": 1.5010006671114076, "step": 4500 }, { "distill_loss": 0.11526884138584137, "epoch": 1.5010006671114076, "step": 4500 }, { "epoch": 1.5010006671114076, "ref_ce_loss": 0.12662525475025177, "step": 4500 }, { "epoch": 1.5010006671114076, "loss": 0.7451175451278687, "step": 4500 }, { "ce_loss": 0.18240326642990112, "epoch": 1.5010006671114076, "step": 4500 }, { "distill_loss": 0.09705304354429245, "epoch": 1.5010006671114076, "step": 4500 }, { "epoch": 1.5010006671114076, "ref_ce_loss": 0.1561345010995865, "step": 4500 }, { "epoch": 1.5010006671114076, "loss": 0.4314347207546234, "step": 4500 }, { "ce_loss": 0.2048446536064148, "epoch": 1.5010006671114076, "step": 4500 }, { "distill_loss": 0.11508011817932129, "epoch": 1.5010006671114076, "step": 4500 }, { "epoch": 1.5010006671114076, "ref_ce_loss": 0.11129327863454819, "step": 4500 }, { "epoch": 1.5010006671114076, "loss": 0.5526909232139587, "step": 4500 }, { "ce_loss": 0.20685704052448273, "epoch": 1.5010006671114076, "step": 4500 }, { "distill_loss": 0.13564878702163696, "epoch": 1.5010006671114076, "step": 4500 }, { "epoch": 1.5010006671114076, "ref_ce_loss": 0.13427762687206268, "step": 4500 }, { "epoch": 1.504336224149433, "loss": 0.5843, "step": 4510 }, { "epoch": 1.504336224149433, "grad_norm": 3.733797788619995, "step": 4510 }, { "epoch": 1.504336224149433, "learning_rate": 0.00028077927432934645, "step": 4510 }, { "epoch": 1.504336224149433, "loss": 0.6804251670837402, "step": 4510 }, { "ce_loss": 0.32606402039527893, "epoch": 1.504336224149433, "step": 4510 }, { "distill_loss": 0.1469072699546814, "epoch": 1.504336224149433, "step": 4510 }, { "epoch": 1.504336224149433, "ref_ce_loss": 0.16901257634162903, "step": 4510 }, { "epoch": 1.504336224149433, "loss": 0.5420999526977539, "step": 4510 }, { "ce_loss": 0.23655718564987183, "epoch": 1.504336224149433, "step": 4510 }, { "distill_loss": 0.09165996313095093, "epoch": 1.504336224149433, "step": 4510 }, { "epoch": 1.504336224149433, "ref_ce_loss": 0.21362152695655823, "step": 4510 }, { "epoch": 1.504336224149433, "loss": 0.35762321949005127, "step": 4510 }, { "ce_loss": 0.15837861597537994, "epoch": 1.504336224149433, "step": 4510 }, { "distill_loss": 0.09633652865886688, "epoch": 1.504336224149433, "step": 4510 }, { "epoch": 1.504336224149433, "ref_ce_loss": 0.1026814803481102, "step": 4510 }, { "epoch": 1.504336224149433, "loss": 0.5618549585342407, "step": 4510 }, { "ce_loss": 0.2075524926185608, "epoch": 1.504336224149433, "step": 4510 }, { "distill_loss": 0.10399427264928818, "epoch": 1.504336224149433, "step": 4510 }, { "epoch": 1.504336224149433, "ref_ce_loss": 0.12820473313331604, "step": 4510 }, { "epoch": 1.5076717811874583, "loss": 0.5818, "step": 4520 }, { "epoch": 1.5076717811874583, "grad_norm": 3.101632595062256, "step": 4520 }, { "epoch": 1.5076717811874583, "learning_rate": 0.0002806799502663083, "step": 4520 }, { "epoch": 1.5076717811874583, "loss": 0.5161787867546082, "step": 4520 }, { "ce_loss": 0.2295653074979782, "epoch": 1.5076717811874583, "step": 4520 }, { "distill_loss": 0.1541547179222107, "epoch": 1.5076717811874583, "step": 4520 }, { "epoch": 1.5076717811874583, "ref_ce_loss": 0.13225367665290833, "step": 4520 }, { "epoch": 1.5076717811874583, "loss": 0.7354413270950317, "step": 4520 }, { "ce_loss": 0.4115031957626343, "epoch": 1.5076717811874583, "step": 4520 }, { "distill_loss": 0.15912646055221558, "epoch": 1.5076717811874583, "step": 4520 }, { "epoch": 1.5076717811874583, "ref_ce_loss": 0.16471584141254425, "step": 4520 }, { "epoch": 1.5076717811874583, "loss": 1.3074798583984375, "step": 4520 }, { "ce_loss": 0.2868303060531616, "epoch": 1.5076717811874583, "step": 4520 }, { "distill_loss": 0.14963319897651672, "epoch": 1.5076717811874583, "step": 4520 }, { "epoch": 1.5076717811874583, "ref_ce_loss": 0.14439374208450317, "step": 4520 }, { "epoch": 1.5076717811874583, "loss": 0.6702625751495361, "step": 4520 }, { "ce_loss": 0.2194564938545227, "epoch": 1.5076717811874583, "step": 4520 }, { "distill_loss": 0.13500796258449554, "epoch": 1.5076717811874583, "step": 4520 }, { "epoch": 1.5076717811874583, "ref_ce_loss": 0.14340227842330933, "step": 4520 }, { "epoch": 1.5110073382254836, "loss": 0.7013, "step": 4530 }, { "epoch": 1.5110073382254836, "grad_norm": 4.905083656311035, "step": 4530 }, { "epoch": 1.5110073382254836, "learning_rate": 0.00028058038789458993, "step": 4530 }, { "epoch": 1.5110073382254836, "loss": 0.6859695315361023, "step": 4530 }, { "ce_loss": 0.19414359331130981, "epoch": 1.5110073382254836, "step": 4530 }, { "distill_loss": 0.12613198161125183, "epoch": 1.5110073382254836, "step": 4530 }, { "epoch": 1.5110073382254836, "ref_ce_loss": 0.11808284372091293, "step": 4530 }, { "epoch": 1.5110073382254836, "loss": 0.5683508515357971, "step": 4530 }, { "ce_loss": 0.237142413854599, "epoch": 1.5110073382254836, "step": 4530 }, { "distill_loss": 0.15945348143577576, "epoch": 1.5110073382254836, "step": 4530 }, { "epoch": 1.5110073382254836, "ref_ce_loss": 0.16993550956249237, "step": 4530 }, { "epoch": 1.5110073382254836, "loss": 0.4896041452884674, "step": 4530 }, { "ce_loss": 0.1853276491165161, "epoch": 1.5110073382254836, "step": 4530 }, { "distill_loss": 0.14374326169490814, "epoch": 1.5110073382254836, "step": 4530 }, { "epoch": 1.5110073382254836, "ref_ce_loss": 0.1603091061115265, "step": 4530 }, { "epoch": 1.5110073382254836, "loss": 0.8239613771438599, "step": 4530 }, { "ce_loss": 0.23428012430667877, "epoch": 1.5110073382254836, "step": 4530 }, { "distill_loss": 0.17898336052894592, "epoch": 1.5110073382254836, "step": 4530 }, { "epoch": 1.5110073382254836, "ref_ce_loss": 0.13119062781333923, "step": 4530 }, { "epoch": 1.514342895263509, "loss": 0.6341, "step": 4540 }, { "epoch": 1.514342895263509, "grad_norm": 2.772738456726074, "step": 4540 }, { "epoch": 1.514342895263509, "learning_rate": 0.0002804805873957538, "step": 4540 }, { "epoch": 1.514342895263509, "loss": 0.5715469717979431, "step": 4540 }, { "ce_loss": 0.18093213438987732, "epoch": 1.514342895263509, "step": 4540 }, { "distill_loss": 0.15032875537872314, "epoch": 1.514342895263509, "step": 4540 }, { "epoch": 1.514342895263509, "ref_ce_loss": 0.15656159818172455, "step": 4540 }, { "epoch": 1.514342895263509, "loss": 0.45982062816619873, "step": 4540 }, { "ce_loss": 0.1117219477891922, "epoch": 1.514342895263509, "step": 4540 }, { "distill_loss": 0.1376478224992752, "epoch": 1.514342895263509, "step": 4540 }, { "epoch": 1.514342895263509, "ref_ce_loss": 0.08686988800764084, "step": 4540 }, { "epoch": 1.514342895263509, "loss": 0.4616468548774719, "step": 4540 }, { "ce_loss": 0.14209826290607452, "epoch": 1.514342895263509, "step": 4540 }, { "distill_loss": 0.13099880516529083, "epoch": 1.514342895263509, "step": 4540 }, { "epoch": 1.514342895263509, "ref_ce_loss": 0.09180080890655518, "step": 4540 }, { "epoch": 1.514342895263509, "loss": 0.6827181577682495, "step": 4540 }, { "ce_loss": 0.24990394711494446, "epoch": 1.514342895263509, "step": 4540 }, { "distill_loss": 0.15088555216789246, "epoch": 1.514342895263509, "step": 4540 }, { "epoch": 1.514342895263509, "ref_ce_loss": 0.09840228408575058, "step": 4540 }, { "epoch": 1.5176784523015343, "loss": 0.6297, "step": 4550 }, { "epoch": 1.5176784523015343, "grad_norm": 2.13785719871521, "step": 4550 }, { "epoch": 1.5176784523015343, "learning_rate": 0.0002803805489517966, "step": 4550 }, { "epoch": 1.5176784523015343, "loss": 0.4930049777030945, "step": 4550 }, { "ce_loss": 0.17963160574436188, "epoch": 1.5176784523015343, "step": 4550 }, { "distill_loss": 0.1052674651145935, "epoch": 1.5176784523015343, "step": 4550 }, { "epoch": 1.5176784523015343, "ref_ce_loss": 0.1195664182305336, "step": 4550 }, { "epoch": 1.5176784523015343, "loss": 0.6021493077278137, "step": 4550 }, { "ce_loss": 0.1968289315700531, "epoch": 1.5176784523015343, "step": 4550 }, { "distill_loss": 0.1056264191865921, "epoch": 1.5176784523015343, "step": 4550 }, { "epoch": 1.5176784523015343, "ref_ce_loss": 0.15359090268611908, "step": 4550 }, { "epoch": 1.5176784523015343, "loss": 0.7040922045707703, "step": 4550 }, { "ce_loss": 0.12671718001365662, "epoch": 1.5176784523015343, "step": 4550 }, { "distill_loss": 0.07407093793153763, "epoch": 1.5176784523015343, "step": 4550 }, { "epoch": 1.5176784523015343, "ref_ce_loss": 0.13461308181285858, "step": 4550 }, { "epoch": 1.5176784523015343, "loss": 0.4749680161476135, "step": 4550 }, { "ce_loss": 0.19816166162490845, "epoch": 1.5176784523015343, "step": 4550 }, { "distill_loss": 0.09519590437412262, "epoch": 1.5176784523015343, "step": 4550 }, { "epoch": 1.5176784523015343, "ref_ce_loss": 0.11878227442502975, "step": 4550 }, { "epoch": 1.5210140093395597, "loss": 0.589, "step": 4560 }, { "epoch": 1.5210140093395597, "grad_norm": 2.2736968994140625, "step": 4560 }, { "epoch": 1.5210140093395597, "learning_rate": 0.0002802802727451491, "step": 4560 }, { "epoch": 1.5210140093395597, "loss": 0.8191660642623901, "step": 4560 }, { "ce_loss": 0.22039499878883362, "epoch": 1.5210140093395597, "step": 4560 }, { "distill_loss": 0.09533695131540298, "epoch": 1.5210140093395597, "step": 4560 }, { "epoch": 1.5210140093395597, "ref_ce_loss": 0.17217586934566498, "step": 4560 }, { "epoch": 1.5210140093395597, "loss": 0.4060024917125702, "step": 4560 }, { "ce_loss": 0.11546523869037628, "epoch": 1.5210140093395597, "step": 4560 }, { "distill_loss": 0.10482259094715118, "epoch": 1.5210140093395597, "step": 4560 }, { "epoch": 1.5210140093395597, "ref_ce_loss": 0.14050264656543732, "step": 4560 }, { "epoch": 1.5210140093395597, "loss": 0.3823060989379883, "step": 4560 }, { "ce_loss": 0.127862811088562, "epoch": 1.5210140093395597, "step": 4560 }, { "distill_loss": 0.12342780083417892, "epoch": 1.5210140093395597, "step": 4560 }, { "epoch": 1.5210140093395597, "ref_ce_loss": 0.09424252063035965, "step": 4560 }, { "epoch": 1.5210140093395597, "loss": 1.0516396760940552, "step": 4560 }, { "ce_loss": 0.30975213646888733, "epoch": 1.5210140093395597, "step": 4560 }, { "distill_loss": 0.13757649064064026, "epoch": 1.5210140093395597, "step": 4560 }, { "epoch": 1.5210140093395597, "ref_ce_loss": 0.19756831228733063, "step": 4560 }, { "epoch": 1.524349566377585, "loss": 0.583, "step": 4570 }, { "epoch": 1.524349566377585, "grad_norm": 2.270359754562378, "step": 4570 }, { "epoch": 1.524349566377585, "learning_rate": 0.0002801797589586755, "step": 4570 }, { "epoch": 1.524349566377585, "loss": 0.4193212687969208, "step": 4570 }, { "ce_loss": 0.14371411502361298, "epoch": 1.524349566377585, "step": 4570 }, { "distill_loss": 0.10037150233983994, "epoch": 1.524349566377585, "step": 4570 }, { "epoch": 1.524349566377585, "ref_ce_loss": 0.11395347863435745, "step": 4570 }, { "epoch": 1.524349566377585, "loss": 0.4830290973186493, "step": 4570 }, { "ce_loss": 0.221247598528862, "epoch": 1.524349566377585, "step": 4570 }, { "distill_loss": 0.0934707373380661, "epoch": 1.524349566377585, "step": 4570 }, { "epoch": 1.524349566377585, "ref_ce_loss": 0.12097950279712677, "step": 4570 }, { "epoch": 1.524349566377585, "loss": 0.5064443349838257, "step": 4570 }, { "ce_loss": 0.16764579713344574, "epoch": 1.524349566377585, "step": 4570 }, { "distill_loss": 0.10355889797210693, "epoch": 1.524349566377585, "step": 4570 }, { "epoch": 1.524349566377585, "ref_ce_loss": 0.1548304706811905, "step": 4570 }, { "epoch": 1.524349566377585, "loss": 0.9362093210220337, "step": 4570 }, { "ce_loss": 0.29452425241470337, "epoch": 1.524349566377585, "step": 4570 }, { "distill_loss": 0.13609839975833893, "epoch": 1.524349566377585, "step": 4570 }, { "epoch": 1.524349566377585, "ref_ce_loss": 0.13039226830005646, "step": 4570 }, { "epoch": 1.5276851234156104, "loss": 0.6303, "step": 4580 }, { "epoch": 1.5276851234156104, "grad_norm": 2.9846997261047363, "step": 4580 }, { "epoch": 1.5276851234156104, "learning_rate": 0.00028007900777567325, "step": 4580 }, { "epoch": 1.5276851234156104, "loss": 1.1162803173065186, "step": 4580 }, { "ce_loss": 0.3441166579723358, "epoch": 1.5276851234156104, "step": 4580 }, { "distill_loss": 0.13471393287181854, "epoch": 1.5276851234156104, "step": 4580 }, { "epoch": 1.5276851234156104, "ref_ce_loss": 0.14935973286628723, "step": 4580 }, { "epoch": 1.5276851234156104, "loss": 0.4999135136604309, "step": 4580 }, { "ce_loss": 0.19262626767158508, "epoch": 1.5276851234156104, "step": 4580 }, { "distill_loss": 0.13174332678318024, "epoch": 1.5276851234156104, "step": 4580 }, { "epoch": 1.5276851234156104, "ref_ce_loss": 0.11405842751264572, "step": 4580 }, { "epoch": 1.5276851234156104, "loss": 0.5764414072036743, "step": 4580 }, { "ce_loss": 0.27998676896095276, "epoch": 1.5276851234156104, "step": 4580 }, { "distill_loss": 0.1316446214914322, "epoch": 1.5276851234156104, "step": 4580 }, { "epoch": 1.5276851234156104, "ref_ce_loss": 0.16455060243606567, "step": 4580 }, { "epoch": 1.5276851234156104, "loss": 0.822860836982727, "step": 4580 }, { "ce_loss": 0.30782365798950195, "epoch": 1.5276851234156104, "step": 4580 }, { "distill_loss": 0.09736745804548264, "epoch": 1.5276851234156104, "step": 4580 }, { "epoch": 1.5276851234156104, "ref_ce_loss": 0.22478896379470825, "step": 4580 }, { "epoch": 1.5310206804536357, "loss": 0.6253, "step": 4590 }, { "epoch": 1.5310206804536357, "grad_norm": 2.1859419345855713, "step": 4590 }, { "epoch": 1.5310206804536357, "learning_rate": 0.0002799780193798728, "step": 4590 }, { "epoch": 1.5310206804536357, "loss": 0.46541544795036316, "step": 4590 }, { "ce_loss": 0.17891079187393188, "epoch": 1.5310206804536357, "step": 4590 }, { "distill_loss": 0.1077502891421318, "epoch": 1.5310206804536357, "step": 4590 }, { "epoch": 1.5310206804536357, "ref_ce_loss": 0.1313294768333435, "step": 4590 }, { "epoch": 1.5310206804536357, "loss": 0.5507728457450867, "step": 4590 }, { "ce_loss": 0.2049635499715805, "epoch": 1.5310206804536357, "step": 4590 }, { "distill_loss": 0.10741844028234482, "epoch": 1.5310206804536357, "step": 4590 }, { "epoch": 1.5310206804536357, "ref_ce_loss": 0.192083477973938, "step": 4590 }, { "epoch": 1.5310206804536357, "loss": 0.22651554644107819, "step": 4590 }, { "ce_loss": 0.0875018909573555, "epoch": 1.5310206804536357, "step": 4590 }, { "distill_loss": 0.07626447826623917, "epoch": 1.5310206804536357, "step": 4590 }, { "epoch": 1.5310206804536357, "ref_ce_loss": 0.06258339434862137, "step": 4590 }, { "epoch": 1.5310206804536357, "loss": 0.5488446354866028, "step": 4590 }, { "ce_loss": 0.25994938611984253, "epoch": 1.5310206804536357, "step": 4590 }, { "distill_loss": 0.114890918135643, "epoch": 1.5310206804536357, "step": 4590 }, { "epoch": 1.5310206804536357, "ref_ce_loss": 0.17376874387264252, "step": 4590 }, { "epoch": 1.534356237491661, "loss": 0.702, "step": 4600 }, { "epoch": 1.534356237491661, "grad_norm": 5.405256271362305, "step": 4600 }, { "epoch": 1.534356237491661, "learning_rate": 0.0002798767939554372, "step": 4600 }, { "epoch": 1.534356237491661, "loss": 0.5550327897071838, "step": 4600 }, { "ce_loss": 0.22164778411388397, "epoch": 1.534356237491661, "step": 4600 }, { "distill_loss": 0.10468171536922455, "epoch": 1.534356237491661, "step": 4600 }, { "epoch": 1.534356237491661, "ref_ce_loss": 0.14506924152374268, "step": 4600 }, { "epoch": 1.534356237491661, "loss": 0.38334205746650696, "step": 4600 }, { "ce_loss": 0.1728275716304779, "epoch": 1.534356237491661, "step": 4600 }, { "distill_loss": 0.09280980378389359, "epoch": 1.534356237491661, "step": 4600 }, { "epoch": 1.534356237491661, "ref_ce_loss": 0.1168849989771843, "step": 4600 }, { "epoch": 1.534356237491661, "loss": 0.43203508853912354, "step": 4600 }, { "ce_loss": 0.1975594013929367, "epoch": 1.534356237491661, "step": 4600 }, { "distill_loss": 0.08630432188510895, "epoch": 1.534356237491661, "step": 4600 }, { "epoch": 1.534356237491661, "ref_ce_loss": 0.11872578412294388, "step": 4600 }, { "epoch": 1.534356237491661, "loss": 0.5808179378509521, "step": 4600 }, { "ce_loss": 0.3010261654853821, "epoch": 1.534356237491661, "step": 4600 }, { "distill_loss": 0.10343575477600098, "epoch": 1.534356237491661, "step": 4600 }, { "epoch": 1.534356237491661, "ref_ce_loss": 0.1339024305343628, "step": 4600 }, { "epoch": 1.5376917945296864, "loss": 0.5234, "step": 4610 }, { "epoch": 1.5376917945296864, "grad_norm": 2.9711384773254395, "step": 4610 }, { "epoch": 1.5376917945296864, "learning_rate": 0.00027977533168696154, "step": 4610 }, { "epoch": 1.5376917945296864, "loss": 0.45412778854370117, "step": 4610 }, { "ce_loss": 0.17941448092460632, "epoch": 1.5376917945296864, "step": 4610 }, { "distill_loss": 0.1014786809682846, "epoch": 1.5376917945296864, "step": 4610 }, { "epoch": 1.5376917945296864, "ref_ce_loss": 0.10625439882278442, "step": 4610 }, { "epoch": 1.5376917945296864, "loss": 0.4874230623245239, "step": 4610 }, { "ce_loss": 0.12424776703119278, "epoch": 1.5376917945296864, "step": 4610 }, { "distill_loss": 0.0906376838684082, "epoch": 1.5376917945296864, "step": 4610 }, { "epoch": 1.5376917945296864, "ref_ce_loss": 0.12060170620679855, "step": 4610 }, { "epoch": 1.5376917945296864, "loss": 0.4538404643535614, "step": 4610 }, { "ce_loss": 0.16458889842033386, "epoch": 1.5376917945296864, "step": 4610 }, { "distill_loss": 0.09361595660448074, "epoch": 1.5376917945296864, "step": 4610 }, { "epoch": 1.5376917945296864, "ref_ce_loss": 0.13653051853179932, "step": 4610 }, { "epoch": 1.5376917945296864, "loss": 0.681818962097168, "step": 4610 }, { "ce_loss": 0.1615886092185974, "epoch": 1.5376917945296864, "step": 4610 }, { "distill_loss": 0.1266528218984604, "epoch": 1.5376917945296864, "step": 4610 }, { "epoch": 1.5376917945296864, "ref_ce_loss": 0.12056848406791687, "step": 4610 }, { "epoch": 1.5410273515677118, "loss": 0.582, "step": 4620 }, { "epoch": 1.5410273515677118, "grad_norm": 2.548248052597046, "step": 4620 }, { "epoch": 1.5410273515677118, "learning_rate": 0.0002796736327594731, "step": 4620 }, { "epoch": 1.5410273515677118, "loss": 0.5229469537734985, "step": 4620 }, { "ce_loss": 0.23538225889205933, "epoch": 1.5410273515677118, "step": 4620 }, { "distill_loss": 0.11060227453708649, "epoch": 1.5410273515677118, "step": 4620 }, { "epoch": 1.5410273515677118, "ref_ce_loss": 0.12148841470479965, "step": 4620 }, { "epoch": 1.5410273515677118, "loss": 0.6585526466369629, "step": 4620 }, { "ce_loss": 0.27497315406799316, "epoch": 1.5410273515677118, "step": 4620 }, { "distill_loss": 0.12979480624198914, "epoch": 1.5410273515677118, "step": 4620 }, { "epoch": 1.5410273515677118, "ref_ce_loss": 0.14562958478927612, "step": 4620 }, { "epoch": 1.5410273515677118, "loss": 0.9131790399551392, "step": 4620 }, { "ce_loss": 0.2843893766403198, "epoch": 1.5410273515677118, "step": 4620 }, { "distill_loss": 0.11908374726772308, "epoch": 1.5410273515677118, "step": 4620 }, { "epoch": 1.5410273515677118, "ref_ce_loss": 0.1165936067700386, "step": 4620 }, { "epoch": 1.5410273515677118, "loss": 0.46176856756210327, "step": 4620 }, { "ce_loss": 0.11503535509109497, "epoch": 1.5410273515677118, "step": 4620 }, { "distill_loss": 0.08880730718374252, "epoch": 1.5410273515677118, "step": 4620 }, { "epoch": 1.5410273515677118, "ref_ce_loss": 0.12023654580116272, "step": 4620 }, { "epoch": 1.544362908605737, "loss": 0.6127, "step": 4630 }, { "epoch": 1.544362908605737, "grad_norm": 2.780367612838745, "step": 4630 }, { "epoch": 1.544362908605737, "learning_rate": 0.00027957169735843066, "step": 4630 }, { "epoch": 1.544362908605737, "loss": 0.544620931148529, "step": 4630 }, { "ce_loss": 0.14425063133239746, "epoch": 1.544362908605737, "step": 4630 }, { "distill_loss": 0.11910367757081985, "epoch": 1.544362908605737, "step": 4630 }, { "epoch": 1.544362908605737, "ref_ce_loss": 0.08964873105287552, "step": 4630 }, { "epoch": 1.544362908605737, "loss": 0.4384223520755768, "step": 4630 }, { "ce_loss": 0.2096438705921173, "epoch": 1.544362908605737, "step": 4630 }, { "distill_loss": 0.1158871054649353, "epoch": 1.544362908605737, "step": 4630 }, { "epoch": 1.544362908605737, "ref_ce_loss": 0.11236631125211716, "step": 4630 }, { "epoch": 1.544362908605737, "loss": 0.34739792346954346, "step": 4630 }, { "ce_loss": 0.15089234709739685, "epoch": 1.544362908605737, "step": 4630 }, { "distill_loss": 0.11569201201200485, "epoch": 1.544362908605737, "step": 4630 }, { "epoch": 1.544362908605737, "ref_ce_loss": 0.07834845036268234, "step": 4630 }, { "epoch": 1.544362908605737, "loss": 0.6551713943481445, "step": 4630 }, { "ce_loss": 0.14992360770702362, "epoch": 1.544362908605737, "step": 4630 }, { "distill_loss": 0.10660592466592789, "epoch": 1.544362908605737, "step": 4630 }, { "epoch": 1.544362908605737, "ref_ce_loss": 0.10331352800130844, "step": 4630 }, { "epoch": 1.5476984656437625, "loss": 0.5794, "step": 4640 }, { "epoch": 1.5476984656437625, "grad_norm": 1.8713886737823486, "step": 4640 }, { "epoch": 1.5476984656437625, "learning_rate": 0.00027946952566972397, "step": 4640 }, { "epoch": 1.5476984656437625, "loss": 0.416147381067276, "step": 4640 }, { "ce_loss": 0.17810320854187012, "epoch": 1.5476984656437625, "step": 4640 }, { "distill_loss": 0.07940616458654404, "epoch": 1.5476984656437625, "step": 4640 }, { "epoch": 1.5476984656437625, "ref_ce_loss": 0.15827806293964386, "step": 4640 }, { "epoch": 1.5476984656437625, "loss": 0.4797060191631317, "step": 4640 }, { "ce_loss": 0.16103573143482208, "epoch": 1.5476984656437625, "step": 4640 }, { "distill_loss": 0.10082913935184479, "epoch": 1.5476984656437625, "step": 4640 }, { "epoch": 1.5476984656437625, "ref_ce_loss": 0.15501374006271362, "step": 4640 }, { "epoch": 1.5476984656437625, "loss": 0.5062236189842224, "step": 4640 }, { "ce_loss": 0.2246425896883011, "epoch": 1.5476984656437625, "step": 4640 }, { "distill_loss": 0.11969659477472305, "epoch": 1.5476984656437625, "step": 4640 }, { "epoch": 1.5476984656437625, "ref_ce_loss": 0.12336175888776779, "step": 4640 }, { "epoch": 1.5476984656437625, "loss": 0.3968091905117035, "step": 4640 }, { "ce_loss": 0.17112435400485992, "epoch": 1.5476984656437625, "step": 4640 }, { "distill_loss": 0.10536085069179535, "epoch": 1.5476984656437625, "step": 4640 }, { "epoch": 1.5476984656437625, "ref_ce_loss": 0.12002520263195038, "step": 4640 }, { "epoch": 1.5510340226817878, "loss": 0.5791, "step": 4650 }, { "epoch": 1.5510340226817878, "grad_norm": 4.521932601928711, "step": 4650 }, { "epoch": 1.5510340226817878, "learning_rate": 0.0002793671178796741, "step": 4650 }, { "epoch": 1.5510340226817878, "loss": 0.5670678615570068, "step": 4650 }, { "ce_loss": 0.2741747796535492, "epoch": 1.5510340226817878, "step": 4650 }, { "distill_loss": 0.10975248366594315, "epoch": 1.5510340226817878, "step": 4650 }, { "epoch": 1.5510340226817878, "ref_ce_loss": 0.12581507861614227, "step": 4650 }, { "epoch": 1.5510340226817878, "loss": 0.31926068663597107, "step": 4650 }, { "ce_loss": 0.08233984559774399, "epoch": 1.5510340226817878, "step": 4650 }, { "distill_loss": 0.07135706394910812, "epoch": 1.5510340226817878, "step": 4650 }, { "epoch": 1.5510340226817878, "ref_ce_loss": 0.0867433100938797, "step": 4650 }, { "epoch": 1.5510340226817878, "loss": 0.834353506565094, "step": 4650 }, { "ce_loss": 0.20248062908649445, "epoch": 1.5510340226817878, "step": 4650 }, { "distill_loss": 0.10472960025072098, "epoch": 1.5510340226817878, "step": 4650 }, { "epoch": 1.5510340226817878, "ref_ce_loss": 0.13779492676258087, "step": 4650 }, { "epoch": 1.5510340226817878, "loss": 0.7164924740791321, "step": 4650 }, { "ce_loss": 0.24499934911727905, "epoch": 1.5510340226817878, "step": 4650 }, { "distill_loss": 0.10008464753627777, "epoch": 1.5510340226817878, "step": 4650 }, { "epoch": 1.5510340226817878, "ref_ce_loss": 0.18830597400665283, "step": 4650 }, { "epoch": 1.5543695797198132, "loss": 0.5762, "step": 4660 }, { "epoch": 1.5543695797198132, "grad_norm": 1.9185824394226074, "step": 4660 }, { "epoch": 1.5543695797198132, "learning_rate": 0.0002792644741750324, "step": 4660 }, { "epoch": 1.5543695797198132, "loss": 0.3904956877231598, "step": 4660 }, { "ce_loss": 0.17337539792060852, "epoch": 1.5543695797198132, "step": 4660 }, { "distill_loss": 0.10445573925971985, "epoch": 1.5543695797198132, "step": 4660 }, { "epoch": 1.5543695797198132, "ref_ce_loss": 0.07155634462833405, "step": 4660 }, { "epoch": 1.5543695797198132, "loss": 0.6196936368942261, "step": 4660 }, { "ce_loss": 0.14578856527805328, "epoch": 1.5543695797198132, "step": 4660 }, { "distill_loss": 0.09393522143363953, "epoch": 1.5543695797198132, "step": 4660 }, { "epoch": 1.5543695797198132, "ref_ce_loss": 0.10801822692155838, "step": 4660 }, { "epoch": 1.5543695797198132, "loss": 0.9515413045883179, "step": 4660 }, { "ce_loss": 0.2024691253900528, "epoch": 1.5543695797198132, "step": 4660 }, { "distill_loss": 0.10020405799150467, "epoch": 1.5543695797198132, "step": 4660 }, { "epoch": 1.5543695797198132, "ref_ce_loss": 0.1126125305891037, "step": 4660 }, { "epoch": 1.5543695797198132, "loss": 0.7406129240989685, "step": 4660 }, { "ce_loss": 0.3020225763320923, "epoch": 1.5543695797198132, "step": 4660 }, { "distill_loss": 0.10811947286128998, "epoch": 1.5543695797198132, "step": 4660 }, { "epoch": 1.5543695797198132, "ref_ce_loss": 0.19355078041553497, "step": 4660 }, { "epoch": 1.5577051367578385, "loss": 0.6206, "step": 4670 }, { "epoch": 1.5577051367578385, "grad_norm": 2.085416555404663, "step": 4670 }, { "epoch": 1.5577051367578385, "learning_rate": 0.00027916159474298044, "step": 4670 }, { "epoch": 1.5577051367578385, "loss": 0.5231167078018188, "step": 4670 }, { "ce_loss": 0.14092274010181427, "epoch": 1.5577051367578385, "step": 4670 }, { "distill_loss": 0.0953192263841629, "epoch": 1.5577051367578385, "step": 4670 }, { "epoch": 1.5577051367578385, "ref_ce_loss": 0.13460183143615723, "step": 4670 }, { "epoch": 1.5577051367578385, "loss": 0.6252128481864929, "step": 4670 }, { "ce_loss": 0.15868394076824188, "epoch": 1.5577051367578385, "step": 4670 }, { "distill_loss": 0.0933477133512497, "epoch": 1.5577051367578385, "step": 4670 }, { "epoch": 1.5577051367578385, "ref_ce_loss": 0.15329962968826294, "step": 4670 }, { "epoch": 1.5577051367578385, "loss": 0.6975128650665283, "step": 4670 }, { "ce_loss": 0.21315570175647736, "epoch": 1.5577051367578385, "step": 4670 }, { "distill_loss": 0.10002562403678894, "epoch": 1.5577051367578385, "step": 4670 }, { "epoch": 1.5577051367578385, "ref_ce_loss": 0.1326710730791092, "step": 4670 }, { "epoch": 1.5577051367578385, "loss": 0.3720162510871887, "step": 4670 }, { "ce_loss": 0.15149645507335663, "epoch": 1.5577051367578385, "step": 4670 }, { "distill_loss": 0.08944686502218246, "epoch": 1.5577051367578385, "step": 4670 }, { "epoch": 1.5577051367578385, "ref_ce_loss": 0.12873674929141998, "step": 4670 }, { "epoch": 1.5610406937958639, "loss": 0.59, "step": 4680 }, { "epoch": 1.5610406937958639, "grad_norm": 2.7784948348999023, "step": 4680 }, { "epoch": 1.5610406937958639, "learning_rate": 0.0002790584797711298, "step": 4680 }, { "epoch": 1.5610406937958639, "loss": 0.7926986813545227, "step": 4680 }, { "ce_loss": 0.2878677248954773, "epoch": 1.5610406937958639, "step": 4680 }, { "distill_loss": 0.16262328624725342, "epoch": 1.5610406937958639, "step": 4680 }, { "epoch": 1.5610406937958639, "ref_ce_loss": 0.14896634221076965, "step": 4680 }, { "epoch": 1.5610406937958639, "loss": 0.7145441770553589, "step": 4680 }, { "ce_loss": 0.21132363379001617, "epoch": 1.5610406937958639, "step": 4680 }, { "distill_loss": 0.11646532267332077, "epoch": 1.5610406937958639, "step": 4680 }, { "epoch": 1.5610406937958639, "ref_ce_loss": 0.15072284638881683, "step": 4680 }, { "epoch": 1.5610406937958639, "loss": 0.5834494233131409, "step": 4680 }, { "ce_loss": 0.22991342842578888, "epoch": 1.5610406937958639, "step": 4680 }, { "distill_loss": 0.1131071075797081, "epoch": 1.5610406937958639, "step": 4680 }, { "epoch": 1.5610406937958639, "ref_ce_loss": 0.12597393989562988, "step": 4680 }, { "epoch": 1.5610406937958639, "loss": 0.4927375316619873, "step": 4680 }, { "ce_loss": 0.17881889641284943, "epoch": 1.5610406937958639, "step": 4680 }, { "distill_loss": 0.10333593934774399, "epoch": 1.5610406937958639, "step": 4680 }, { "epoch": 1.5610406937958639, "ref_ce_loss": 0.14724135398864746, "step": 4680 }, { "epoch": 1.5643762508338894, "loss": 0.6419, "step": 4690 }, { "epoch": 1.5643762508338894, "grad_norm": 3.5349230766296387, "step": 4690 }, { "epoch": 1.5643762508338894, "learning_rate": 0.00027895512944752144, "step": 4690 }, { "epoch": 1.5643762508338894, "loss": 0.8685083389282227, "step": 4690 }, { "ce_loss": 0.23852185904979706, "epoch": 1.5643762508338894, "step": 4690 }, { "distill_loss": 0.18311463296413422, "epoch": 1.5643762508338894, "step": 4690 }, { "epoch": 1.5643762508338894, "ref_ce_loss": 0.18137918412685394, "step": 4690 }, { "epoch": 1.5643762508338894, "loss": 0.5344275832176208, "step": 4690 }, { "ce_loss": 0.20123222470283508, "epoch": 1.5643762508338894, "step": 4690 }, { "distill_loss": 0.13716256618499756, "epoch": 1.5643762508338894, "step": 4690 }, { "epoch": 1.5643762508338894, "ref_ce_loss": 0.14230753481388092, "step": 4690 }, { "epoch": 1.5643762508338894, "loss": 0.738693118095398, "step": 4690 }, { "ce_loss": 0.28084349632263184, "epoch": 1.5643762508338894, "step": 4690 }, { "distill_loss": 0.15618476271629333, "epoch": 1.5643762508338894, "step": 4690 }, { "epoch": 1.5643762508338894, "ref_ce_loss": 0.17222802340984344, "step": 4690 }, { "epoch": 1.5643762508338894, "loss": 0.5586490035057068, "step": 4690 }, { "ce_loss": 0.21215829253196716, "epoch": 1.5643762508338894, "step": 4690 }, { "distill_loss": 0.13161195814609528, "epoch": 1.5643762508338894, "step": 4690 }, { "epoch": 1.5643762508338894, "ref_ce_loss": 0.18186885118484497, "step": 4690 }, { "epoch": 1.5677118078719148, "loss": 0.6155, "step": 4700 }, { "epoch": 1.5677118078719148, "grad_norm": 2.473024845123291, "step": 4700 }, { "epoch": 1.5677118078719148, "learning_rate": 0.0002788515439606256, "step": 4700 }, { "epoch": 1.5677118078719148, "loss": 0.37272197008132935, "step": 4700 }, { "ce_loss": 0.1430395096540451, "epoch": 1.5677118078719148, "step": 4700 }, { "distill_loss": 0.1030561700463295, "epoch": 1.5677118078719148, "step": 4700 }, { "epoch": 1.5677118078719148, "ref_ce_loss": 0.09838125109672546, "step": 4700 }, { "epoch": 1.5677118078719148, "loss": 0.6110117435455322, "step": 4700 }, { "ce_loss": 0.23183700442314148, "epoch": 1.5677118078719148, "step": 4700 }, { "distill_loss": 0.13275600969791412, "epoch": 1.5677118078719148, "step": 4700 }, { "epoch": 1.5677118078719148, "ref_ce_loss": 0.16894279420375824, "step": 4700 }, { "epoch": 1.5677118078719148, "loss": 0.42745649814605713, "step": 4700 }, { "ce_loss": 0.16604764759540558, "epoch": 1.5677118078719148, "step": 4700 }, { "distill_loss": 0.09421001374721527, "epoch": 1.5677118078719148, "step": 4700 }, { "epoch": 1.5677118078719148, "ref_ce_loss": 0.10031646490097046, "step": 4700 }, { "epoch": 1.5677118078719148, "loss": 0.6442577838897705, "step": 4700 }, { "ce_loss": 0.2546035349369049, "epoch": 1.5677118078719148, "step": 4700 }, { "distill_loss": 0.14264395833015442, "epoch": 1.5677118078719148, "step": 4700 }, { "epoch": 1.5677118078719148, "ref_ce_loss": 0.17145292460918427, "step": 4700 }, { "epoch": 1.5710473649099401, "loss": 0.6352, "step": 4710 }, { "epoch": 1.5710473649099401, "grad_norm": 2.054325819015503, "step": 4710 }, { "epoch": 1.5710473649099401, "learning_rate": 0.0002787477234993414, "step": 4710 }, { "epoch": 1.5710473649099401, "loss": 0.8072826862335205, "step": 4710 }, { "ce_loss": 0.2330678254365921, "epoch": 1.5710473649099401, "step": 4710 }, { "distill_loss": 0.11896157264709473, "epoch": 1.5710473649099401, "step": 4710 }, { "epoch": 1.5710473649099401, "ref_ce_loss": 0.15135934948921204, "step": 4710 }, { "epoch": 1.5710473649099401, "loss": 0.5528466105461121, "step": 4710 }, { "ce_loss": 0.19632616639137268, "epoch": 1.5710473649099401, "step": 4710 }, { "distill_loss": 0.139164000749588, "epoch": 1.5710473649099401, "step": 4710 }, { "epoch": 1.5710473649099401, "ref_ce_loss": 0.09339972585439682, "step": 4710 }, { "epoch": 1.5710473649099401, "loss": 0.8057548403739929, "step": 4710 }, { "ce_loss": 0.25221431255340576, "epoch": 1.5710473649099401, "step": 4710 }, { "distill_loss": 0.12725681066513062, "epoch": 1.5710473649099401, "step": 4710 }, { "epoch": 1.5710473649099401, "ref_ce_loss": 0.14530764520168304, "step": 4710 }, { "epoch": 1.5710473649099401, "loss": 0.5259864926338196, "step": 4710 }, { "ce_loss": 0.18317881226539612, "epoch": 1.5710473649099401, "step": 4710 }, { "distill_loss": 0.11473749577999115, "epoch": 1.5710473649099401, "step": 4710 }, { "epoch": 1.5710473649099401, "ref_ce_loss": 0.14014258980751038, "step": 4710 }, { "epoch": 1.5743829219479655, "loss": 0.5894, "step": 4720 }, { "epoch": 1.5743829219479655, "grad_norm": 2.167114019393921, "step": 4720 }, { "epoch": 1.5743829219479655, "learning_rate": 0.00027864366825299636, "step": 4720 }, { "epoch": 1.5743829219479655, "loss": 0.48211607336997986, "step": 4720 }, { "ce_loss": 0.19091051816940308, "epoch": 1.5743829219479655, "step": 4720 }, { "distill_loss": 0.1282716691493988, "epoch": 1.5743829219479655, "step": 4720 }, { "epoch": 1.5743829219479655, "ref_ce_loss": 0.09083818644285202, "step": 4720 }, { "epoch": 1.5743829219479655, "loss": 0.47632259130477905, "step": 4720 }, { "ce_loss": 0.23266349732875824, "epoch": 1.5743829219479655, "step": 4720 }, { "distill_loss": 0.10076127201318741, "epoch": 1.5743829219479655, "step": 4720 }, { "epoch": 1.5743829219479655, "ref_ce_loss": 0.1049838587641716, "step": 4720 }, { "epoch": 1.5743829219479655, "loss": 0.542456865310669, "step": 4720 }, { "ce_loss": 0.19646567106246948, "epoch": 1.5743829219479655, "step": 4720 }, { "distill_loss": 0.13279792666435242, "epoch": 1.5743829219479655, "step": 4720 }, { "epoch": 1.5743829219479655, "ref_ce_loss": 0.12299693375825882, "step": 4720 }, { "epoch": 1.5743829219479655, "loss": 0.44261011481285095, "step": 4720 }, { "ce_loss": 0.20179417729377747, "epoch": 1.5743829219479655, "step": 4720 }, { "distill_loss": 0.09362678974866867, "epoch": 1.5743829219479655, "step": 4720 }, { "epoch": 1.5743829219479655, "ref_ce_loss": 0.10843309015035629, "step": 4720 }, { "epoch": 1.5777184789859908, "loss": 0.6023, "step": 4730 }, { "epoch": 1.5777184789859908, "grad_norm": 1.6021579504013062, "step": 4730 }, { "epoch": 1.5777184789859908, "learning_rate": 0.0002785393784113462, "step": 4730 }, { "epoch": 1.5777184789859908, "loss": 0.7584335803985596, "step": 4730 }, { "ce_loss": 0.29199421405792236, "epoch": 1.5777184789859908, "step": 4730 }, { "distill_loss": 0.12377098947763443, "epoch": 1.5777184789859908, "step": 4730 }, { "epoch": 1.5777184789859908, "ref_ce_loss": 0.15207377076148987, "step": 4730 }, { "epoch": 1.5777184789859908, "loss": 0.4357783794403076, "step": 4730 }, { "ce_loss": 0.15077053010463715, "epoch": 1.5777184789859908, "step": 4730 }, { "distill_loss": 0.08500176668167114, "epoch": 1.5777184789859908, "step": 4730 }, { "epoch": 1.5777184789859908, "ref_ce_loss": 0.11844529956579208, "step": 4730 }, { "epoch": 1.5777184789859908, "loss": 0.5543081164360046, "step": 4730 }, { "ce_loss": 0.2360016107559204, "epoch": 1.5777184789859908, "step": 4730 }, { "distill_loss": 0.11848665773868561, "epoch": 1.5777184789859908, "step": 4730 }, { "epoch": 1.5777184789859908, "ref_ce_loss": 0.15166281163692474, "step": 4730 }, { "epoch": 1.5777184789859908, "loss": 0.992057204246521, "step": 4730 }, { "ce_loss": 0.28633835911750793, "epoch": 1.5777184789859908, "step": 4730 }, { "distill_loss": 0.11764726042747498, "epoch": 1.5777184789859908, "step": 4730 }, { "epoch": 1.5777184789859908, "ref_ce_loss": 0.1075211763381958, "step": 4730 }, { "epoch": 1.5810540360240162, "loss": 0.5752, "step": 4740 }, { "epoch": 1.5810540360240162, "grad_norm": 2.260157346725464, "step": 4740 }, { "epoch": 1.5810540360240162, "learning_rate": 0.00027843485416457445, "step": 4740 }, { "epoch": 1.5810540360240162, "loss": 0.42545413970947266, "step": 4740 }, { "ce_loss": 0.1473778933286667, "epoch": 1.5810540360240162, "step": 4740 }, { "distill_loss": 0.09845662862062454, "epoch": 1.5810540360240162, "step": 4740 }, { "epoch": 1.5810540360240162, "ref_ce_loss": 0.09697523713111877, "step": 4740 }, { "epoch": 1.5810540360240162, "loss": 0.7466808557510376, "step": 4740 }, { "ce_loss": 0.15043483674526215, "epoch": 1.5810540360240162, "step": 4740 }, { "distill_loss": 0.15371906757354736, "epoch": 1.5810540360240162, "step": 4740 }, { "epoch": 1.5810540360240162, "ref_ce_loss": 0.11501208692789078, "step": 4740 }, { "epoch": 1.5810540360240162, "loss": 0.5339465141296387, "step": 4740 }, { "ce_loss": 0.22342433035373688, "epoch": 1.5810540360240162, "step": 4740 }, { "distill_loss": 0.12968042492866516, "epoch": 1.5810540360240162, "step": 4740 }, { "epoch": 1.5810540360240162, "ref_ce_loss": 0.12109992653131485, "step": 4740 }, { "epoch": 1.5810540360240162, "loss": 0.4160770773887634, "step": 4740 }, { "ce_loss": 0.12246851623058319, "epoch": 1.5810540360240162, "step": 4740 }, { "distill_loss": 0.10528349876403809, "epoch": 1.5810540360240162, "step": 4740 }, { "epoch": 1.5810540360240162, "ref_ce_loss": 0.13350163400173187, "step": 4740 }, { "epoch": 1.5843895930620415, "loss": 0.5754, "step": 4750 }, { "epoch": 1.5843895930620415, "grad_norm": 2.2725913524627686, "step": 4750 }, { "epoch": 1.5843895930620415, "learning_rate": 0.0002783300957032921, "step": 4750 }, { "epoch": 1.5843895930620415, "loss": 0.8523422479629517, "step": 4750 }, { "ce_loss": 0.20142604410648346, "epoch": 1.5843895930620415, "step": 4750 }, { "distill_loss": 0.12058450281620026, "epoch": 1.5843895930620415, "step": 4750 }, { "epoch": 1.5843895930620415, "ref_ce_loss": 0.10920006781816483, "step": 4750 }, { "epoch": 1.5843895930620415, "loss": 0.8453580737113953, "step": 4750 }, { "ce_loss": 0.3348260819911957, "epoch": 1.5843895930620415, "step": 4750 }, { "distill_loss": 0.16452373564243317, "epoch": 1.5843895930620415, "step": 4750 }, { "epoch": 1.5843895930620415, "ref_ce_loss": 0.17683695256710052, "step": 4750 }, { "epoch": 1.5843895930620415, "loss": 0.7837134003639221, "step": 4750 }, { "ce_loss": 0.2850218415260315, "epoch": 1.5843895930620415, "step": 4750 }, { "distill_loss": 0.1574116200208664, "epoch": 1.5843895930620415, "step": 4750 }, { "epoch": 1.5843895930620415, "ref_ce_loss": 0.15330904722213745, "step": 4750 }, { "epoch": 1.5843895930620415, "loss": 1.0082786083221436, "step": 4750 }, { "ce_loss": 0.22477564215660095, "epoch": 1.5843895930620415, "step": 4750 }, { "distill_loss": 0.12516120076179504, "epoch": 1.5843895930620415, "step": 4750 }, { "epoch": 1.5843895930620415, "ref_ce_loss": 0.17074769735336304, "step": 4750 }, { "epoch": 1.5877251501000669, "loss": 0.711, "step": 4760 }, { "epoch": 1.5877251501000669, "grad_norm": 3.120191812515259, "step": 4760 }, { "epoch": 1.5877251501000669, "learning_rate": 0.00027822510321853734, "step": 4760 }, { "epoch": 1.5877251501000669, "loss": 0.5238044261932373, "step": 4760 }, { "ce_loss": 0.22598464787006378, "epoch": 1.5877251501000669, "step": 4760 }, { "distill_loss": 0.12697666883468628, "epoch": 1.5877251501000669, "step": 4760 }, { "epoch": 1.5877251501000669, "ref_ce_loss": 0.14048749208450317, "step": 4760 }, { "epoch": 1.5877251501000669, "loss": 0.6119301915168762, "step": 4760 }, { "ce_loss": 0.2498970329761505, "epoch": 1.5877251501000669, "step": 4760 }, { "distill_loss": 0.18579566478729248, "epoch": 1.5877251501000669, "step": 4760 }, { "epoch": 1.5877251501000669, "ref_ce_loss": 0.1740613728761673, "step": 4760 }, { "epoch": 1.5877251501000669, "loss": 0.7766237258911133, "step": 4760 }, { "ce_loss": 0.18085479736328125, "epoch": 1.5877251501000669, "step": 4760 }, { "distill_loss": 0.15683233737945557, "epoch": 1.5877251501000669, "step": 4760 }, { "epoch": 1.5877251501000669, "ref_ce_loss": 0.18002113699913025, "step": 4760 }, { "epoch": 1.5877251501000669, "loss": 0.4009416103363037, "step": 4760 }, { "ce_loss": 0.13139718770980835, "epoch": 1.5877251501000669, "step": 4760 }, { "distill_loss": 0.11593674123287201, "epoch": 1.5877251501000669, "step": 4760 }, { "epoch": 1.5877251501000669, "ref_ce_loss": 0.08506946265697479, "step": 4760 }, { "epoch": 1.5910607071380922, "loss": 0.598, "step": 4770 }, { "epoch": 1.5910607071380922, "grad_norm": 1.9799612760543823, "step": 4770 }, { "epoch": 1.5910607071380922, "learning_rate": 0.000278119876901775, "step": 4770 }, { "epoch": 1.5910607071380922, "loss": 0.6736071705818176, "step": 4770 }, { "ce_loss": 0.30645042657852173, "epoch": 1.5910607071380922, "step": 4770 }, { "distill_loss": 0.13452163338661194, "epoch": 1.5910607071380922, "step": 4770 }, { "epoch": 1.5910607071380922, "ref_ce_loss": 0.17019902169704437, "step": 4770 }, { "epoch": 1.5910607071380922, "loss": 1.46865975856781, "step": 4770 }, { "ce_loss": 0.28805285692214966, "epoch": 1.5910607071380922, "step": 4770 }, { "distill_loss": 0.1221141368150711, "epoch": 1.5910607071380922, "step": 4770 }, { "epoch": 1.5910607071380922, "ref_ce_loss": 0.18184491991996765, "step": 4770 }, { "epoch": 1.5910607071380922, "loss": 0.5761913061141968, "step": 4770 }, { "ce_loss": 0.13550378382205963, "epoch": 1.5910607071380922, "step": 4770 }, { "distill_loss": 0.10447970777750015, "epoch": 1.5910607071380922, "step": 4770 }, { "epoch": 1.5910607071380922, "ref_ce_loss": 0.09147114306688309, "step": 4770 }, { "epoch": 1.5910607071380922, "loss": 1.2852704524993896, "step": 4770 }, { "ce_loss": 0.22348161041736603, "epoch": 1.5910607071380922, "step": 4770 }, { "distill_loss": 0.11755059659481049, "epoch": 1.5910607071380922, "step": 4770 }, { "epoch": 1.5910607071380922, "ref_ce_loss": 0.1853521466255188, "step": 4770 }, { "epoch": 1.5943962641761176, "loss": 0.5862, "step": 4780 }, { "epoch": 1.5943962641761176, "grad_norm": 2.4209163188934326, "step": 4780 }, { "epoch": 1.5943962641761176, "learning_rate": 0.0002780144169448963, "step": 4780 }, { "epoch": 1.5943962641761176, "loss": 1.2094604969024658, "step": 4780 }, { "ce_loss": 0.15398405492305756, "epoch": 1.5943962641761176, "step": 4780 }, { "distill_loss": 0.09844505041837692, "epoch": 1.5943962641761176, "step": 4780 }, { "epoch": 1.5943962641761176, "ref_ce_loss": 0.12485992163419724, "step": 4780 }, { "epoch": 1.5943962641761176, "loss": 0.3832671046257019, "step": 4780 }, { "ce_loss": 0.12382929027080536, "epoch": 1.5943962641761176, "step": 4780 }, { "distill_loss": 0.0875585600733757, "epoch": 1.5943962641761176, "step": 4780 }, { "epoch": 1.5943962641761176, "ref_ce_loss": 0.08526071906089783, "step": 4780 }, { "epoch": 1.5943962641761176, "loss": 0.38048553466796875, "step": 4780 }, { "ce_loss": 0.15026162564754486, "epoch": 1.5943962641761176, "step": 4780 }, { "distill_loss": 0.11404670774936676, "epoch": 1.5943962641761176, "step": 4780 }, { "epoch": 1.5943962641761176, "ref_ce_loss": 0.11563827842473984, "step": 4780 }, { "epoch": 1.5943962641761176, "loss": 0.6271405220031738, "step": 4780 }, { "ce_loss": 0.19329474866390228, "epoch": 1.5943962641761176, "step": 4780 }, { "distill_loss": 0.09335696697235107, "epoch": 1.5943962641761176, "step": 4780 }, { "epoch": 1.5943962641761176, "ref_ce_loss": 0.13915462791919708, "step": 4780 }, { "epoch": 1.597731821214143, "loss": 0.6104, "step": 4790 }, { "epoch": 1.597731821214143, "grad_norm": 2.573589324951172, "step": 4790 }, { "epoch": 1.597731821214143, "learning_rate": 0.0002779087235402187, "step": 4790 }, { "epoch": 1.597731821214143, "loss": 0.740349531173706, "step": 4790 }, { "ce_loss": 0.14136841893196106, "epoch": 1.597731821214143, "step": 4790 }, { "distill_loss": 0.08817031234502792, "epoch": 1.597731821214143, "step": 4790 }, { "epoch": 1.597731821214143, "ref_ce_loss": 0.1261877566576004, "step": 4790 }, { "epoch": 1.597731821214143, "loss": 0.4377024173736572, "step": 4790 }, { "ce_loss": 0.15925750136375427, "epoch": 1.597731821214143, "step": 4790 }, { "distill_loss": 0.08435290306806564, "epoch": 1.597731821214143, "step": 4790 }, { "epoch": 1.597731821214143, "ref_ce_loss": 0.09518688917160034, "step": 4790 }, { "epoch": 1.597731821214143, "loss": 1.1542946100234985, "step": 4790 }, { "ce_loss": 0.2103382796049118, "epoch": 1.597731821214143, "step": 4790 }, { "distill_loss": 0.08120667934417725, "epoch": 1.597731821214143, "step": 4790 }, { "epoch": 1.597731821214143, "ref_ce_loss": 0.17455676198005676, "step": 4790 }, { "epoch": 1.597731821214143, "loss": 0.5343722105026245, "step": 4790 }, { "ce_loss": 0.18151548504829407, "epoch": 1.597731821214143, "step": 4790 }, { "distill_loss": 0.07225097715854645, "epoch": 1.597731821214143, "step": 4790 }, { "epoch": 1.597731821214143, "ref_ce_loss": 0.14874359965324402, "step": 4790 }, { "epoch": 1.6010673782521683, "loss": 0.6157, "step": 4800 }, { "epoch": 1.6010673782521683, "grad_norm": 2.3918330669403076, "step": 4800 }, { "epoch": 1.6010673782521683, "learning_rate": 0.00027780279688048516, "step": 4800 }, { "epoch": 1.6010673782521683, "loss": 0.426487535238266, "step": 4800 }, { "ce_loss": 0.15047642588615417, "epoch": 1.6010673782521683, "step": 4800 }, { "distill_loss": 0.10461893677711487, "epoch": 1.6010673782521683, "step": 4800 }, { "epoch": 1.6010673782521683, "ref_ce_loss": 0.17090778052806854, "step": 4800 }, { "epoch": 1.6010673782521683, "loss": 0.5501457452774048, "step": 4800 }, { "ce_loss": 0.216351717710495, "epoch": 1.6010673782521683, "step": 4800 }, { "distill_loss": 0.12492936104536057, "epoch": 1.6010673782521683, "step": 4800 }, { "epoch": 1.6010673782521683, "ref_ce_loss": 0.15076109766960144, "step": 4800 }, { "epoch": 1.6010673782521683, "loss": 0.43778330087661743, "step": 4800 }, { "ce_loss": 0.20027485489845276, "epoch": 1.6010673782521683, "step": 4800 }, { "distill_loss": 0.10449983924627304, "epoch": 1.6010673782521683, "step": 4800 }, { "epoch": 1.6010673782521683, "ref_ce_loss": 0.13276301324367523, "step": 4800 }, { "epoch": 1.6010673782521683, "loss": 0.3523600697517395, "step": 4800 }, { "ce_loss": 0.16868212819099426, "epoch": 1.6010673782521683, "step": 4800 }, { "distill_loss": 0.08839337527751923, "epoch": 1.6010673782521683, "step": 4800 }, { "epoch": 1.6010673782521683, "ref_ce_loss": 0.0947473868727684, "step": 4800 }, { "epoch": 1.6044029352901936, "loss": 0.5668, "step": 4810 }, { "epoch": 1.6044029352901936, "grad_norm": 1.779632806777954, "step": 4810 }, { "epoch": 1.6044029352901936, "learning_rate": 0.00027769663715886426, "step": 4810 }, { "epoch": 1.6044029352901936, "loss": 0.5785982608795166, "step": 4810 }, { "ce_loss": 0.20843467116355896, "epoch": 1.6044029352901936, "step": 4810 }, { "distill_loss": 0.08618040382862091, "epoch": 1.6044029352901936, "step": 4810 }, { "epoch": 1.6044029352901936, "ref_ce_loss": 0.13870251178741455, "step": 4810 }, { "epoch": 1.6044029352901936, "loss": 0.4519733190536499, "step": 4810 }, { "ce_loss": 0.09943678975105286, "epoch": 1.6044029352901936, "step": 4810 }, { "distill_loss": 0.08621320873498917, "epoch": 1.6044029352901936, "step": 4810 }, { "epoch": 1.6044029352901936, "ref_ce_loss": 0.12685640156269073, "step": 4810 }, { "epoch": 1.6044029352901936, "loss": 0.4492816627025604, "step": 4810 }, { "ce_loss": 0.19865265488624573, "epoch": 1.6044029352901936, "step": 4810 }, { "distill_loss": 0.11306588351726532, "epoch": 1.6044029352901936, "step": 4810 }, { "epoch": 1.6044029352901936, "ref_ce_loss": 0.13746190071105957, "step": 4810 }, { "epoch": 1.6044029352901936, "loss": 0.8414527177810669, "step": 4810 }, { "ce_loss": 0.29073143005371094, "epoch": 1.6044029352901936, "step": 4810 }, { "distill_loss": 0.11862733960151672, "epoch": 1.6044029352901936, "step": 4810 }, { "epoch": 1.6044029352901936, "ref_ce_loss": 0.1853315234184265, "step": 4810 }, { "epoch": 1.607738492328219, "loss": 0.6134, "step": 4820 }, { "epoch": 1.607738492328219, "grad_norm": 2.555032968521118, "step": 4820 }, { "epoch": 1.607738492328219, "learning_rate": 0.0002775902445689494, "step": 4820 }, { "epoch": 1.607738492328219, "loss": 0.45475345849990845, "step": 4820 }, { "ce_loss": 0.21073417365550995, "epoch": 1.607738492328219, "step": 4820 }, { "distill_loss": 0.10298866778612137, "epoch": 1.607738492328219, "step": 4820 }, { "epoch": 1.607738492328219, "ref_ce_loss": 0.0919463261961937, "step": 4820 }, { "epoch": 1.607738492328219, "loss": 0.7756646275520325, "step": 4820 }, { "ce_loss": 0.3087059557437897, "epoch": 1.607738492328219, "step": 4820 }, { "distill_loss": 0.12526912987232208, "epoch": 1.607738492328219, "step": 4820 }, { "epoch": 1.607738492328219, "ref_ce_loss": 0.13909755647182465, "step": 4820 }, { "epoch": 1.607738492328219, "loss": 0.48539334535598755, "step": 4820 }, { "ce_loss": 0.17870061099529266, "epoch": 1.607738492328219, "step": 4820 }, { "distill_loss": 0.0823015347123146, "epoch": 1.607738492328219, "step": 4820 }, { "epoch": 1.607738492328219, "ref_ce_loss": 0.11708899587392807, "step": 4820 }, { "epoch": 1.607738492328219, "loss": 0.45261070132255554, "step": 4820 }, { "ce_loss": 0.21282659471035004, "epoch": 1.607738492328219, "step": 4820 }, { "distill_loss": 0.10187379270792007, "epoch": 1.607738492328219, "step": 4820 }, { "epoch": 1.607738492328219, "ref_ce_loss": 0.08549916744232178, "step": 4820 }, { "epoch": 1.6110740493662443, "loss": 0.641, "step": 4830 }, { "epoch": 1.6110740493662443, "grad_norm": 4.425067901611328, "step": 4830 }, { "epoch": 1.6110740493662443, "learning_rate": 0.0002774836193047587, "step": 4830 }, { "epoch": 1.6110740493662443, "loss": 0.4936881363391876, "step": 4830 }, { "ce_loss": 0.1533152163028717, "epoch": 1.6110740493662443, "step": 4830 }, { "distill_loss": 0.19593192636966705, "epoch": 1.6110740493662443, "step": 4830 }, { "epoch": 1.6110740493662443, "ref_ce_loss": 0.0907311663031578, "step": 4830 }, { "epoch": 1.6110740493662443, "loss": 0.6278508305549622, "step": 4830 }, { "ce_loss": 0.17450125515460968, "epoch": 1.6110740493662443, "step": 4830 }, { "distill_loss": 0.21686002612113953, "epoch": 1.6110740493662443, "step": 4830 }, { "epoch": 1.6110740493662443, "ref_ce_loss": 0.15096011757850647, "step": 4830 }, { "epoch": 1.6110740493662443, "loss": 0.5692883133888245, "step": 4830 }, { "ce_loss": 0.1969756931066513, "epoch": 1.6110740493662443, "step": 4830 }, { "distill_loss": 0.1700528860092163, "epoch": 1.6110740493662443, "step": 4830 }, { "epoch": 1.6110740493662443, "ref_ce_loss": 0.152689591050148, "step": 4830 }, { "epoch": 1.6110740493662443, "loss": 0.8832573294639587, "step": 4830 }, { "ce_loss": 0.19293345510959625, "epoch": 1.6110740493662443, "step": 4830 }, { "distill_loss": 0.224935844540596, "epoch": 1.6110740493662443, "step": 4830 }, { "epoch": 1.6110740493662443, "ref_ce_loss": 0.17869606614112854, "step": 4830 }, { "epoch": 1.6144096064042697, "loss": 0.7192, "step": 4840 }, { "epoch": 1.6144096064042697, "grad_norm": 5.211795806884766, "step": 4840 }, { "epoch": 1.6144096064042697, "learning_rate": 0.00027737676156073453, "step": 4840 }, { "epoch": 1.6144096064042697, "loss": 0.7288017272949219, "step": 4840 }, { "ce_loss": 0.21490071713924408, "epoch": 1.6144096064042697, "step": 4840 }, { "distill_loss": 0.2942824065685272, "epoch": 1.6144096064042697, "step": 4840 }, { "epoch": 1.6144096064042697, "ref_ce_loss": 0.08734626322984695, "step": 4840 }, { "epoch": 1.6144096064042697, "loss": 0.6859686970710754, "step": 4840 }, { "ce_loss": 0.20365281403064728, "epoch": 1.6144096064042697, "step": 4840 }, { "distill_loss": 0.30579593777656555, "epoch": 1.6144096064042697, "step": 4840 }, { "epoch": 1.6144096064042697, "ref_ce_loss": 0.13223841786384583, "step": 4840 }, { "epoch": 1.6144096064042697, "loss": 1.122478723526001, "step": 4840 }, { "ce_loss": 0.13811272382736206, "epoch": 1.6144096064042697, "step": 4840 }, { "distill_loss": 0.24075543880462646, "epoch": 1.6144096064042697, "step": 4840 }, { "epoch": 1.6144096064042697, "ref_ce_loss": 0.12147624790668488, "step": 4840 }, { "epoch": 1.6144096064042697, "loss": 0.9976313710212708, "step": 4840 }, { "ce_loss": 0.39122068881988525, "epoch": 1.6144096064042697, "step": 4840 }, { "distill_loss": 0.2513085603713989, "epoch": 1.6144096064042697, "step": 4840 }, { "epoch": 1.6144096064042697, "ref_ce_loss": 0.2339351624250412, "step": 4840 }, { "epoch": 1.617745163442295, "loss": 0.6723, "step": 4850 }, { "epoch": 1.617745163442295, "grad_norm": 2.4428341388702393, "step": 4850 }, { "epoch": 1.617745163442295, "learning_rate": 0.00027726967153174337, "step": 4850 }, { "epoch": 1.617745163442295, "loss": 0.8324744701385498, "step": 4850 }, { "ce_loss": 0.2209848016500473, "epoch": 1.617745163442295, "step": 4850 }, { "distill_loss": 0.1588110327720642, "epoch": 1.617745163442295, "step": 4850 }, { "epoch": 1.617745163442295, "ref_ce_loss": 0.13250286877155304, "step": 4850 }, { "epoch": 1.617745163442295, "loss": 0.43706193566322327, "step": 4850 }, { "ce_loss": 0.141639843583107, "epoch": 1.617745163442295, "step": 4850 }, { "distill_loss": 0.15065184235572815, "epoch": 1.617745163442295, "step": 4850 }, { "epoch": 1.617745163442295, "ref_ce_loss": 0.09860837459564209, "step": 4850 }, { "epoch": 1.617745163442295, "loss": 0.49818480014801025, "step": 4850 }, { "ce_loss": 0.08093893527984619, "epoch": 1.617745163442295, "step": 4850 }, { "distill_loss": 0.1619725227355957, "epoch": 1.617745163442295, "step": 4850 }, { "epoch": 1.617745163442295, "ref_ce_loss": 0.09855761379003525, "step": 4850 }, { "epoch": 1.617745163442295, "loss": 0.9106850624084473, "step": 4850 }, { "ce_loss": 0.29924336075782776, "epoch": 1.617745163442295, "step": 4850 }, { "distill_loss": 0.21561862528324127, "epoch": 1.617745163442295, "step": 4850 }, { "epoch": 1.617745163442295, "ref_ce_loss": 0.13731172680854797, "step": 4850 }, { "epoch": 1.6210807204803204, "loss": 0.622, "step": 4860 }, { "epoch": 1.6210807204803204, "grad_norm": 2.667039155960083, "step": 4860 }, { "epoch": 1.6210807204803204, "learning_rate": 0.00027716234941307504, "step": 4860 }, { "epoch": 1.6210807204803204, "loss": 0.4096541404724121, "step": 4860 }, { "ce_loss": 0.12734420597553253, "epoch": 1.6210807204803204, "step": 4860 }, { "distill_loss": 0.10120591521263123, "epoch": 1.6210807204803204, "step": 4860 }, { "epoch": 1.6210807204803204, "ref_ce_loss": 0.11490931361913681, "step": 4860 }, { "epoch": 1.6210807204803204, "loss": 0.4143024682998657, "step": 4860 }, { "ce_loss": 0.14841949939727783, "epoch": 1.6210807204803204, "step": 4860 }, { "distill_loss": 0.10297094285488129, "epoch": 1.6210807204803204, "step": 4860 }, { "epoch": 1.6210807204803204, "ref_ce_loss": 0.11480960249900818, "step": 4860 }, { "epoch": 1.6210807204803204, "loss": 0.7108901739120483, "step": 4860 }, { "ce_loss": 0.2291679084300995, "epoch": 1.6210807204803204, "step": 4860 }, { "distill_loss": 0.1301821917295456, "epoch": 1.6210807204803204, "step": 4860 }, { "epoch": 1.6210807204803204, "ref_ce_loss": 0.1481446474790573, "step": 4860 }, { "epoch": 1.6210807204803204, "loss": 0.44128403067588806, "step": 4860 }, { "ce_loss": 0.1561107039451599, "epoch": 1.6210807204803204, "step": 4860 }, { "distill_loss": 0.10009510815143585, "epoch": 1.6210807204803204, "step": 4860 }, { "epoch": 1.6210807204803204, "ref_ce_loss": 0.11934979259967804, "step": 4860 }, { "epoch": 1.6244162775183457, "loss": 0.5846, "step": 4870 }, { "epoch": 1.6244162775183457, "grad_norm": 2.787006378173828, "step": 4870 }, { "epoch": 1.6244162775183457, "learning_rate": 0.00027705479540044293, "step": 4870 }, { "epoch": 1.6244162775183457, "loss": 0.8736571669578552, "step": 4870 }, { "ce_loss": 0.12208632379770279, "epoch": 1.6244162775183457, "step": 4870 }, { "distill_loss": 0.14207608997821808, "epoch": 1.6244162775183457, "step": 4870 }, { "epoch": 1.6244162775183457, "ref_ce_loss": 0.11297786980867386, "step": 4870 }, { "epoch": 1.6244162775183457, "loss": 0.6192541718482971, "step": 4870 }, { "ce_loss": 0.27191561460494995, "epoch": 1.6244162775183457, "step": 4870 }, { "distill_loss": 0.16424386203289032, "epoch": 1.6244162775183457, "step": 4870 }, { "epoch": 1.6244162775183457, "ref_ce_loss": 0.18219006061553955, "step": 4870 }, { "epoch": 1.6244162775183457, "loss": 0.5637862086296082, "step": 4870 }, { "ce_loss": 0.20224043726921082, "epoch": 1.6244162775183457, "step": 4870 }, { "distill_loss": 0.14408601820468903, "epoch": 1.6244162775183457, "step": 4870 }, { "epoch": 1.6244162775183457, "ref_ce_loss": 0.1527886986732483, "step": 4870 }, { "epoch": 1.6244162775183457, "loss": 0.5461189150810242, "step": 4870 }, { "ce_loss": 0.14812813699245453, "epoch": 1.6244162775183457, "step": 4870 }, { "distill_loss": 0.14465124905109406, "epoch": 1.6244162775183457, "step": 4870 }, { "epoch": 1.6244162775183457, "ref_ce_loss": 0.17705132067203522, "step": 4870 }, { "epoch": 1.627751834556371, "loss": 0.5969, "step": 4880 }, { "epoch": 1.627751834556371, "grad_norm": 2.4038162231445312, "step": 4880 }, { "epoch": 1.627751834556371, "learning_rate": 0.00027694700968998296, "step": 4880 }, { "epoch": 1.627751834556371, "loss": 0.5871032476425171, "step": 4880 }, { "ce_loss": 0.220157653093338, "epoch": 1.627751834556371, "step": 4880 }, { "distill_loss": 0.11889912933111191, "epoch": 1.627751834556371, "step": 4880 }, { "epoch": 1.627751834556371, "ref_ce_loss": 0.16518516838550568, "step": 4880 }, { "epoch": 1.627751834556371, "loss": 0.6563812494277954, "step": 4880 }, { "ce_loss": 0.27671051025390625, "epoch": 1.627751834556371, "step": 4880 }, { "distill_loss": 0.12580226361751556, "epoch": 1.627751834556371, "step": 4880 }, { "epoch": 1.627751834556371, "ref_ce_loss": 0.19856387376785278, "step": 4880 }, { "epoch": 1.627751834556371, "loss": 0.4133327603340149, "step": 4880 }, { "ce_loss": 0.18373562395572662, "epoch": 1.627751834556371, "step": 4880 }, { "distill_loss": 0.10071388632059097, "epoch": 1.627751834556371, "step": 4880 }, { "epoch": 1.627751834556371, "ref_ce_loss": 0.09259993582963943, "step": 4880 }, { "epoch": 1.627751834556371, "loss": 0.5829482078552246, "step": 4880 }, { "ce_loss": 0.17557404935359955, "epoch": 1.627751834556371, "step": 4880 }, { "distill_loss": 0.09654910862445831, "epoch": 1.627751834556371, "step": 4880 }, { "epoch": 1.627751834556371, "ref_ce_loss": 0.17849195003509521, "step": 4880 }, { "epoch": 1.6310873915943964, "loss": 0.584, "step": 4890 }, { "epoch": 1.6310873915943964, "grad_norm": 2.1157753467559814, "step": 4890 }, { "epoch": 1.6310873915943964, "learning_rate": 0.00027683899247825383, "step": 4890 }, { "epoch": 1.6310873915943964, "loss": 0.6509414911270142, "step": 4890 }, { "ce_loss": 0.16850188374519348, "epoch": 1.6310873915943964, "step": 4890 }, { "distill_loss": 0.07165582478046417, "epoch": 1.6310873915943964, "step": 4890 }, { "epoch": 1.6310873915943964, "ref_ce_loss": 0.11415962874889374, "step": 4890 }, { "epoch": 1.6310873915943964, "loss": 1.0927846431732178, "step": 4890 }, { "ce_loss": 0.2822512984275818, "epoch": 1.6310873915943964, "step": 4890 }, { "distill_loss": 0.10931877791881561, "epoch": 1.6310873915943964, "step": 4890 }, { "epoch": 1.6310873915943964, "ref_ce_loss": 0.16365401446819305, "step": 4890 }, { "epoch": 1.6310873915943964, "loss": 0.5998976230621338, "step": 4890 }, { "ce_loss": 0.19696788489818573, "epoch": 1.6310873915943964, "step": 4890 }, { "distill_loss": 0.09859666973352432, "epoch": 1.6310873915943964, "step": 4890 }, { "epoch": 1.6310873915943964, "ref_ce_loss": 0.1334376484155655, "step": 4890 }, { "epoch": 1.6310873915943964, "loss": 0.7270228862762451, "step": 4890 }, { "ce_loss": 0.2925046682357788, "epoch": 1.6310873915943964, "step": 4890 }, { "distill_loss": 0.12335610389709473, "epoch": 1.6310873915943964, "step": 4890 }, { "epoch": 1.6310873915943964, "ref_ce_loss": 0.16185958683490753, "step": 4890 }, { "epoch": 1.6344229486324218, "loss": 0.6318, "step": 4900 }, { "epoch": 1.6344229486324218, "grad_norm": 2.7217421531677246, "step": 4900 }, { "epoch": 1.6344229486324218, "learning_rate": 0.00027673074396223637, "step": 4900 }, { "epoch": 1.6344229486324218, "loss": 0.49891188740730286, "step": 4900 }, { "ce_loss": 0.1908903568983078, "epoch": 1.6344229486324218, "step": 4900 }, { "distill_loss": 0.1197698637843132, "epoch": 1.6344229486324218, "step": 4900 }, { "epoch": 1.6344229486324218, "ref_ce_loss": 0.14133276045322418, "step": 4900 }, { "epoch": 1.6344229486324218, "loss": 0.6270771622657776, "step": 4900 }, { "ce_loss": 0.17307709157466888, "epoch": 1.6344229486324218, "step": 4900 }, { "distill_loss": 0.08771342039108276, "epoch": 1.6344229486324218, "step": 4900 }, { "epoch": 1.6344229486324218, "ref_ce_loss": 0.09575901180505753, "step": 4900 }, { "epoch": 1.6344229486324218, "loss": 0.5917713046073914, "step": 4900 }, { "ce_loss": 0.18899348378181458, "epoch": 1.6344229486324218, "step": 4900 }, { "distill_loss": 0.11513810604810715, "epoch": 1.6344229486324218, "step": 4900 }, { "epoch": 1.6344229486324218, "ref_ce_loss": 0.14573843777179718, "step": 4900 }, { "epoch": 1.6344229486324218, "loss": 0.5202388763427734, "step": 4900 }, { "ce_loss": 0.20376692712306976, "epoch": 1.6344229486324218, "step": 4900 }, { "distill_loss": 0.09992238134145737, "epoch": 1.6344229486324218, "step": 4900 }, { "epoch": 1.6344229486324218, "ref_ce_loss": 0.1302812397480011, "step": 4900 }, { "epoch": 1.6377585056704471, "loss": 0.6124, "step": 4910 }, { "epoch": 1.6377585056704471, "grad_norm": 3.521989345550537, "step": 4910 }, { "epoch": 1.6377585056704471, "learning_rate": 0.00027662226433933305, "step": 4910 }, { "epoch": 1.6377585056704471, "loss": 0.4377168118953705, "step": 4910 }, { "ce_loss": 0.15247930586338043, "epoch": 1.6377585056704471, "step": 4910 }, { "distill_loss": 0.09452302753925323, "epoch": 1.6377585056704471, "step": 4910 }, { "epoch": 1.6377585056704471, "ref_ce_loss": 0.1294000744819641, "step": 4910 }, { "epoch": 1.6377585056704471, "loss": 0.3409244418144226, "step": 4910 }, { "ce_loss": 0.1711265742778778, "epoch": 1.6377585056704471, "step": 4910 }, { "distill_loss": 0.10409481078386307, "epoch": 1.6377585056704471, "step": 4910 }, { "epoch": 1.6377585056704471, "ref_ce_loss": 0.06564094126224518, "step": 4910 }, { "epoch": 1.6377585056704471, "loss": 0.4959399998188019, "step": 4910 }, { "ce_loss": 0.1535956710577011, "epoch": 1.6377585056704471, "step": 4910 }, { "distill_loss": 0.09488362818956375, "epoch": 1.6377585056704471, "step": 4910 }, { "epoch": 1.6377585056704471, "ref_ce_loss": 0.09364673495292664, "step": 4910 }, { "epoch": 1.6377585056704471, "loss": 0.5725511908531189, "step": 4910 }, { "ce_loss": 0.26551344990730286, "epoch": 1.6377585056704471, "step": 4910 }, { "distill_loss": 0.13034331798553467, "epoch": 1.6377585056704471, "step": 4910 }, { "epoch": 1.6377585056704471, "ref_ce_loss": 0.11410856246948242, "step": 4910 }, { "epoch": 1.6410940627084725, "loss": 0.6464, "step": 4920 }, { "epoch": 1.6410940627084725, "grad_norm": 2.946782112121582, "step": 4920 }, { "epoch": 1.6410940627084725, "learning_rate": 0.000276513553807368, "step": 4920 }, { "epoch": 1.6410940627084725, "loss": 0.6583297252655029, "step": 4920 }, { "ce_loss": 0.3206038475036621, "epoch": 1.6410940627084725, "step": 4920 }, { "distill_loss": 0.1186511218547821, "epoch": 1.6410940627084725, "step": 4920 }, { "epoch": 1.6410940627084725, "ref_ce_loss": 0.15720915794372559, "step": 4920 }, { "epoch": 1.6410940627084725, "loss": 0.4410336911678314, "step": 4920 }, { "ce_loss": 0.13522271811962128, "epoch": 1.6410940627084725, "step": 4920 }, { "distill_loss": 0.09478119015693665, "epoch": 1.6410940627084725, "step": 4920 }, { "epoch": 1.6410940627084725, "ref_ce_loss": 0.12998971343040466, "step": 4920 }, { "epoch": 1.6410940627084725, "loss": 0.4015612304210663, "step": 4920 }, { "ce_loss": 0.18265585601329803, "epoch": 1.6410940627084725, "step": 4920 }, { "distill_loss": 0.11469544470310211, "epoch": 1.6410940627084725, "step": 4920 }, { "epoch": 1.6410940627084725, "ref_ce_loss": 0.10402646660804749, "step": 4920 }, { "epoch": 1.6410940627084725, "loss": 0.6775842308998108, "step": 4920 }, { "ce_loss": 0.2640410363674164, "epoch": 1.6410940627084725, "step": 4920 }, { "distill_loss": 0.14958937466144562, "epoch": 1.6410940627084725, "step": 4920 }, { "epoch": 1.6410940627084725, "ref_ce_loss": 0.1849392205476761, "step": 4920 }, { "epoch": 1.6444296197464978, "loss": 0.601, "step": 4930 }, { "epoch": 1.6444296197464978, "grad_norm": 2.743461847305298, "step": 4930 }, { "epoch": 1.6444296197464978, "learning_rate": 0.0002764046125645864, "step": 4930 }, { "epoch": 1.6444296197464978, "loss": 0.39398637413978577, "step": 4930 }, { "ce_loss": 0.11836259067058563, "epoch": 1.6444296197464978, "step": 4930 }, { "distill_loss": 0.07528175413608551, "epoch": 1.6444296197464978, "step": 4930 }, { "epoch": 1.6444296197464978, "ref_ce_loss": 0.09293336421251297, "step": 4930 }, { "epoch": 1.6444296197464978, "loss": 0.5988407135009766, "step": 4930 }, { "ce_loss": 0.25346532464027405, "epoch": 1.6444296197464978, "step": 4930 }, { "distill_loss": 0.11336065828800201, "epoch": 1.6444296197464978, "step": 4930 }, { "epoch": 1.6444296197464978, "ref_ce_loss": 0.1648983359336853, "step": 4930 }, { "epoch": 1.6444296197464978, "loss": 0.4127940237522125, "step": 4930 }, { "ce_loss": 0.12222033739089966, "epoch": 1.6444296197464978, "step": 4930 }, { "distill_loss": 0.11669386178255081, "epoch": 1.6444296197464978, "step": 4930 }, { "epoch": 1.6444296197464978, "ref_ce_loss": 0.10548103600740433, "step": 4930 }, { "epoch": 1.6444296197464978, "loss": 1.0301282405853271, "step": 4930 }, { "ce_loss": 0.2393624484539032, "epoch": 1.6444296197464978, "step": 4930 }, { "distill_loss": 0.1533287614583969, "epoch": 1.6444296197464978, "step": 4930 }, { "epoch": 1.6444296197464978, "ref_ce_loss": 0.15323209762573242, "step": 4930 }, { "epoch": 1.6477651767845232, "loss": 0.6081, "step": 4940 }, { "epoch": 1.6477651767845232, "grad_norm": 2.3398468494415283, "step": 4940 }, { "epoch": 1.6477651767845232, "learning_rate": 0.00027629544080965394, "step": 4940 }, { "epoch": 1.6477651767845232, "loss": 0.41326504945755005, "step": 4940 }, { "ce_loss": 0.15715086460113525, "epoch": 1.6477651767845232, "step": 4940 }, { "distill_loss": 0.12461084872484207, "epoch": 1.6477651767845232, "step": 4940 }, { "epoch": 1.6477651767845232, "ref_ce_loss": 0.08338805288076401, "step": 4940 }, { "epoch": 1.6477651767845232, "loss": 0.8529278039932251, "step": 4940 }, { "ce_loss": 0.244953915476799, "epoch": 1.6477651767845232, "step": 4940 }, { "distill_loss": 0.14145401120185852, "epoch": 1.6477651767845232, "step": 4940 }, { "epoch": 1.6477651767845232, "ref_ce_loss": 0.210275799036026, "step": 4940 }, { "epoch": 1.6477651767845232, "loss": 0.38149306178092957, "step": 4940 }, { "ce_loss": 0.11619073152542114, "epoch": 1.6477651767845232, "step": 4940 }, { "distill_loss": 0.1336188167333603, "epoch": 1.6477651767845232, "step": 4940 }, { "epoch": 1.6477651767845232, "ref_ce_loss": 0.13149616122245789, "step": 4940 }, { "epoch": 1.6477651767845232, "loss": 0.6542447209358215, "step": 4940 }, { "ce_loss": 0.27234119176864624, "epoch": 1.6477651767845232, "step": 4940 }, { "distill_loss": 0.14006751775741577, "epoch": 1.6477651767845232, "step": 4940 }, { "epoch": 1.6477651767845232, "ref_ce_loss": 0.17293211817741394, "step": 4940 }, { "epoch": 1.6511007338225485, "loss": 0.5851, "step": 4950 }, { "epoch": 1.6511007338225485, "grad_norm": 2.6936793327331543, "step": 4950 }, { "epoch": 1.6511007338225485, "learning_rate": 0.000276186038741657, "step": 4950 }, { "epoch": 1.6511007338225485, "loss": 0.5390084981918335, "step": 4950 }, { "ce_loss": 0.1600649505853653, "epoch": 1.6511007338225485, "step": 4950 }, { "distill_loss": 0.2050093561410904, "epoch": 1.6511007338225485, "step": 4950 }, { "epoch": 1.6511007338225485, "ref_ce_loss": 0.09268899261951447, "step": 4950 }, { "epoch": 1.6511007338225485, "loss": 0.5070465803146362, "step": 4950 }, { "ce_loss": 0.183873251080513, "epoch": 1.6511007338225485, "step": 4950 }, { "distill_loss": 0.13934825360774994, "epoch": 1.6511007338225485, "step": 4950 }, { "epoch": 1.6511007338225485, "ref_ce_loss": 0.18370971083641052, "step": 4950 }, { "epoch": 1.6511007338225485, "loss": 0.7284078001976013, "step": 4950 }, { "ce_loss": 0.23376776278018951, "epoch": 1.6511007338225485, "step": 4950 }, { "distill_loss": 0.18618497252464294, "epoch": 1.6511007338225485, "step": 4950 }, { "epoch": 1.6511007338225485, "ref_ce_loss": 0.13847507536411285, "step": 4950 }, { "epoch": 1.6511007338225485, "loss": 0.5451200008392334, "step": 4950 }, { "ce_loss": 0.18877169489860535, "epoch": 1.6511007338225485, "step": 4950 }, { "distill_loss": 0.16713643074035645, "epoch": 1.6511007338225485, "step": 4950 }, { "epoch": 1.6511007338225485, "ref_ce_loss": 0.18870173394680023, "step": 4950 }, { "epoch": 1.6544362908605739, "loss": 0.6234, "step": 4960 }, { "epoch": 1.6544362908605739, "grad_norm": 2.0953621864318848, "step": 4960 }, { "epoch": 1.6544362908605739, "learning_rate": 0.0002760764065601017, "step": 4960 }, { "epoch": 1.6544362908605739, "loss": 0.9226181507110596, "step": 4960 }, { "ce_loss": 0.2521872818470001, "epoch": 1.6544362908605739, "step": 4960 }, { "distill_loss": 0.19906756281852722, "epoch": 1.6544362908605739, "step": 4960 }, { "epoch": 1.6544362908605739, "ref_ce_loss": 0.2884136736392975, "step": 4960 }, { "epoch": 1.6544362908605739, "loss": 0.6516121625900269, "step": 4960 }, { "ce_loss": 0.17211009562015533, "epoch": 1.6544362908605739, "step": 4960 }, { "distill_loss": 0.17549239099025726, "epoch": 1.6544362908605739, "step": 4960 }, { "epoch": 1.6544362908605739, "ref_ce_loss": 0.1282089799642563, "step": 4960 }, { "epoch": 1.6544362908605739, "loss": 0.5280035138130188, "step": 4960 }, { "ce_loss": 0.16100002825260162, "epoch": 1.6544362908605739, "step": 4960 }, { "distill_loss": 0.1495133936405182, "epoch": 1.6544362908605739, "step": 4960 }, { "epoch": 1.6544362908605739, "ref_ce_loss": 0.15446114540100098, "step": 4960 }, { "epoch": 1.6544362908605739, "loss": 1.0270130634307861, "step": 4960 }, { "ce_loss": 0.3464062809944153, "epoch": 1.6544362908605739, "step": 4960 }, { "distill_loss": 0.13512006402015686, "epoch": 1.6544362908605739, "step": 4960 }, { "epoch": 1.6544362908605739, "ref_ce_loss": 0.23012858629226685, "step": 4960 }, { "epoch": 1.6577718478985992, "loss": 0.6433, "step": 4970 }, { "epoch": 1.6577718478985992, "grad_norm": 2.4353840351104736, "step": 4970 }, { "epoch": 1.6577718478985992, "learning_rate": 0.0002759665444649139, "step": 4970 }, { "epoch": 1.6577718478985992, "loss": 0.617945671081543, "step": 4970 }, { "ce_loss": 0.2723182141780853, "epoch": 1.6577718478985992, "step": 4970 }, { "distill_loss": 0.19296707212924957, "epoch": 1.6577718478985992, "step": 4970 }, { "epoch": 1.6577718478985992, "ref_ce_loss": 0.15225380659103394, "step": 4970 }, { "epoch": 1.6577718478985992, "loss": 0.6833885312080383, "step": 4970 }, { "ce_loss": 0.2776934504508972, "epoch": 1.6577718478985992, "step": 4970 }, { "distill_loss": 0.23715607821941376, "epoch": 1.6577718478985992, "step": 4970 }, { "epoch": 1.6577718478985992, "ref_ce_loss": 0.14985275268554688, "step": 4970 }, { "epoch": 1.6577718478985992, "loss": 0.6385636329650879, "step": 4970 }, { "ce_loss": 0.17333070933818817, "epoch": 1.6577718478985992, "step": 4970 }, { "distill_loss": 0.21506594121456146, "epoch": 1.6577718478985992, "step": 4970 }, { "epoch": 1.6577718478985992, "ref_ce_loss": 0.09677013009786606, "step": 4970 }, { "epoch": 1.6577718478985992, "loss": 0.7047004699707031, "step": 4970 }, { "ce_loss": 0.20013362169265747, "epoch": 1.6577718478985992, "step": 4970 }, { "distill_loss": 0.16035297513008118, "epoch": 1.6577718478985992, "step": 4970 }, { "epoch": 1.6577718478985992, "ref_ce_loss": 0.10856600105762482, "step": 4970 }, { "epoch": 1.6611074049366246, "loss": 0.6098, "step": 4980 }, { "epoch": 1.6611074049366246, "grad_norm": 1.8887118101119995, "step": 4980 }, { "epoch": 1.6611074049366246, "learning_rate": 0.00027585645265643875, "step": 4980 }, { "epoch": 1.6611074049366246, "loss": 0.46489736437797546, "step": 4980 }, { "ce_loss": 0.16724559664726257, "epoch": 1.6611074049366246, "step": 4980 }, { "distill_loss": 0.1293192207813263, "epoch": 1.6611074049366246, "step": 4980 }, { "epoch": 1.6611074049366246, "ref_ce_loss": 0.07118848711252213, "step": 4980 }, { "epoch": 1.6611074049366246, "loss": 0.5294301509857178, "step": 4980 }, { "ce_loss": 0.21990546584129333, "epoch": 1.6611074049366246, "step": 4980 }, { "distill_loss": 0.136548712849617, "epoch": 1.6611074049366246, "step": 4980 }, { "epoch": 1.6611074049366246, "ref_ce_loss": 0.11981360614299774, "step": 4980 }, { "epoch": 1.6611074049366246, "loss": 0.55299973487854, "step": 4980 }, { "ce_loss": 0.25989824533462524, "epoch": 1.6611074049366246, "step": 4980 }, { "distill_loss": 0.11929081380367279, "epoch": 1.6611074049366246, "step": 4980 }, { "epoch": 1.6611074049366246, "ref_ce_loss": 0.1736879050731659, "step": 4980 }, { "epoch": 1.6611074049366246, "loss": 0.4317464232444763, "step": 4980 }, { "ce_loss": 0.14439506828784943, "epoch": 1.6611074049366246, "step": 4980 }, { "distill_loss": 0.1170135885477066, "epoch": 1.6611074049366246, "step": 4980 }, { "epoch": 1.6611074049366246, "ref_ce_loss": 0.11630581319332123, "step": 4980 }, { "epoch": 1.66444296197465, "loss": 0.5748, "step": 4990 }, { "epoch": 1.66444296197465, "grad_norm": 3.17610502243042, "step": 4990 }, { "epoch": 1.66444296197465, "learning_rate": 0.0002757461313354403, "step": 4990 }, { "epoch": 1.66444296197465, "loss": 0.6473589539527893, "step": 4990 }, { "ce_loss": 0.2640224099159241, "epoch": 1.66444296197465, "step": 4990 }, { "distill_loss": 0.14431437849998474, "epoch": 1.66444296197465, "step": 4990 }, { "epoch": 1.66444296197465, "ref_ce_loss": 0.17252643406391144, "step": 4990 }, { "epoch": 1.66444296197465, "loss": 0.5207508206367493, "step": 4990 }, { "ce_loss": 0.1483282446861267, "epoch": 1.66444296197465, "step": 4990 }, { "distill_loss": 0.18423455953598022, "epoch": 1.66444296197465, "step": 4990 }, { "epoch": 1.66444296197465, "ref_ce_loss": 0.11322248727083206, "step": 4990 }, { "epoch": 1.66444296197465, "loss": 0.5628471374511719, "step": 4990 }, { "ce_loss": 0.16010361909866333, "epoch": 1.66444296197465, "step": 4990 }, { "distill_loss": 0.16277122497558594, "epoch": 1.66444296197465, "step": 4990 }, { "epoch": 1.66444296197465, "ref_ce_loss": 0.08963080495595932, "step": 4990 }, { "epoch": 1.66444296197465, "loss": 0.3767557740211487, "step": 4990 }, { "ce_loss": 0.10535683482885361, "epoch": 1.66444296197465, "step": 4990 }, { "distill_loss": 0.13673093914985657, "epoch": 1.66444296197465, "step": 4990 }, { "epoch": 1.66444296197465, "ref_ce_loss": 0.0836324393749237, "step": 4990 }, { "epoch": 1.6677785190126753, "loss": 0.6322, "step": 5000 }, { "epoch": 1.6677785190126753, "grad_norm": 1.958633303642273, "step": 5000 }, { "epoch": 1.6677785190126753, "learning_rate": 0.00027563558070310104, "step": 5000 }, { "epoch": 1.6677785190126753, "loss": 0.41848665475845337, "step": 5000 }, { "ce_loss": 0.10958019644021988, "epoch": 1.6677785190126753, "step": 5000 }, { "distill_loss": 0.13504420220851898, "epoch": 1.6677785190126753, "step": 5000 }, { "epoch": 1.6677785190126753, "ref_ce_loss": 0.07605572789907455, "step": 5000 }, { "epoch": 1.6677785190126753, "loss": 0.6914088726043701, "step": 5000 }, { "ce_loss": 0.2166513353586197, "epoch": 1.6677785190126753, "step": 5000 }, { "distill_loss": 0.13223090767860413, "epoch": 1.6677785190126753, "step": 5000 }, { "epoch": 1.6677785190126753, "ref_ce_loss": 0.2379608452320099, "step": 5000 }, { "epoch": 1.6677785190126753, "loss": 0.5647359490394592, "step": 5000 }, { "ce_loss": 0.23841896653175354, "epoch": 1.6677785190126753, "step": 5000 }, { "distill_loss": 0.13560470938682556, "epoch": 1.6677785190126753, "step": 5000 }, { "epoch": 1.6677785190126753, "ref_ce_loss": 0.1557345688343048, "step": 5000 }, { "epoch": 1.6677785190126753, "loss": 0.4025331139564514, "step": 5000 }, { "ce_loss": 0.14620532095432281, "epoch": 1.6677785190126753, "step": 5000 }, { "distill_loss": 0.10365366190671921, "epoch": 1.6677785190126753, "step": 5000 }, { "epoch": 1.6677785190126753, "ref_ce_loss": 0.11881715804338455, "step": 5000 }, { "epoch": 1.6711140760507006, "loss": 0.6504, "step": 5010 }, { "epoch": 1.6711140760507006, "grad_norm": 3.3348193168640137, "step": 5010 }, { "epoch": 1.6711140760507006, "learning_rate": 0.0002755248009610218, "step": 5010 }, { "epoch": 1.6711140760507006, "loss": 0.5066215991973877, "step": 5010 }, { "ce_loss": 0.19570772349834442, "epoch": 1.6711140760507006, "step": 5010 }, { "distill_loss": 0.1310369223356247, "epoch": 1.6711140760507006, "step": 5010 }, { "epoch": 1.6711140760507006, "ref_ce_loss": 0.1483002007007599, "step": 5010 }, { "epoch": 1.6711140760507006, "loss": 0.4725184142589569, "step": 5010 }, { "ce_loss": 0.19623509049415588, "epoch": 1.6711140760507006, "step": 5010 }, { "distill_loss": 0.1314346045255661, "epoch": 1.6711140760507006, "step": 5010 }, { "epoch": 1.6711140760507006, "ref_ce_loss": 0.14413948357105255, "step": 5010 }, { "epoch": 1.6711140760507006, "loss": 0.7104153037071228, "step": 5010 }, { "ce_loss": 0.22572150826454163, "epoch": 1.6711140760507006, "step": 5010 }, { "distill_loss": 0.14044228196144104, "epoch": 1.6711140760507006, "step": 5010 }, { "epoch": 1.6711140760507006, "ref_ce_loss": 0.13732099533081055, "step": 5010 }, { "epoch": 1.6711140760507006, "loss": 0.8014934659004211, "step": 5010 }, { "ce_loss": 0.24018052220344543, "epoch": 1.6711140760507006, "step": 5010 }, { "distill_loss": 0.1529025137424469, "epoch": 1.6711140760507006, "step": 5010 }, { "epoch": 1.6711140760507006, "ref_ce_loss": 0.1667167842388153, "step": 5010 }, { "epoch": 1.674449633088726, "loss": 0.6296, "step": 5020 }, { "epoch": 1.674449633088726, "grad_norm": 2.7721095085144043, "step": 5020 }, { "epoch": 1.674449633088726, "learning_rate": 0.00027541379231122115, "step": 5020 }, { "epoch": 1.674449633088726, "loss": 0.8057844638824463, "step": 5020 }, { "ce_loss": 0.3243635594844818, "epoch": 1.674449633088726, "step": 5020 }, { "distill_loss": 0.1830526441335678, "epoch": 1.674449633088726, "step": 5020 }, { "epoch": 1.674449633088726, "ref_ce_loss": 0.2075968086719513, "step": 5020 }, { "epoch": 1.674449633088726, "loss": 0.7243004441261292, "step": 5020 }, { "ce_loss": 0.16893361508846283, "epoch": 1.674449633088726, "step": 5020 }, { "distill_loss": 0.0983588844537735, "epoch": 1.674449633088726, "step": 5020 }, { "epoch": 1.674449633088726, "ref_ce_loss": 0.09305059164762497, "step": 5020 }, { "epoch": 1.674449633088726, "loss": 0.7263129949569702, "step": 5020 }, { "ce_loss": 0.22422388195991516, "epoch": 1.674449633088726, "step": 5020 }, { "distill_loss": 0.13498666882514954, "epoch": 1.674449633088726, "step": 5020 }, { "epoch": 1.674449633088726, "ref_ce_loss": 0.14109773933887482, "step": 5020 }, { "epoch": 1.674449633088726, "loss": 0.4796724021434784, "step": 5020 }, { "ce_loss": 0.18545092642307281, "epoch": 1.674449633088726, "step": 5020 }, { "distill_loss": 0.12434764206409454, "epoch": 1.674449633088726, "step": 5020 }, { "epoch": 1.674449633088726, "ref_ce_loss": 0.12115143239498138, "step": 5020 }, { "epoch": 1.6777851901267513, "loss": 0.6394, "step": 5030 }, { "epoch": 1.6777851901267513, "grad_norm": 2.989145040512085, "step": 5030 }, { "epoch": 1.6777851901267513, "learning_rate": 0.000275302554956135, "step": 5030 }, { "epoch": 1.6777851901267513, "loss": 0.7933121919631958, "step": 5030 }, { "ce_loss": 0.28813034296035767, "epoch": 1.6777851901267513, "step": 5030 }, { "distill_loss": 0.15757888555526733, "epoch": 1.6777851901267513, "step": 5030 }, { "epoch": 1.6777851901267513, "ref_ce_loss": 0.1353616714477539, "step": 5030 }, { "epoch": 1.6777851901267513, "loss": 0.8034152984619141, "step": 5030 }, { "ce_loss": 0.21584422886371613, "epoch": 1.6777851901267513, "step": 5030 }, { "distill_loss": 0.1322658658027649, "epoch": 1.6777851901267513, "step": 5030 }, { "epoch": 1.6777851901267513, "ref_ce_loss": 0.18953277170658112, "step": 5030 }, { "epoch": 1.6777851901267513, "loss": 1.4683685302734375, "step": 5030 }, { "ce_loss": 0.2784746587276459, "epoch": 1.6777851901267513, "step": 5030 }, { "distill_loss": 0.09764213114976883, "epoch": 1.6777851901267513, "step": 5030 }, { "epoch": 1.6777851901267513, "ref_ce_loss": 0.2288404405117035, "step": 5030 }, { "epoch": 1.6777851901267513, "loss": 0.6212664246559143, "step": 5030 }, { "ce_loss": 0.28214043378829956, "epoch": 1.6777851901267513, "step": 5030 }, { "distill_loss": 0.13105669617652893, "epoch": 1.6777851901267513, "step": 5030 }, { "epoch": 1.6777851901267513, "ref_ce_loss": 0.14725418388843536, "step": 5030 }, { "epoch": 1.6811207471647767, "loss": 0.6674, "step": 5040 }, { "epoch": 1.6811207471647767, "grad_norm": 1.8358932733535767, "step": 5040 }, { "epoch": 1.6811207471647767, "learning_rate": 0.0002751910890986164, "step": 5040 }, { "epoch": 1.6811207471647767, "loss": 0.6445333957672119, "step": 5040 }, { "ce_loss": 0.14290131628513336, "epoch": 1.6811207471647767, "step": 5040 }, { "distill_loss": 0.09842705726623535, "epoch": 1.6811207471647767, "step": 5040 }, { "epoch": 1.6811207471647767, "ref_ce_loss": 0.1507960706949234, "step": 5040 }, { "epoch": 1.6811207471647767, "loss": 0.6434434056282043, "step": 5040 }, { "ce_loss": 0.1379636973142624, "epoch": 1.6811207471647767, "step": 5040 }, { "distill_loss": 0.12902599573135376, "epoch": 1.6811207471647767, "step": 5040 }, { "epoch": 1.6811207471647767, "ref_ce_loss": 0.12259085476398468, "step": 5040 }, { "epoch": 1.6811207471647767, "loss": 0.3519287407398224, "step": 5040 }, { "ce_loss": 0.14540047943592072, "epoch": 1.6811207471647767, "step": 5040 }, { "distill_loss": 0.10962574928998947, "epoch": 1.6811207471647767, "step": 5040 }, { "epoch": 1.6811207471647767, "ref_ce_loss": 0.09651947766542435, "step": 5040 }, { "epoch": 1.6811207471647767, "loss": 0.4088886082172394, "step": 5040 }, { "ce_loss": 0.10746689885854721, "epoch": 1.6811207471647767, "step": 5040 }, { "distill_loss": 0.1186119094491005, "epoch": 1.6811207471647767, "step": 5040 }, { "epoch": 1.6811207471647767, "ref_ce_loss": 0.12964355945587158, "step": 5040 }, { "epoch": 1.684456304202802, "loss": 0.6478, "step": 5050 }, { "epoch": 1.684456304202802, "grad_norm": 2.647010326385498, "step": 5050 }, { "epoch": 1.684456304202802, "learning_rate": 0.0002750793949419351, "step": 5050 }, { "epoch": 1.684456304202802, "loss": 0.4091527462005615, "step": 5050 }, { "ce_loss": 0.12278663367033005, "epoch": 1.684456304202802, "step": 5050 }, { "distill_loss": 0.17358151078224182, "epoch": 1.684456304202802, "step": 5050 }, { "epoch": 1.684456304202802, "ref_ce_loss": 0.1125297099351883, "step": 5050 }, { "epoch": 1.684456304202802, "loss": 0.547415018081665, "step": 5050 }, { "ce_loss": 0.1731032133102417, "epoch": 1.684456304202802, "step": 5050 }, { "distill_loss": 0.13061532378196716, "epoch": 1.684456304202802, "step": 5050 }, { "epoch": 1.684456304202802, "ref_ce_loss": 0.16719424724578857, "step": 5050 }, { "epoch": 1.684456304202802, "loss": 0.6925753951072693, "step": 5050 }, { "ce_loss": 0.28311067819595337, "epoch": 1.684456304202802, "step": 5050 }, { "distill_loss": 0.193869948387146, "epoch": 1.684456304202802, "step": 5050 }, { "epoch": 1.684456304202802, "ref_ce_loss": 0.1512855738401413, "step": 5050 }, { "epoch": 1.684456304202802, "loss": 0.7059001922607422, "step": 5050 }, { "ce_loss": 0.27139025926589966, "epoch": 1.684456304202802, "step": 5050 }, { "distill_loss": 0.16281284391880035, "epoch": 1.684456304202802, "step": 5050 }, { "epoch": 1.684456304202802, "ref_ce_loss": 0.19652172923088074, "step": 5050 }, { "epoch": 1.6877918612408274, "loss": 0.603, "step": 5060 }, { "epoch": 1.6877918612408274, "grad_norm": 2.900559902191162, "step": 5060 }, { "epoch": 1.6877918612408274, "learning_rate": 0.0002749674726897773, "step": 5060 }, { "epoch": 1.6877918612408274, "loss": 0.4165264070034027, "step": 5060 }, { "ce_loss": 0.14308080077171326, "epoch": 1.6877918612408274, "step": 5060 }, { "distill_loss": 0.12662950158119202, "epoch": 1.6877918612408274, "step": 5060 }, { "epoch": 1.6877918612408274, "ref_ce_loss": 0.1465776264667511, "step": 5060 }, { "epoch": 1.6877918612408274, "loss": 0.5354675650596619, "step": 5060 }, { "ce_loss": 0.21277961134910583, "epoch": 1.6877918612408274, "step": 5060 }, { "distill_loss": 0.16139402985572815, "epoch": 1.6877918612408274, "step": 5060 }, { "epoch": 1.6877918612408274, "ref_ce_loss": 0.08568815141916275, "step": 5060 }, { "epoch": 1.6877918612408274, "loss": 0.7034105658531189, "step": 5060 }, { "ce_loss": 0.2749280035495758, "epoch": 1.6877918612408274, "step": 5060 }, { "distill_loss": 0.17789869010448456, "epoch": 1.6877918612408274, "step": 5060 }, { "epoch": 1.6877918612408274, "ref_ce_loss": 0.19476871192455292, "step": 5060 }, { "epoch": 1.6877918612408274, "loss": 0.44613534212112427, "step": 5060 }, { "ce_loss": 0.18843120336532593, "epoch": 1.6877918612408274, "step": 5060 }, { "distill_loss": 0.14277829229831696, "epoch": 1.6877918612408274, "step": 5060 }, { "epoch": 1.6877918612408274, "ref_ce_loss": 0.11453019082546234, "step": 5060 }, { "epoch": 1.6911274182788527, "loss": 0.5778, "step": 5070 }, { "epoch": 1.6911274182788527, "grad_norm": 2.2885119915008545, "step": 5070 }, { "epoch": 1.6911274182788527, "learning_rate": 0.000274855322546245, "step": 5070 }, { "epoch": 1.6911274182788527, "loss": 0.3868217468261719, "step": 5070 }, { "ce_loss": 0.15230241417884827, "epoch": 1.6911274182788527, "step": 5070 }, { "distill_loss": 0.092768594622612, "epoch": 1.6911274182788527, "step": 5070 }, { "epoch": 1.6911274182788527, "ref_ce_loss": 0.07845815271139145, "step": 5070 }, { "epoch": 1.6911274182788527, "loss": 0.7932264804840088, "step": 5070 }, { "ce_loss": 0.2804737091064453, "epoch": 1.6911274182788527, "step": 5070 }, { "distill_loss": 0.12050312012434006, "epoch": 1.6911274182788527, "step": 5070 }, { "epoch": 1.6911274182788527, "ref_ce_loss": 0.20722705125808716, "step": 5070 }, { "epoch": 1.6911274182788527, "loss": 0.8451511263847351, "step": 5070 }, { "ce_loss": 0.246019646525383, "epoch": 1.6911274182788527, "step": 5070 }, { "distill_loss": 0.10395143181085587, "epoch": 1.6911274182788527, "step": 5070 }, { "epoch": 1.6911274182788527, "ref_ce_loss": 0.1522131860256195, "step": 5070 }, { "epoch": 1.6911274182788527, "loss": 0.49361860752105713, "step": 5070 }, { "ce_loss": 0.19995959103107452, "epoch": 1.6911274182788527, "step": 5070 }, { "distill_loss": 0.10739613324403763, "epoch": 1.6911274182788527, "step": 5070 }, { "epoch": 1.6911274182788527, "ref_ce_loss": 0.1293676495552063, "step": 5070 }, { "epoch": 1.694462975316878, "loss": 0.6237, "step": 5080 }, { "epoch": 1.694462975316878, "grad_norm": 3.2725327014923096, "step": 5080 }, { "epoch": 1.694462975316878, "learning_rate": 0.00027474294471585564, "step": 5080 }, { "epoch": 1.694462975316878, "loss": 0.740882933139801, "step": 5080 }, { "ce_loss": 0.27106061577796936, "epoch": 1.694462975316878, "step": 5080 }, { "distill_loss": 0.12209445983171463, "epoch": 1.694462975316878, "step": 5080 }, { "epoch": 1.694462975316878, "ref_ce_loss": 0.1586221605539322, "step": 5080 }, { "epoch": 1.694462975316878, "loss": 0.882435142993927, "step": 5080 }, { "ce_loss": 0.14245213568210602, "epoch": 1.694462975316878, "step": 5080 }, { "distill_loss": 0.10405879467725754, "epoch": 1.694462975316878, "step": 5080 }, { "epoch": 1.694462975316878, "ref_ce_loss": 0.18394401669502258, "step": 5080 }, { "epoch": 1.694462975316878, "loss": 0.4717642664909363, "step": 5080 }, { "ce_loss": 0.21122273802757263, "epoch": 1.694462975316878, "step": 5080 }, { "distill_loss": 0.11571598052978516, "epoch": 1.694462975316878, "step": 5080 }, { "epoch": 1.694462975316878, "ref_ce_loss": 0.10213414579629898, "step": 5080 }, { "epoch": 1.694462975316878, "loss": 0.5567148923873901, "step": 5080 }, { "ce_loss": 0.2327110916376114, "epoch": 1.694462975316878, "step": 5080 }, { "distill_loss": 0.1081765741109848, "epoch": 1.694462975316878, "step": 5080 }, { "epoch": 1.694462975316878, "ref_ce_loss": 0.12438291311264038, "step": 5080 }, { "epoch": 1.6977985323549034, "loss": 0.6664, "step": 5090 }, { "epoch": 1.6977985323549034, "grad_norm": 4.2465643882751465, "step": 5090 }, { "epoch": 1.6977985323549034, "learning_rate": 0.0002746303394035423, "step": 5090 }, { "epoch": 1.6977985323549034, "loss": 0.7007617354393005, "step": 5090 }, { "ce_loss": 0.2409987598657608, "epoch": 1.6977985323549034, "step": 5090 }, { "distill_loss": 0.2312084138393402, "epoch": 1.6977985323549034, "step": 5090 }, { "epoch": 1.6977985323549034, "ref_ce_loss": 0.13980107009410858, "step": 5090 }, { "epoch": 1.6977985323549034, "loss": 0.8144757747650146, "step": 5090 }, { "ce_loss": 0.18151697516441345, "epoch": 1.6977985323549034, "step": 5090 }, { "distill_loss": 0.2494632452726364, "epoch": 1.6977985323549034, "step": 5090 }, { "epoch": 1.6977985323549034, "ref_ce_loss": 0.17133742570877075, "step": 5090 }, { "epoch": 1.6977985323549034, "loss": 0.6633246541023254, "step": 5090 }, { "ce_loss": 0.23428836464881897, "epoch": 1.6977985323549034, "step": 5090 }, { "distill_loss": 0.21696534752845764, "epoch": 1.6977985323549034, "step": 5090 }, { "epoch": 1.6977985323549034, "ref_ce_loss": 0.1617213934659958, "step": 5090 }, { "epoch": 1.6977985323549034, "loss": 1.046887993812561, "step": 5090 }, { "ce_loss": 0.19139772653579712, "epoch": 1.6977985323549034, "step": 5090 }, { "distill_loss": 0.31649070978164673, "epoch": 1.6977985323549034, "step": 5090 }, { "epoch": 1.6977985323549034, "ref_ce_loss": 0.10765596479177475, "step": 5090 }, { "epoch": 1.7011340893929288, "loss": 0.6845, "step": 5100 }, { "epoch": 1.7011340893929288, "grad_norm": 3.3320302963256836, "step": 5100 }, { "epoch": 1.7011340893929288, "learning_rate": 0.00027451750681465253, "step": 5100 }, { "epoch": 1.7011340893929288, "loss": 0.5628963112831116, "step": 5100 }, { "ce_loss": 0.1644999235868454, "epoch": 1.7011340893929288, "step": 5100 }, { "distill_loss": 0.1443501114845276, "epoch": 1.7011340893929288, "step": 5100 }, { "epoch": 1.7011340893929288, "ref_ce_loss": 0.09509146213531494, "step": 5100 }, { "epoch": 1.7011340893929288, "loss": 0.6696035861968994, "step": 5100 }, { "ce_loss": 0.2381826490163803, "epoch": 1.7011340893929288, "step": 5100 }, { "distill_loss": 0.1346670538187027, "epoch": 1.7011340893929288, "step": 5100 }, { "epoch": 1.7011340893929288, "ref_ce_loss": 0.15772823989391327, "step": 5100 }, { "epoch": 1.7011340893929288, "loss": 0.6247545480728149, "step": 5100 }, { "ce_loss": 0.22702516615390778, "epoch": 1.7011340893929288, "step": 5100 }, { "distill_loss": 0.13249506056308746, "epoch": 1.7011340893929288, "step": 5100 }, { "epoch": 1.7011340893929288, "ref_ce_loss": 0.156106099486351, "step": 5100 }, { "epoch": 1.7011340893929288, "loss": 0.6921025514602661, "step": 5100 }, { "ce_loss": 0.2253514677286148, "epoch": 1.7011340893929288, "step": 5100 }, { "distill_loss": 0.15082751214504242, "epoch": 1.7011340893929288, "step": 5100 }, { "epoch": 1.7011340893929288, "ref_ce_loss": 0.159327894449234, "step": 5100 }, { "epoch": 1.704469646430954, "loss": 0.5672, "step": 5110 }, { "epoch": 1.704469646430954, "grad_norm": 1.8688466548919678, "step": 5110 }, { "epoch": 1.704469646430954, "learning_rate": 0.00027440444715494844, "step": 5110 }, { "epoch": 1.704469646430954, "loss": 0.5153326988220215, "step": 5110 }, { "ce_loss": 0.2143237441778183, "epoch": 1.704469646430954, "step": 5110 }, { "distill_loss": 0.12158432602882385, "epoch": 1.704469646430954, "step": 5110 }, { "epoch": 1.704469646430954, "ref_ce_loss": 0.1791994422674179, "step": 5110 }, { "epoch": 1.704469646430954, "loss": 0.6404139995574951, "step": 5110 }, { "ce_loss": 0.22489234805107117, "epoch": 1.704469646430954, "step": 5110 }, { "distill_loss": 0.10260467231273651, "epoch": 1.704469646430954, "step": 5110 }, { "epoch": 1.704469646430954, "ref_ce_loss": 0.13702525198459625, "step": 5110 }, { "epoch": 1.704469646430954, "loss": 0.5092939734458923, "step": 5110 }, { "ce_loss": 0.19932390749454498, "epoch": 1.704469646430954, "step": 5110 }, { "distill_loss": 0.11346331983804703, "epoch": 1.704469646430954, "step": 5110 }, { "epoch": 1.704469646430954, "ref_ce_loss": 0.12395470589399338, "step": 5110 }, { "epoch": 1.704469646430954, "loss": 0.7422287464141846, "step": 5110 }, { "ce_loss": 0.16377632319927216, "epoch": 1.704469646430954, "step": 5110 }, { "distill_loss": 0.11026269942522049, "epoch": 1.704469646430954, "step": 5110 }, { "epoch": 1.704469646430954, "ref_ce_loss": 0.08629206568002701, "step": 5110 }, { "epoch": 1.7078052034689795, "loss": 0.639, "step": 5120 }, { "epoch": 1.7078052034689795, "grad_norm": 2.86659574508667, "step": 5120 }, { "epoch": 1.7078052034689795, "learning_rate": 0.0002742911606306063, "step": 5120 }, { "epoch": 1.7078052034689795, "loss": 0.4779875576496124, "step": 5120 }, { "ce_loss": 0.1897718906402588, "epoch": 1.7078052034689795, "step": 5120 }, { "distill_loss": 0.11675845831632614, "epoch": 1.7078052034689795, "step": 5120 }, { "epoch": 1.7078052034689795, "ref_ce_loss": 0.12058022618293762, "step": 5120 }, { "epoch": 1.7078052034689795, "loss": 0.46407264471054077, "step": 5120 }, { "ce_loss": 0.11707013100385666, "epoch": 1.7078052034689795, "step": 5120 }, { "distill_loss": 0.08361580967903137, "epoch": 1.7078052034689795, "step": 5120 }, { "epoch": 1.7078052034689795, "ref_ce_loss": 0.09062016010284424, "step": 5120 }, { "epoch": 1.7078052034689795, "loss": 0.5182336568832397, "step": 5120 }, { "ce_loss": 0.23112425208091736, "epoch": 1.7078052034689795, "step": 5120 }, { "distill_loss": 0.10435008257627487, "epoch": 1.7078052034689795, "step": 5120 }, { "epoch": 1.7078052034689795, "ref_ce_loss": 0.13188083469867706, "step": 5120 }, { "epoch": 1.7078052034689795, "loss": 0.47077101469039917, "step": 5120 }, { "ce_loss": 0.1601075679063797, "epoch": 1.7078052034689795, "step": 5120 }, { "distill_loss": 0.12487640976905823, "epoch": 1.7078052034689795, "step": 5120 }, { "epoch": 1.7078052034689795, "ref_ce_loss": 0.11737626045942307, "step": 5120 }, { "epoch": 1.7111407605070048, "loss": 0.5843, "step": 5130 }, { "epoch": 1.7111407605070048, "grad_norm": 3.353935718536377, "step": 5130 }, { "epoch": 1.7111407605070048, "learning_rate": 0.00027417764744821604, "step": 5130 }, { "epoch": 1.7111407605070048, "loss": 0.5513918995857239, "step": 5130 }, { "ce_loss": 0.16730625927448273, "epoch": 1.7111407605070048, "step": 5130 }, { "distill_loss": 0.12257963418960571, "epoch": 1.7111407605070048, "step": 5130 }, { "epoch": 1.7111407605070048, "ref_ce_loss": 0.11809415370225906, "step": 5130 }, { "epoch": 1.7111407605070048, "loss": 0.6963628530502319, "step": 5130 }, { "ce_loss": 0.22674696147441864, "epoch": 1.7111407605070048, "step": 5130 }, { "distill_loss": 0.1302148550748825, "epoch": 1.7111407605070048, "step": 5130 }, { "epoch": 1.7111407605070048, "ref_ce_loss": 0.11398036032915115, "step": 5130 }, { "epoch": 1.7111407605070048, "loss": 0.8863782286643982, "step": 5130 }, { "ce_loss": 0.3215571343898773, "epoch": 1.7111407605070048, "step": 5130 }, { "distill_loss": 0.12419331818819046, "epoch": 1.7111407605070048, "step": 5130 }, { "epoch": 1.7111407605070048, "ref_ce_loss": 0.19529564678668976, "step": 5130 }, { "epoch": 1.7111407605070048, "loss": 0.7732776999473572, "step": 5130 }, { "ce_loss": 0.32119056582450867, "epoch": 1.7111407605070048, "step": 5130 }, { "distill_loss": 0.12385230511426926, "epoch": 1.7111407605070048, "step": 5130 }, { "epoch": 1.7111407605070048, "ref_ce_loss": 0.16419844329357147, "step": 5130 }, { "epoch": 1.7144763175450302, "loss": 0.553, "step": 5140 }, { "epoch": 1.7144763175450302, "grad_norm": 2.516479015350342, "step": 5140 }, { "epoch": 1.7144763175450302, "learning_rate": 0.00027406390781478093, "step": 5140 }, { "epoch": 1.7144763175450302, "loss": 0.5765625238418579, "step": 5140 }, { "ce_loss": 0.1710415482521057, "epoch": 1.7144763175450302, "step": 5140 }, { "distill_loss": 0.11104995012283325, "epoch": 1.7144763175450302, "step": 5140 }, { "epoch": 1.7144763175450302, "ref_ce_loss": 0.09997311234474182, "step": 5140 }, { "epoch": 1.7144763175450302, "loss": 0.44053176045417786, "step": 5140 }, { "ce_loss": 0.19158673286437988, "epoch": 1.7144763175450302, "step": 5140 }, { "distill_loss": 0.11490902304649353, "epoch": 1.7144763175450302, "step": 5140 }, { "epoch": 1.7144763175450302, "ref_ce_loss": 0.09579595178365707, "step": 5140 }, { "epoch": 1.7144763175450302, "loss": 0.5100491046905518, "step": 5140 }, { "ce_loss": 0.18296188116073608, "epoch": 1.7144763175450302, "step": 5140 }, { "distill_loss": 0.11433611810207367, "epoch": 1.7144763175450302, "step": 5140 }, { "epoch": 1.7144763175450302, "ref_ce_loss": 0.07839963585138321, "step": 5140 }, { "epoch": 1.7144763175450302, "loss": 0.4291071593761444, "step": 5140 }, { "ce_loss": 0.15751199424266815, "epoch": 1.7144763175450302, "step": 5140 }, { "distill_loss": 0.09726285189390182, "epoch": 1.7144763175450302, "step": 5140 }, { "epoch": 1.7144763175450302, "ref_ce_loss": 0.12397287786006927, "step": 5140 }, { "epoch": 1.7178118745830555, "loss": 0.5899, "step": 5150 }, { "epoch": 1.7178118745830555, "grad_norm": 2.534189224243164, "step": 5150 }, { "epoch": 1.7178118745830555, "learning_rate": 0.00027394994193771717, "step": 5150 }, { "epoch": 1.7178118745830555, "loss": 1.122070550918579, "step": 5150 }, { "ce_loss": 0.2445487380027771, "epoch": 1.7178118745830555, "step": 5150 }, { "distill_loss": 0.135958731174469, "epoch": 1.7178118745830555, "step": 5150 }, { "epoch": 1.7178118745830555, "ref_ce_loss": 0.13551239669322968, "step": 5150 }, { "epoch": 1.7178118745830555, "loss": 0.5136386156082153, "step": 5150 }, { "ce_loss": 0.2333669662475586, "epoch": 1.7178118745830555, "step": 5150 }, { "distill_loss": 0.09331156313419342, "epoch": 1.7178118745830555, "step": 5150 }, { "epoch": 1.7178118745830555, "ref_ce_loss": 0.13432331383228302, "step": 5150 }, { "epoch": 1.7178118745830555, "loss": 0.5635353326797485, "step": 5150 }, { "ce_loss": 0.10622437298297882, "epoch": 1.7178118745830555, "step": 5150 }, { "distill_loss": 0.07141038030385971, "epoch": 1.7178118745830555, "step": 5150 }, { "epoch": 1.7178118745830555, "ref_ce_loss": 0.1337660551071167, "step": 5150 }, { "epoch": 1.7178118745830555, "loss": 0.5571033358573914, "step": 5150 }, { "ce_loss": 0.20822681486606598, "epoch": 1.7178118745830555, "step": 5150 }, { "distill_loss": 0.14473620057106018, "epoch": 1.7178118745830555, "step": 5150 }, { "epoch": 1.7178118745830555, "ref_ce_loss": 0.14581289887428284, "step": 5150 }, { "epoch": 1.7211474316210809, "loss": 0.6477, "step": 5160 }, { "epoch": 1.7211474316210809, "grad_norm": 3.7310683727264404, "step": 5160 }, { "epoch": 1.7211474316210809, "learning_rate": 0.0002738357500248536, "step": 5160 }, { "epoch": 1.7211474316210809, "loss": 0.4637722969055176, "step": 5160 }, { "ce_loss": 0.22166353464126587, "epoch": 1.7211474316210809, "step": 5160 }, { "distill_loss": 0.11157701909542084, "epoch": 1.7211474316210809, "step": 5160 }, { "epoch": 1.7211474316210809, "ref_ce_loss": 0.1303255707025528, "step": 5160 }, { "epoch": 1.7211474316210809, "loss": 0.7100945711135864, "step": 5160 }, { "ce_loss": 0.1325289011001587, "epoch": 1.7211474316210809, "step": 5160 }, { "distill_loss": 0.10556505620479584, "epoch": 1.7211474316210809, "step": 5160 }, { "epoch": 1.7211474316210809, "ref_ce_loss": 0.09780916571617126, "step": 5160 }, { "epoch": 1.7211474316210809, "loss": 0.6478838920593262, "step": 5160 }, { "ce_loss": 0.214467391371727, "epoch": 1.7211474316210809, "step": 5160 }, { "distill_loss": 0.16932272911071777, "epoch": 1.7211474316210809, "step": 5160 }, { "epoch": 1.7211474316210809, "ref_ce_loss": 0.09977413713932037, "step": 5160 }, { "epoch": 1.7211474316210809, "loss": 0.2837287187576294, "step": 5160 }, { "ce_loss": 0.10429967194795609, "epoch": 1.7211474316210809, "step": 5160 }, { "distill_loss": 0.07188081741333008, "epoch": 1.7211474316210809, "step": 5160 }, { "epoch": 1.7211474316210809, "ref_ce_loss": 0.0646226704120636, "step": 5160 }, { "epoch": 1.7244829886591062, "loss": 0.6087, "step": 5170 }, { "epoch": 1.7244829886591062, "grad_norm": 3.750103235244751, "step": 5170 }, { "epoch": 1.7244829886591062, "learning_rate": 0.0002737213322844312, "step": 5170 }, { "epoch": 1.7244829886591062, "loss": 0.7976568937301636, "step": 5170 }, { "ce_loss": 0.1181984469294548, "epoch": 1.7244829886591062, "step": 5170 }, { "distill_loss": 0.098573699593544, "epoch": 1.7244829886591062, "step": 5170 }, { "epoch": 1.7244829886591062, "ref_ce_loss": 0.1335592269897461, "step": 5170 }, { "epoch": 1.7244829886591062, "loss": 0.6125006675720215, "step": 5170 }, { "ce_loss": 0.22754055261611938, "epoch": 1.7244829886591062, "step": 5170 }, { "distill_loss": 0.12978272140026093, "epoch": 1.7244829886591062, "step": 5170 }, { "epoch": 1.7244829886591062, "ref_ce_loss": 0.15586386620998383, "step": 5170 }, { "epoch": 1.7244829886591062, "loss": 0.4436233639717102, "step": 5170 }, { "ce_loss": 0.09966287016868591, "epoch": 1.7244829886591062, "step": 5170 }, { "distill_loss": 0.09829822927713394, "epoch": 1.7244829886591062, "step": 5170 }, { "epoch": 1.7244829886591062, "ref_ce_loss": 0.10884232074022293, "step": 5170 }, { "epoch": 1.7244829886591062, "loss": 0.9224821329116821, "step": 5170 }, { "ce_loss": 0.17320190370082855, "epoch": 1.7244829886591062, "step": 5170 }, { "distill_loss": 0.12536008656024933, "epoch": 1.7244829886591062, "step": 5170 }, { "epoch": 1.7244829886591062, "ref_ce_loss": 0.12005515396595001, "step": 5170 }, { "epoch": 1.7278185456971316, "loss": 0.6052, "step": 5180 }, { "epoch": 1.7278185456971316, "grad_norm": 2.824880838394165, "step": 5180 }, { "epoch": 1.7278185456971316, "learning_rate": 0.0002736066889251028, "step": 5180 }, { "epoch": 1.7278185456971316, "loss": 0.5080110430717468, "step": 5180 }, { "ce_loss": 0.20835170149803162, "epoch": 1.7278185456971316, "step": 5180 }, { "distill_loss": 0.11417423188686371, "epoch": 1.7278185456971316, "step": 5180 }, { "epoch": 1.7278185456971316, "ref_ce_loss": 0.18518763780593872, "step": 5180 }, { "epoch": 1.7278185456971316, "loss": 0.9636545181274414, "step": 5180 }, { "ce_loss": 0.24943476915359497, "epoch": 1.7278185456971316, "step": 5180 }, { "distill_loss": 0.15834645926952362, "epoch": 1.7278185456971316, "step": 5180 }, { "epoch": 1.7278185456971316, "ref_ce_loss": 0.18894070386886597, "step": 5180 }, { "epoch": 1.7278185456971316, "loss": 0.39006200432777405, "step": 5180 }, { "ce_loss": 0.1016741469502449, "epoch": 1.7278185456971316, "step": 5180 }, { "distill_loss": 0.09278419613838196, "epoch": 1.7278185456971316, "step": 5180 }, { "epoch": 1.7278185456971316, "ref_ce_loss": 0.12148765474557877, "step": 5180 }, { "epoch": 1.7278185456971316, "loss": 0.5657213926315308, "step": 5180 }, { "ce_loss": 0.18592000007629395, "epoch": 1.7278185456971316, "step": 5180 }, { "distill_loss": 0.11897885799407959, "epoch": 1.7278185456971316, "step": 5180 }, { "epoch": 1.7278185456971316, "ref_ce_loss": 0.12840834259986877, "step": 5180 }, { "epoch": 1.731154102735157, "loss": 0.6304, "step": 5190 }, { "epoch": 1.731154102735157, "grad_norm": 2.494673013687134, "step": 5190 }, { "epoch": 1.731154102735157, "learning_rate": 0.0002734918201559326, "step": 5190 }, { "epoch": 1.731154102735157, "loss": 0.47465330362319946, "step": 5190 }, { "ce_loss": 0.1857297122478485, "epoch": 1.731154102735157, "step": 5190 }, { "distill_loss": 0.12139198184013367, "epoch": 1.731154102735157, "step": 5190 }, { "epoch": 1.731154102735157, "ref_ce_loss": 0.1320515125989914, "step": 5190 }, { "epoch": 1.731154102735157, "loss": 1.148721694946289, "step": 5190 }, { "ce_loss": 0.24933886528015137, "epoch": 1.731154102735157, "step": 5190 }, { "distill_loss": 0.14437663555145264, "epoch": 1.731154102735157, "step": 5190 }, { "epoch": 1.731154102735157, "ref_ce_loss": 0.1374908685684204, "step": 5190 }, { "epoch": 1.731154102735157, "loss": 0.31072619557380676, "step": 5190 }, { "ce_loss": 0.10767339915037155, "epoch": 1.731154102735157, "step": 5190 }, { "distill_loss": 0.13216081261634827, "epoch": 1.731154102735157, "step": 5190 }, { "epoch": 1.731154102735157, "ref_ce_loss": 0.07079358398914337, "step": 5190 }, { "epoch": 1.731154102735157, "loss": 0.3686656057834625, "step": 5190 }, { "ce_loss": 0.13234637677669525, "epoch": 1.731154102735157, "step": 5190 }, { "distill_loss": 0.11700987070798874, "epoch": 1.731154102735157, "step": 5190 }, { "epoch": 1.731154102735157, "ref_ce_loss": 0.11908116191625595, "step": 5190 }, { "epoch": 1.7344896597731823, "loss": 0.6093, "step": 5200 }, { "epoch": 1.7344896597731823, "grad_norm": 4.322832107543945, "step": 5200 }, { "epoch": 1.7344896597731823, "learning_rate": 0.00027337672618639604, "step": 5200 }, { "epoch": 1.7344896597731823, "loss": 0.6111628413200378, "step": 5200 }, { "ce_loss": 0.15295933187007904, "epoch": 1.7344896597731823, "step": 5200 }, { "distill_loss": 0.10183216631412506, "epoch": 1.7344896597731823, "step": 5200 }, { "epoch": 1.7344896597731823, "ref_ce_loss": 0.14784273505210876, "step": 5200 }, { "epoch": 1.7344896597731823, "loss": 0.875718891620636, "step": 5200 }, { "ce_loss": 0.28108495473861694, "epoch": 1.7344896597731823, "step": 5200 }, { "distill_loss": 0.1419355273246765, "epoch": 1.7344896597731823, "step": 5200 }, { "epoch": 1.7344896597731823, "ref_ce_loss": 0.20049789547920227, "step": 5200 }, { "epoch": 1.7344896597731823, "loss": 0.5750555396080017, "step": 5200 }, { "ce_loss": 0.18172144889831543, "epoch": 1.7344896597731823, "step": 5200 }, { "distill_loss": 0.10994206368923187, "epoch": 1.7344896597731823, "step": 5200 }, { "epoch": 1.7344896597731823, "ref_ce_loss": 0.11367204785346985, "step": 5200 }, { "epoch": 1.7344896597731823, "loss": 0.5577737092971802, "step": 5200 }, { "ce_loss": 0.2857550382614136, "epoch": 1.7344896597731823, "step": 5200 }, { "distill_loss": 0.12473808228969574, "epoch": 1.7344896597731823, "step": 5200 }, { "epoch": 1.7344896597731823, "ref_ce_loss": 0.1469634473323822, "step": 5200 }, { "epoch": 1.7378252168112076, "loss": 0.588, "step": 5210 }, { "epoch": 1.7378252168112076, "grad_norm": 1.895171046257019, "step": 5210 }, { "epoch": 1.7378252168112076, "learning_rate": 0.0002732614072263791, "step": 5210 }, { "epoch": 1.7378252168112076, "loss": 0.6798540949821472, "step": 5210 }, { "ce_loss": 0.20952335000038147, "epoch": 1.7378252168112076, "step": 5210 }, { "distill_loss": 0.11695443838834763, "epoch": 1.7378252168112076, "step": 5210 }, { "epoch": 1.7378252168112076, "ref_ce_loss": 0.11746593564748764, "step": 5210 }, { "epoch": 1.7378252168112076, "loss": 0.26200753450393677, "step": 5210 }, { "ce_loss": 0.07369393855333328, "epoch": 1.7378252168112076, "step": 5210 }, { "distill_loss": 0.06114661321043968, "epoch": 1.7378252168112076, "step": 5210 }, { "epoch": 1.7378252168112076, "ref_ce_loss": 0.07951971143484116, "step": 5210 }, { "epoch": 1.7378252168112076, "loss": 0.6085673570632935, "step": 5210 }, { "ce_loss": 0.24875447154045105, "epoch": 1.7378252168112076, "step": 5210 }, { "distill_loss": 0.10614325106143951, "epoch": 1.7378252168112076, "step": 5210 }, { "epoch": 1.7378252168112076, "ref_ce_loss": 0.1640029400587082, "step": 5210 }, { "epoch": 1.7378252168112076, "loss": 1.0081214904785156, "step": 5210 }, { "ce_loss": 0.21574634313583374, "epoch": 1.7378252168112076, "step": 5210 }, { "distill_loss": 0.07902166247367859, "epoch": 1.7378252168112076, "step": 5210 }, { "epoch": 1.7378252168112076, "ref_ce_loss": 0.10362114012241364, "step": 5210 }, { "epoch": 1.741160773849233, "loss": 0.6313, "step": 5220 }, { "epoch": 1.741160773849233, "grad_norm": 2.011845111846924, "step": 5220 }, { "epoch": 1.741160773849233, "learning_rate": 0.00027314586348617793, "step": 5220 }, { "epoch": 1.741160773849233, "loss": 0.6953931450843811, "step": 5220 }, { "ce_loss": 0.23883208632469177, "epoch": 1.741160773849233, "step": 5220 }, { "distill_loss": 0.11485342681407928, "epoch": 1.741160773849233, "step": 5220 }, { "epoch": 1.741160773849233, "ref_ce_loss": 0.14983177185058594, "step": 5220 }, { "epoch": 1.741160773849233, "loss": 0.5598002672195435, "step": 5220 }, { "ce_loss": 0.14301392436027527, "epoch": 1.741160773849233, "step": 5220 }, { "distill_loss": 0.11065421998500824, "epoch": 1.741160773849233, "step": 5220 }, { "epoch": 1.741160773849233, "ref_ce_loss": 0.15477022528648376, "step": 5220 }, { "epoch": 1.741160773849233, "loss": 0.5702969431877136, "step": 5220 }, { "ce_loss": 0.23705795407295227, "epoch": 1.741160773849233, "step": 5220 }, { "distill_loss": 0.09813598543405533, "epoch": 1.741160773849233, "step": 5220 }, { "epoch": 1.741160773849233, "ref_ce_loss": 0.23498722910881042, "step": 5220 }, { "epoch": 1.741160773849233, "loss": 0.4153706431388855, "step": 5220 }, { "ce_loss": 0.15375342965126038, "epoch": 1.741160773849233, "step": 5220 }, { "distill_loss": 0.09605289995670319, "epoch": 1.741160773849233, "step": 5220 }, { "epoch": 1.741160773849233, "ref_ce_loss": 0.09962562471628189, "step": 5220 }, { "epoch": 1.7444963308872583, "loss": 0.6265, "step": 5230 }, { "epoch": 1.7444963308872583, "grad_norm": 3.9798173904418945, "step": 5230 }, { "epoch": 1.7444963308872583, "learning_rate": 0.0002730300951764989, "step": 5230 }, { "epoch": 1.7444963308872583, "loss": 1.071535587310791, "step": 5230 }, { "ce_loss": 0.24925430119037628, "epoch": 1.7444963308872583, "step": 5230 }, { "distill_loss": 0.21522605419158936, "epoch": 1.7444963308872583, "step": 5230 }, { "epoch": 1.7444963308872583, "ref_ce_loss": 0.17369335889816284, "step": 5230 }, { "epoch": 1.7444963308872583, "loss": 0.648411750793457, "step": 5230 }, { "ce_loss": 0.243378147482872, "epoch": 1.7444963308872583, "step": 5230 }, { "distill_loss": 0.16543267667293549, "epoch": 1.7444963308872583, "step": 5230 }, { "epoch": 1.7444963308872583, "ref_ce_loss": 0.1546335220336914, "step": 5230 }, { "epoch": 1.7444963308872583, "loss": 1.0082130432128906, "step": 5230 }, { "ce_loss": 0.30150628089904785, "epoch": 1.7444963308872583, "step": 5230 }, { "distill_loss": 0.19451621174812317, "epoch": 1.7444963308872583, "step": 5230 }, { "epoch": 1.7444963308872583, "ref_ce_loss": 0.12648442387580872, "step": 5230 }, { "epoch": 1.7444963308872583, "loss": 0.4836832284927368, "step": 5230 }, { "ce_loss": 0.15982374548912048, "epoch": 1.7444963308872583, "step": 5230 }, { "distill_loss": 0.17217841744422913, "epoch": 1.7444963308872583, "step": 5230 }, { "epoch": 1.7444963308872583, "ref_ce_loss": 0.0835745707154274, "step": 5230 }, { "epoch": 1.7478318879252837, "loss": 0.6475, "step": 5240 }, { "epoch": 1.7478318879252837, "grad_norm": 2.498871088027954, "step": 5240 }, { "epoch": 1.7478318879252837, "learning_rate": 0.0002729141025084577, "step": 5240 }, { "epoch": 1.7478318879252837, "loss": 0.6212328672409058, "step": 5240 }, { "ce_loss": 0.15493199229240417, "epoch": 1.7478318879252837, "step": 5240 }, { "distill_loss": 0.1978866457939148, "epoch": 1.7478318879252837, "step": 5240 }, { "epoch": 1.7478318879252837, "ref_ce_loss": 0.13444751501083374, "step": 5240 }, { "epoch": 1.7478318879252837, "loss": 0.5558438897132874, "step": 5240 }, { "ce_loss": 0.21160100400447845, "epoch": 1.7478318879252837, "step": 5240 }, { "distill_loss": 0.1478114128112793, "epoch": 1.7478318879252837, "step": 5240 }, { "epoch": 1.7478318879252837, "ref_ce_loss": 0.1627282351255417, "step": 5240 }, { "epoch": 1.7478318879252837, "loss": 0.9472699165344238, "step": 5240 }, { "ce_loss": 0.25463220477104187, "epoch": 1.7478318879252837, "step": 5240 }, { "distill_loss": 0.1912805289030075, "epoch": 1.7478318879252837, "step": 5240 }, { "epoch": 1.7478318879252837, "ref_ce_loss": 0.13427430391311646, "step": 5240 }, { "epoch": 1.7478318879252837, "loss": 0.5777902007102966, "step": 5240 }, { "ce_loss": 0.2193995863199234, "epoch": 1.7478318879252837, "step": 5240 }, { "distill_loss": 0.19780264794826508, "epoch": 1.7478318879252837, "step": 5240 }, { "epoch": 1.7478318879252837, "ref_ce_loss": 0.16050836443901062, "step": 5240 }, { "epoch": 1.751167444963309, "loss": 0.6298, "step": 5250 }, { "epoch": 1.751167444963309, "grad_norm": 3.098067283630371, "step": 5250 }, { "epoch": 1.751167444963309, "learning_rate": 0.00027279788569357916, "step": 5250 }, { "epoch": 1.751167444963309, "loss": 0.6694287061691284, "step": 5250 }, { "ce_loss": 0.2001875340938568, "epoch": 1.751167444963309, "step": 5250 }, { "distill_loss": 0.13199788331985474, "epoch": 1.751167444963309, "step": 5250 }, { "epoch": 1.751167444963309, "ref_ce_loss": 0.17222945392131805, "step": 5250 }, { "epoch": 1.751167444963309, "loss": 0.30083397030830383, "step": 5250 }, { "ce_loss": 0.10140173882246017, "epoch": 1.751167444963309, "step": 5250 }, { "distill_loss": 0.10152903199195862, "epoch": 1.751167444963309, "step": 5250 }, { "epoch": 1.751167444963309, "ref_ce_loss": 0.09783918410539627, "step": 5250 }, { "epoch": 1.751167444963309, "loss": 0.5701338648796082, "step": 5250 }, { "ce_loss": 0.2400723248720169, "epoch": 1.751167444963309, "step": 5250 }, { "distill_loss": 0.1128765121102333, "epoch": 1.751167444963309, "step": 5250 }, { "epoch": 1.751167444963309, "ref_ce_loss": 0.14214551448822021, "step": 5250 }, { "epoch": 1.751167444963309, "loss": 1.617339849472046, "step": 5250 }, { "ce_loss": 0.34532344341278076, "epoch": 1.751167444963309, "step": 5250 }, { "distill_loss": 0.20758526027202606, "epoch": 1.751167444963309, "step": 5250 }, { "epoch": 1.751167444963309, "ref_ce_loss": 0.2039952427148819, "step": 5250 }, { "epoch": 1.7545030020013344, "loss": 0.6432, "step": 5260 }, { "epoch": 1.7545030020013344, "grad_norm": 3.359388828277588, "step": 5260 }, { "epoch": 1.7545030020013344, "learning_rate": 0.0002726814449437969, "step": 5260 }, { "epoch": 1.7545030020013344, "loss": 0.35906630754470825, "step": 5260 }, { "ce_loss": 0.12533292174339294, "epoch": 1.7545030020013344, "step": 5260 }, { "distill_loss": 0.0773266926407814, "epoch": 1.7545030020013344, "step": 5260 }, { "epoch": 1.7545030020013344, "ref_ce_loss": 0.09706180542707443, "step": 5260 }, { "epoch": 1.7545030020013344, "loss": 0.8488431572914124, "step": 5260 }, { "ce_loss": 0.19705916941165924, "epoch": 1.7545030020013344, "step": 5260 }, { "distill_loss": 0.12958835065364838, "epoch": 1.7545030020013344, "step": 5260 }, { "epoch": 1.7545030020013344, "ref_ce_loss": 0.17487740516662598, "step": 5260 }, { "epoch": 1.7545030020013344, "loss": 0.6030906438827515, "step": 5260 }, { "ce_loss": 0.17236897349357605, "epoch": 1.7545030020013344, "step": 5260 }, { "distill_loss": 0.11375070363283157, "epoch": 1.7545030020013344, "step": 5260 }, { "epoch": 1.7545030020013344, "ref_ce_loss": 0.14064763486385345, "step": 5260 }, { "epoch": 1.7545030020013344, "loss": 0.8054187297821045, "step": 5260 }, { "ce_loss": 0.2541971206665039, "epoch": 1.7545030020013344, "step": 5260 }, { "distill_loss": 0.14604657888412476, "epoch": 1.7545030020013344, "step": 5260 }, { "epoch": 1.7545030020013344, "ref_ce_loss": 0.1376919001340866, "step": 5260 }, { "epoch": 1.7578385590393597, "loss": 0.6159, "step": 5270 }, { "epoch": 1.7578385590393597, "grad_norm": 3.611070394515991, "step": 5270 }, { "epoch": 1.7578385590393597, "learning_rate": 0.00027256478047145297, "step": 5270 }, { "epoch": 1.7578385590393597, "loss": 0.4489757716655731, "step": 5270 }, { "ce_loss": 0.15936985611915588, "epoch": 1.7578385590393597, "step": 5270 }, { "distill_loss": 0.13974682986736298, "epoch": 1.7578385590393597, "step": 5270 }, { "epoch": 1.7578385590393597, "ref_ce_loss": 0.12064920365810394, "step": 5270 }, { "epoch": 1.7578385590393597, "loss": 0.35650843381881714, "step": 5270 }, { "ce_loss": 0.07867399603128433, "epoch": 1.7578385590393597, "step": 5270 }, { "distill_loss": 0.11882077902555466, "epoch": 1.7578385590393597, "step": 5270 }, { "epoch": 1.7578385590393597, "ref_ce_loss": 0.09165604412555695, "step": 5270 }, { "epoch": 1.7578385590393597, "loss": 0.40967947244644165, "step": 5270 }, { "ce_loss": 0.1292242407798767, "epoch": 1.7578385590393597, "step": 5270 }, { "distill_loss": 0.12140464037656784, "epoch": 1.7578385590393597, "step": 5270 }, { "epoch": 1.7578385590393597, "ref_ce_loss": 0.10581637173891068, "step": 5270 }, { "epoch": 1.7578385590393597, "loss": 0.6546857357025146, "step": 5270 }, { "ce_loss": 0.2300027757883072, "epoch": 1.7578385590393597, "step": 5270 }, { "distill_loss": 0.15563297271728516, "epoch": 1.7578385590393597, "step": 5270 }, { "epoch": 1.7578385590393597, "ref_ce_loss": 0.126151442527771, "step": 5270 }, { "epoch": 1.761174116077385, "loss": 0.5937, "step": 5280 }, { "epoch": 1.761174116077385, "grad_norm": 2.565040349960327, "step": 5280 }, { "epoch": 1.761174116077385, "learning_rate": 0.00027244789248929735, "step": 5280 }, { "epoch": 1.761174116077385, "loss": 0.4498351812362671, "step": 5280 }, { "ce_loss": 0.10470875352621078, "epoch": 1.761174116077385, "step": 5280 }, { "distill_loss": 0.14078587293624878, "epoch": 1.761174116077385, "step": 5280 }, { "epoch": 1.761174116077385, "ref_ce_loss": 0.08281106501817703, "step": 5280 }, { "epoch": 1.761174116077385, "loss": 0.5353066921234131, "step": 5280 }, { "ce_loss": 0.1827135682106018, "epoch": 1.761174116077385, "step": 5280 }, { "distill_loss": 0.12412932515144348, "epoch": 1.761174116077385, "step": 5280 }, { "epoch": 1.761174116077385, "ref_ce_loss": 0.1676645129919052, "step": 5280 }, { "epoch": 1.761174116077385, "loss": 0.39925214648246765, "step": 5280 }, { "ce_loss": 0.11564359813928604, "epoch": 1.761174116077385, "step": 5280 }, { "distill_loss": 0.13008667528629303, "epoch": 1.761174116077385, "step": 5280 }, { "epoch": 1.761174116077385, "ref_ce_loss": 0.11450548470020294, "step": 5280 }, { "epoch": 1.761174116077385, "loss": 0.9488141536712646, "step": 5280 }, { "ce_loss": 0.296876460313797, "epoch": 1.761174116077385, "step": 5280 }, { "distill_loss": 0.13965021073818207, "epoch": 1.761174116077385, "step": 5280 }, { "epoch": 1.761174116077385, "ref_ce_loss": 0.2269418090581894, "step": 5280 }, { "epoch": 1.7645096731154104, "loss": 0.5795, "step": 5290 }, { "epoch": 1.7645096731154104, "grad_norm": 4.342066287994385, "step": 5290 }, { "epoch": 1.7645096731154104, "learning_rate": 0.0002723307812104875, "step": 5290 }, { "epoch": 1.7645096731154104, "loss": 0.4556236267089844, "step": 5290 }, { "ce_loss": 0.19900187849998474, "epoch": 1.7645096731154104, "step": 5290 }, { "distill_loss": 0.12070953100919724, "epoch": 1.7645096731154104, "step": 5290 }, { "epoch": 1.7645096731154104, "ref_ce_loss": 0.1357794851064682, "step": 5290 }, { "epoch": 1.7645096731154104, "loss": 0.49381351470947266, "step": 5290 }, { "ce_loss": 0.22090773284435272, "epoch": 1.7645096731154104, "step": 5290 }, { "distill_loss": 0.15897664427757263, "epoch": 1.7645096731154104, "step": 5290 }, { "epoch": 1.7645096731154104, "ref_ce_loss": 0.11341068148612976, "step": 5290 }, { "epoch": 1.7645096731154104, "loss": 0.8587750196456909, "step": 5290 }, { "ce_loss": 0.3212994337081909, "epoch": 1.7645096731154104, "step": 5290 }, { "distill_loss": 0.189230814576149, "epoch": 1.7645096731154104, "step": 5290 }, { "epoch": 1.7645096731154104, "ref_ce_loss": 0.17310874164104462, "step": 5290 }, { "epoch": 1.7645096731154104, "loss": 1.1447008848190308, "step": 5290 }, { "ce_loss": 0.16058073937892914, "epoch": 1.7645096731154104, "step": 5290 }, { "distill_loss": 0.14463923871517181, "epoch": 1.7645096731154104, "step": 5290 }, { "epoch": 1.7645096731154104, "ref_ce_loss": 0.12579326331615448, "step": 5290 }, { "epoch": 1.7678452301534358, "loss": 0.6033, "step": 5300 }, { "epoch": 1.7678452301534358, "grad_norm": 3.2570016384124756, "step": 5300 }, { "epoch": 1.7678452301534358, "learning_rate": 0.00027221344684858834, "step": 5300 }, { "epoch": 1.7678452301534358, "loss": 0.5072019696235657, "step": 5300 }, { "ce_loss": 0.15257352590560913, "epoch": 1.7678452301534358, "step": 5300 }, { "distill_loss": 0.13202786445617676, "epoch": 1.7678452301534358, "step": 5300 }, { "epoch": 1.7678452301534358, "ref_ce_loss": 0.12185128033161163, "step": 5300 }, { "epoch": 1.7678452301534358, "loss": 0.4970143437385559, "step": 5300 }, { "ce_loss": 0.19305184483528137, "epoch": 1.7678452301534358, "step": 5300 }, { "distill_loss": 0.15051575005054474, "epoch": 1.7678452301534358, "step": 5300 }, { "epoch": 1.7678452301534358, "ref_ce_loss": 0.12638255953788757, "step": 5300 }, { "epoch": 1.7678452301534358, "loss": 0.6955916881561279, "step": 5300 }, { "ce_loss": 0.20245814323425293, "epoch": 1.7678452301534358, "step": 5300 }, { "distill_loss": 0.14921513199806213, "epoch": 1.7678452301534358, "step": 5300 }, { "epoch": 1.7678452301534358, "ref_ce_loss": 0.16635259985923767, "step": 5300 }, { "epoch": 1.7678452301534358, "loss": 0.6717430353164673, "step": 5300 }, { "ce_loss": 0.2943767011165619, "epoch": 1.7678452301534358, "step": 5300 }, { "distill_loss": 0.2113085836172104, "epoch": 1.7678452301534358, "step": 5300 }, { "epoch": 1.7678452301534358, "ref_ce_loss": 0.16593855619430542, "step": 5300 }, { "epoch": 1.771180787191461, "loss": 0.6271, "step": 5310 }, { "epoch": 1.771180787191461, "grad_norm": 3.04647159576416, "step": 5310 }, { "epoch": 1.771180787191461, "learning_rate": 0.00027209588961757137, "step": 5310 }, { "epoch": 1.771180787191461, "loss": 1.033667802810669, "step": 5310 }, { "ce_loss": 0.18642698228359222, "epoch": 1.771180787191461, "step": 5310 }, { "distill_loss": 0.09144186973571777, "epoch": 1.771180787191461, "step": 5310 }, { "epoch": 1.771180787191461, "ref_ce_loss": 0.12195436656475067, "step": 5310 }, { "epoch": 1.771180787191461, "loss": 0.5744385719299316, "step": 5310 }, { "ce_loss": 0.19218167662620544, "epoch": 1.771180787191461, "step": 5310 }, { "distill_loss": 0.0962839350104332, "epoch": 1.771180787191461, "step": 5310 }, { "epoch": 1.771180787191461, "ref_ce_loss": 0.14160801470279694, "step": 5310 }, { "epoch": 1.771180787191461, "loss": 0.5619361996650696, "step": 5310 }, { "ce_loss": 0.2757478356361389, "epoch": 1.771180787191461, "step": 5310 }, { "distill_loss": 0.14615021646022797, "epoch": 1.771180787191461, "step": 5310 }, { "epoch": 1.771180787191461, "ref_ce_loss": 0.13982820510864258, "step": 5310 }, { "epoch": 1.771180787191461, "loss": 0.38822707533836365, "step": 5310 }, { "ce_loss": 0.18420282006263733, "epoch": 1.771180787191461, "step": 5310 }, { "distill_loss": 0.10989782214164734, "epoch": 1.771180787191461, "step": 5310 }, { "epoch": 1.771180787191461, "ref_ce_loss": 0.09404101222753525, "step": 5310 }, { "epoch": 1.7745163442294865, "loss": 0.6356, "step": 5320 }, { "epoch": 1.7745163442294865, "grad_norm": 2.576591730117798, "step": 5320 }, { "epoch": 1.7745163442294865, "learning_rate": 0.0002719781097318147, "step": 5320 }, { "epoch": 1.7745163442294865, "loss": 1.1352790594100952, "step": 5320 }, { "ce_loss": 0.3041479289531708, "epoch": 1.7745163442294865, "step": 5320 }, { "distill_loss": 0.0969984382390976, "epoch": 1.7745163442294865, "step": 5320 }, { "epoch": 1.7745163442294865, "ref_ce_loss": 0.23569048941135406, "step": 5320 }, { "epoch": 1.7745163442294865, "loss": 1.0350360870361328, "step": 5320 }, { "ce_loss": 0.3750414252281189, "epoch": 1.7745163442294865, "step": 5320 }, { "distill_loss": 0.15197308361530304, "epoch": 1.7745163442294865, "step": 5320 }, { "epoch": 1.7745163442294865, "ref_ce_loss": 0.2801123857498169, "step": 5320 }, { "epoch": 1.7745163442294865, "loss": 0.6875044107437134, "step": 5320 }, { "ce_loss": 0.19332902133464813, "epoch": 1.7745163442294865, "step": 5320 }, { "distill_loss": 0.09888143092393875, "epoch": 1.7745163442294865, "step": 5320 }, { "epoch": 1.7745163442294865, "ref_ce_loss": 0.1342381089925766, "step": 5320 }, { "epoch": 1.7745163442294865, "loss": 0.4631473124027252, "step": 5320 }, { "ce_loss": 0.15175089240074158, "epoch": 1.7745163442294865, "step": 5320 }, { "distill_loss": 0.09046868234872818, "epoch": 1.7745163442294865, "step": 5320 }, { "epoch": 1.7745163442294865, "ref_ce_loss": 0.1468697339296341, "step": 5320 }, { "epoch": 1.7778519012675118, "loss": 0.6277, "step": 5330 }, { "epoch": 1.7778519012675118, "grad_norm": 3.112802505493164, "step": 5330 }, { "epoch": 1.7778519012675118, "learning_rate": 0.00027186010740610226, "step": 5330 }, { "epoch": 1.7778519012675118, "loss": 1.1258635520935059, "step": 5330 }, { "ce_loss": 0.23738060891628265, "epoch": 1.7778519012675118, "step": 5330 }, { "distill_loss": 0.12533307075500488, "epoch": 1.7778519012675118, "step": 5330 }, { "epoch": 1.7778519012675118, "ref_ce_loss": 0.16516506671905518, "step": 5330 }, { "epoch": 1.7778519012675118, "loss": 0.9647006988525391, "step": 5330 }, { "ce_loss": 0.199024498462677, "epoch": 1.7778519012675118, "step": 5330 }, { "distill_loss": 0.1232900470495224, "epoch": 1.7778519012675118, "step": 5330 }, { "epoch": 1.7778519012675118, "ref_ce_loss": 0.1646987795829773, "step": 5330 }, { "epoch": 1.7778519012675118, "loss": 0.4685381054878235, "step": 5330 }, { "ce_loss": 0.14527423679828644, "epoch": 1.7778519012675118, "step": 5330 }, { "distill_loss": 0.11627337336540222, "epoch": 1.7778519012675118, "step": 5330 }, { "epoch": 1.7778519012675118, "ref_ce_loss": 0.10118523985147476, "step": 5330 }, { "epoch": 1.7778519012675118, "loss": 0.3879113793373108, "step": 5330 }, { "ce_loss": 0.15242859721183777, "epoch": 1.7778519012675118, "step": 5330 }, { "distill_loss": 0.0961771309375763, "epoch": 1.7778519012675118, "step": 5330 }, { "epoch": 1.7778519012675118, "ref_ce_loss": 0.09178163856267929, "step": 5330 }, { "epoch": 1.7811874583055372, "loss": 0.5619, "step": 5340 }, { "epoch": 1.7811874583055372, "grad_norm": 5.199608325958252, "step": 5340 }, { "epoch": 1.7811874583055372, "learning_rate": 0.00027174188285562377, "step": 5340 }, { "epoch": 1.7811874583055372, "loss": 0.8281639814376831, "step": 5340 }, { "ce_loss": 0.1604779213666916, "epoch": 1.7811874583055372, "step": 5340 }, { "distill_loss": 0.11927327513694763, "epoch": 1.7811874583055372, "step": 5340 }, { "epoch": 1.7811874583055372, "ref_ce_loss": 0.16226066648960114, "step": 5340 }, { "epoch": 1.7811874583055372, "loss": 1.659521460533142, "step": 5340 }, { "ce_loss": 0.2811453640460968, "epoch": 1.7811874583055372, "step": 5340 }, { "distill_loss": 0.15258263051509857, "epoch": 1.7811874583055372, "step": 5340 }, { "epoch": 1.7811874583055372, "ref_ce_loss": 0.19549334049224854, "step": 5340 }, { "epoch": 1.7811874583055372, "loss": 0.26514604687690735, "step": 5340 }, { "ce_loss": 0.029982445761561394, "epoch": 1.7811874583055372, "step": 5340 }, { "distill_loss": 0.07369742542505264, "epoch": 1.7811874583055372, "step": 5340 }, { "epoch": 1.7811874583055372, "ref_ce_loss": 0.10002217441797256, "step": 5340 }, { "epoch": 1.7811874583055372, "loss": 0.3143136501312256, "step": 5340 }, { "ce_loss": 0.07460236549377441, "epoch": 1.7811874583055372, "step": 5340 }, { "distill_loss": 0.08271145820617676, "epoch": 1.7811874583055372, "step": 5340 }, { "epoch": 1.7811874583055372, "ref_ce_loss": 0.11252786964178085, "step": 5340 }, { "epoch": 1.7845230153435625, "loss": 0.6054, "step": 5350 }, { "epoch": 1.7845230153435625, "grad_norm": 1.8053847551345825, "step": 5350 }, { "epoch": 1.7845230153435625, "learning_rate": 0.00027162343629597425, "step": 5350 }, { "epoch": 1.7845230153435625, "loss": 0.4593953788280487, "step": 5350 }, { "ce_loss": 0.04222152754664421, "epoch": 1.7845230153435625, "step": 5350 }, { "distill_loss": 0.07897884398698807, "epoch": 1.7845230153435625, "step": 5350 }, { "epoch": 1.7845230153435625, "ref_ce_loss": 0.11623603850603104, "step": 5350 }, { "epoch": 1.7845230153435625, "loss": 0.518944263458252, "step": 5350 }, { "ce_loss": 0.17399434745311737, "epoch": 1.7845230153435625, "step": 5350 }, { "distill_loss": 0.14518676698207855, "epoch": 1.7845230153435625, "step": 5350 }, { "epoch": 1.7845230153435625, "ref_ce_loss": 0.12270195037126541, "step": 5350 }, { "epoch": 1.7845230153435625, "loss": 0.7417664527893066, "step": 5350 }, { "ce_loss": 0.21204745769500732, "epoch": 1.7845230153435625, "step": 5350 }, { "distill_loss": 0.13328410685062408, "epoch": 1.7845230153435625, "step": 5350 }, { "epoch": 1.7845230153435625, "ref_ce_loss": 0.07787536084651947, "step": 5350 }, { "epoch": 1.7845230153435625, "loss": 0.39500027894973755, "step": 5350 }, { "ce_loss": 0.1639915108680725, "epoch": 1.7845230153435625, "step": 5350 }, { "distill_loss": 0.10372168570756912, "epoch": 1.7845230153435625, "step": 5350 }, { "epoch": 1.7845230153435625, "ref_ce_loss": 0.0849570780992508, "step": 5350 }, { "epoch": 1.7878585723815879, "loss": 0.6098, "step": 5360 }, { "epoch": 1.7878585723815879, "grad_norm": 2.036372184753418, "step": 5360 }, { "epoch": 1.7878585723815879, "learning_rate": 0.00027150476794315345, "step": 5360 }, { "epoch": 1.7878585723815879, "loss": 0.365789532661438, "step": 5360 }, { "ce_loss": 0.12750551104545593, "epoch": 1.7878585723815879, "step": 5360 }, { "distill_loss": 0.10385686159133911, "epoch": 1.7878585723815879, "step": 5360 }, { "epoch": 1.7878585723815879, "ref_ce_loss": 0.13411588966846466, "step": 5360 }, { "epoch": 1.7878585723815879, "loss": 0.37090298533439636, "step": 5360 }, { "ce_loss": 0.13140667974948883, "epoch": 1.7878585723815879, "step": 5360 }, { "distill_loss": 0.1126258373260498, "epoch": 1.7878585723815879, "step": 5360 }, { "epoch": 1.7878585723815879, "ref_ce_loss": 0.09275903552770615, "step": 5360 }, { "epoch": 1.7878585723815879, "loss": 1.2308400869369507, "step": 5360 }, { "ce_loss": 0.21442733705043793, "epoch": 1.7878585723815879, "step": 5360 }, { "distill_loss": 0.1381993144750595, "epoch": 1.7878585723815879, "step": 5360 }, { "epoch": 1.7878585723815879, "ref_ce_loss": 0.13881564140319824, "step": 5360 }, { "epoch": 1.7878585723815879, "loss": 0.38338571786880493, "step": 5360 }, { "ce_loss": 0.11972062289714813, "epoch": 1.7878585723815879, "step": 5360 }, { "distill_loss": 0.06880615651607513, "epoch": 1.7878585723815879, "step": 5360 }, { "epoch": 1.7878585723815879, "ref_ce_loss": 0.07677696645259857, "step": 5360 }, { "epoch": 1.7911941294196132, "loss": 0.5662, "step": 5370 }, { "epoch": 1.7911941294196132, "grad_norm": 2.8769922256469727, "step": 5370 }, { "epoch": 1.7911941294196132, "learning_rate": 0.0002713858780135657, "step": 5370 }, { "epoch": 1.7911941294196132, "loss": 0.6280443668365479, "step": 5370 }, { "ce_loss": 0.16620515286922455, "epoch": 1.7911941294196132, "step": 5370 }, { "distill_loss": 0.0986478179693222, "epoch": 1.7911941294196132, "step": 5370 }, { "epoch": 1.7911941294196132, "ref_ce_loss": 0.1469651609659195, "step": 5370 }, { "epoch": 1.7911941294196132, "loss": 0.8258258700370789, "step": 5370 }, { "ce_loss": 0.17810150980949402, "epoch": 1.7911941294196132, "step": 5370 }, { "distill_loss": 0.11801296472549438, "epoch": 1.7911941294196132, "step": 5370 }, { "epoch": 1.7911941294196132, "ref_ce_loss": 0.23140060901641846, "step": 5370 }, { "epoch": 1.7911941294196132, "loss": 0.4521212577819824, "step": 5370 }, { "ce_loss": 0.10244987159967422, "epoch": 1.7911941294196132, "step": 5370 }, { "distill_loss": 0.08709345757961273, "epoch": 1.7911941294196132, "step": 5370 }, { "epoch": 1.7911941294196132, "ref_ce_loss": 0.1392844021320343, "step": 5370 }, { "epoch": 1.7911941294196132, "loss": 0.5021611452102661, "step": 5370 }, { "ce_loss": 0.20533789694309235, "epoch": 1.7911941294196132, "step": 5370 }, { "distill_loss": 0.1511787325143814, "epoch": 1.7911941294196132, "step": 5370 }, { "epoch": 1.7911941294196132, "ref_ce_loss": 0.09348888695240021, "step": 5370 }, { "epoch": 1.7945296864576386, "loss": 0.6176, "step": 5380 }, { "epoch": 1.7945296864576386, "grad_norm": 2.5719034671783447, "step": 5380 }, { "epoch": 1.7945296864576386, "learning_rate": 0.00027126676672401917, "step": 5380 }, { "epoch": 1.7945296864576386, "loss": 0.36035069823265076, "step": 5380 }, { "ce_loss": 0.11971081793308258, "epoch": 1.7945296864576386, "step": 5380 }, { "distill_loss": 0.09691092371940613, "epoch": 1.7945296864576386, "step": 5380 }, { "epoch": 1.7945296864576386, "ref_ce_loss": 0.09212952107191086, "step": 5380 }, { "epoch": 1.7945296864576386, "loss": 0.5740700960159302, "step": 5380 }, { "ce_loss": 0.19906161725521088, "epoch": 1.7945296864576386, "step": 5380 }, { "distill_loss": 0.10676465183496475, "epoch": 1.7945296864576386, "step": 5380 }, { "epoch": 1.7945296864576386, "ref_ce_loss": 0.13155211508274078, "step": 5380 }, { "epoch": 1.7945296864576386, "loss": 0.7756186127662659, "step": 5380 }, { "ce_loss": 0.19126425683498383, "epoch": 1.7945296864576386, "step": 5380 }, { "distill_loss": 0.14128831028938293, "epoch": 1.7945296864576386, "step": 5380 }, { "epoch": 1.7945296864576386, "ref_ce_loss": 0.16342337429523468, "step": 5380 }, { "epoch": 1.7945296864576386, "loss": 0.33875101804733276, "step": 5380 }, { "ce_loss": 0.1185140386223793, "epoch": 1.7945296864576386, "step": 5380 }, { "distill_loss": 0.09045391529798508, "epoch": 1.7945296864576386, "step": 5380 }, { "epoch": 1.7945296864576386, "ref_ce_loss": 0.08819026499986649, "step": 5380 }, { "epoch": 1.797865243495664, "loss": 0.5726, "step": 5390 }, { "epoch": 1.797865243495664, "grad_norm": 2.365931987762451, "step": 5390 }, { "epoch": 1.797865243495664, "learning_rate": 0.0002711474342917261, "step": 5390 }, { "epoch": 1.797865243495664, "loss": 0.3825666904449463, "step": 5390 }, { "ce_loss": 0.13808047771453857, "epoch": 1.797865243495664, "step": 5390 }, { "distill_loss": 0.12363763153553009, "epoch": 1.797865243495664, "step": 5390 }, { "epoch": 1.797865243495664, "ref_ce_loss": 0.12069553881883621, "step": 5390 }, { "epoch": 1.797865243495664, "loss": 0.35606908798217773, "step": 5390 }, { "ce_loss": 0.09475722163915634, "epoch": 1.797865243495664, "step": 5390 }, { "distill_loss": 0.09718462079763412, "epoch": 1.797865243495664, "step": 5390 }, { "epoch": 1.797865243495664, "ref_ce_loss": 0.11456798017024994, "step": 5390 }, { "epoch": 1.797865243495664, "loss": 0.7455524802207947, "step": 5390 }, { "ce_loss": 0.20426201820373535, "epoch": 1.797865243495664, "step": 5390 }, { "distill_loss": 0.12498379498720169, "epoch": 1.797865243495664, "step": 5390 }, { "epoch": 1.797865243495664, "ref_ce_loss": 0.10539623349905014, "step": 5390 }, { "epoch": 1.797865243495664, "loss": 0.8537323474884033, "step": 5390 }, { "ce_loss": 0.25624123215675354, "epoch": 1.797865243495664, "step": 5390 }, { "distill_loss": 0.13884396851062775, "epoch": 1.797865243495664, "step": 5390 }, { "epoch": 1.797865243495664, "ref_ce_loss": 0.1865921914577484, "step": 5390 }, { "epoch": 1.8012008005336893, "loss": 0.556, "step": 5400 }, { "epoch": 1.8012008005336893, "grad_norm": 2.434817314147949, "step": 5400 }, { "epoch": 1.8012008005336893, "learning_rate": 0.0002710278809343015, "step": 5400 }, { "epoch": 1.8012008005336893, "loss": 0.6801935434341431, "step": 5400 }, { "ce_loss": 0.19641169905662537, "epoch": 1.8012008005336893, "step": 5400 }, { "distill_loss": 0.09645406156778336, "epoch": 1.8012008005336893, "step": 5400 }, { "epoch": 1.8012008005336893, "ref_ce_loss": 0.11136016249656677, "step": 5400 }, { "epoch": 1.8012008005336893, "loss": 0.6349014043807983, "step": 5400 }, { "ce_loss": 0.2154546082019806, "epoch": 1.8012008005336893, "step": 5400 }, { "distill_loss": 0.111332967877388, "epoch": 1.8012008005336893, "step": 5400 }, { "epoch": 1.8012008005336893, "ref_ce_loss": 0.10834940522909164, "step": 5400 }, { "epoch": 1.8012008005336893, "loss": 0.8095474243164062, "step": 5400 }, { "ce_loss": 0.13908988237380981, "epoch": 1.8012008005336893, "step": 5400 }, { "distill_loss": 0.11110659688711166, "epoch": 1.8012008005336893, "step": 5400 }, { "epoch": 1.8012008005336893, "ref_ce_loss": 0.08503452688455582, "step": 5400 }, { "epoch": 1.8012008005336893, "loss": 0.9587161540985107, "step": 5400 }, { "ce_loss": 0.17953532934188843, "epoch": 1.8012008005336893, "step": 5400 }, { "distill_loss": 0.09979195147752762, "epoch": 1.8012008005336893, "step": 5400 }, { "epoch": 1.8012008005336893, "ref_ce_loss": 0.11760571599006653, "step": 5400 }, { "epoch": 1.8045363575717146, "loss": 0.6309, "step": 5410 }, { "epoch": 1.8045363575717146, "grad_norm": 2.2750089168548584, "step": 5410 }, { "epoch": 1.8045363575717146, "learning_rate": 0.00027090810686976373, "step": 5410 }, { "epoch": 1.8045363575717146, "loss": 0.7453268766403198, "step": 5410 }, { "ce_loss": 0.19390563666820526, "epoch": 1.8045363575717146, "step": 5410 }, { "distill_loss": 0.12300623953342438, "epoch": 1.8045363575717146, "step": 5410 }, { "epoch": 1.8045363575717146, "ref_ce_loss": 0.2404060810804367, "step": 5410 }, { "epoch": 1.8045363575717146, "loss": 1.062126636505127, "step": 5410 }, { "ce_loss": 0.28073927760124207, "epoch": 1.8045363575717146, "step": 5410 }, { "distill_loss": 0.10538649559020996, "epoch": 1.8045363575717146, "step": 5410 }, { "epoch": 1.8045363575717146, "ref_ce_loss": 0.14817595481872559, "step": 5410 }, { "epoch": 1.8045363575717146, "loss": 0.4358749985694885, "step": 5410 }, { "ce_loss": 0.1794261783361435, "epoch": 1.8045363575717146, "step": 5410 }, { "distill_loss": 0.09952785074710846, "epoch": 1.8045363575717146, "step": 5410 }, { "epoch": 1.8045363575717146, "ref_ce_loss": 0.10201841592788696, "step": 5410 }, { "epoch": 1.8045363575717146, "loss": 0.5710932016372681, "step": 5410 }, { "ce_loss": 0.1435142308473587, "epoch": 1.8045363575717146, "step": 5410 }, { "distill_loss": 0.07739975303411484, "epoch": 1.8045363575717146, "step": 5410 }, { "epoch": 1.8045363575717146, "ref_ce_loss": 0.1580650359392166, "step": 5410 }, { "epoch": 1.80787191460974, "loss": 0.5962, "step": 5420 }, { "epoch": 1.80787191460974, "grad_norm": 2.5487122535705566, "step": 5420 }, { "epoch": 1.80787191460974, "learning_rate": 0.0002707881123165334, "step": 5420 }, { "epoch": 1.80787191460974, "loss": 0.4332432746887207, "step": 5420 }, { "ce_loss": 0.17312292754650116, "epoch": 1.80787191460974, "step": 5420 }, { "distill_loss": 0.1059122085571289, "epoch": 1.80787191460974, "step": 5420 }, { "epoch": 1.80787191460974, "ref_ce_loss": 0.10186949372291565, "step": 5420 }, { "epoch": 1.80787191460974, "loss": 0.812820315361023, "step": 5420 }, { "ce_loss": 0.2344483882188797, "epoch": 1.80787191460974, "step": 5420 }, { "distill_loss": 0.12185351550579071, "epoch": 1.80787191460974, "step": 5420 }, { "epoch": 1.80787191460974, "ref_ce_loss": 0.10016775131225586, "step": 5420 }, { "epoch": 1.80787191460974, "loss": 0.5691059827804565, "step": 5420 }, { "ce_loss": 0.19381578266620636, "epoch": 1.80787191460974, "step": 5420 }, { "distill_loss": 0.12860912084579468, "epoch": 1.80787191460974, "step": 5420 }, { "epoch": 1.80787191460974, "ref_ce_loss": 0.10750152915716171, "step": 5420 }, { "epoch": 1.80787191460974, "loss": 0.4607386589050293, "step": 5420 }, { "ce_loss": 0.20922338962554932, "epoch": 1.80787191460974, "step": 5420 }, { "distill_loss": 0.12475057691335678, "epoch": 1.80787191460974, "step": 5420 }, { "epoch": 1.80787191460974, "ref_ce_loss": 0.1266154795885086, "step": 5420 }, { "epoch": 1.8112074716477653, "loss": 0.6349, "step": 5430 }, { "epoch": 1.8112074716477653, "grad_norm": 1.9588823318481445, "step": 5430 }, { "epoch": 1.8112074716477653, "learning_rate": 0.00027066789749343324, "step": 5430 }, { "epoch": 1.8112074716477653, "loss": 0.6367460489273071, "step": 5430 }, { "ce_loss": 0.1723642200231552, "epoch": 1.8112074716477653, "step": 5430 }, { "distill_loss": 0.1507956087589264, "epoch": 1.8112074716477653, "step": 5430 }, { "epoch": 1.8112074716477653, "ref_ce_loss": 0.14595159888267517, "step": 5430 }, { "epoch": 1.8112074716477653, "loss": 0.5201939344406128, "step": 5430 }, { "ce_loss": 0.13760589063167572, "epoch": 1.8112074716477653, "step": 5430 }, { "distill_loss": 0.10090071707963943, "epoch": 1.8112074716477653, "step": 5430 }, { "epoch": 1.8112074716477653, "ref_ce_loss": 0.15078330039978027, "step": 5430 }, { "epoch": 1.8112074716477653, "loss": 0.6601283550262451, "step": 5430 }, { "ce_loss": 0.2796492874622345, "epoch": 1.8112074716477653, "step": 5430 }, { "distill_loss": 0.14720816910266876, "epoch": 1.8112074716477653, "step": 5430 }, { "epoch": 1.8112074716477653, "ref_ce_loss": 0.17631027102470398, "step": 5430 }, { "epoch": 1.8112074716477653, "loss": 0.43748265504837036, "step": 5430 }, { "ce_loss": 0.1258515566587448, "epoch": 1.8112074716477653, "step": 5430 }, { "distill_loss": 0.11924121528863907, "epoch": 1.8112074716477653, "step": 5430 }, { "epoch": 1.8112074716477653, "ref_ce_loss": 0.1339789777994156, "step": 5430 }, { "epoch": 1.8145430286857906, "loss": 0.6099, "step": 5440 }, { "epoch": 1.8145430286857906, "grad_norm": 2.69777250289917, "step": 5440 }, { "epoch": 1.8145430286857906, "learning_rate": 0.0002705474626196876, "step": 5440 }, { "epoch": 1.8145430286857906, "loss": 0.5835865139961243, "step": 5440 }, { "ce_loss": 0.23276305198669434, "epoch": 1.8145430286857906, "step": 5440 }, { "distill_loss": 0.12228725850582123, "epoch": 1.8145430286857906, "step": 5440 }, { "epoch": 1.8145430286857906, "ref_ce_loss": 0.17023053765296936, "step": 5440 }, { "epoch": 1.8145430286857906, "loss": 0.8016778230667114, "step": 5440 }, { "ce_loss": 0.3399412930011749, "epoch": 1.8145430286857906, "step": 5440 }, { "distill_loss": 0.13676761090755463, "epoch": 1.8145430286857906, "step": 5440 }, { "epoch": 1.8145430286857906, "ref_ce_loss": 0.18872161209583282, "step": 5440 }, { "epoch": 1.8145430286857906, "loss": 0.46767154335975647, "step": 5440 }, { "ce_loss": 0.1788882166147232, "epoch": 1.8145430286857906, "step": 5440 }, { "distill_loss": 0.13512161374092102, "epoch": 1.8145430286857906, "step": 5440 }, { "epoch": 1.8145430286857906, "ref_ce_loss": 0.09428312629461288, "step": 5440 }, { "epoch": 1.8145430286857906, "loss": 0.9075959920883179, "step": 5440 }, { "ce_loss": 0.2922513484954834, "epoch": 1.8145430286857906, "step": 5440 }, { "distill_loss": 0.17621180415153503, "epoch": 1.8145430286857906, "step": 5440 }, { "epoch": 1.8145430286857906, "ref_ce_loss": 0.18406786024570465, "step": 5440 }, { "epoch": 1.817878585723816, "loss": 0.5886, "step": 5450 }, { "epoch": 1.817878585723816, "grad_norm": 7.365907192230225, "step": 5450 }, { "epoch": 1.817878585723816, "learning_rate": 0.0002704268079149223, "step": 5450 }, { "epoch": 1.817878585723816, "loss": 0.7035877108573914, "step": 5450 }, { "ce_loss": 0.13215026259422302, "epoch": 1.817878585723816, "step": 5450 }, { "distill_loss": 0.4339187741279602, "epoch": 1.817878585723816, "step": 5450 }, { "epoch": 1.817878585723816, "ref_ce_loss": 0.10032802820205688, "step": 5450 }, { "epoch": 1.817878585723816, "loss": 0.7162448167800903, "step": 5450 }, { "ce_loss": 0.18805432319641113, "epoch": 1.817878585723816, "step": 5450 }, { "distill_loss": 0.37199363112449646, "epoch": 1.817878585723816, "step": 5450 }, { "epoch": 1.817878585723816, "ref_ce_loss": 0.12469123303890228, "step": 5450 }, { "epoch": 1.817878585723816, "loss": 0.5766035914421082, "step": 5450 }, { "ce_loss": 0.14342515170574188, "epoch": 1.817878585723816, "step": 5450 }, { "distill_loss": 0.27214252948760986, "epoch": 1.817878585723816, "step": 5450 }, { "epoch": 1.817878585723816, "ref_ce_loss": 0.16064272820949554, "step": 5450 }, { "epoch": 1.817878585723816, "loss": 0.8636009693145752, "step": 5450 }, { "ce_loss": 0.16702553629875183, "epoch": 1.817878585723816, "step": 5450 }, { "distill_loss": 0.35656481981277466, "epoch": 1.817878585723816, "step": 5450 }, { "epoch": 1.817878585723816, "ref_ce_loss": 0.1793079525232315, "step": 5450 }, { "epoch": 1.8212141427618413, "loss": 0.7028, "step": 5460 }, { "epoch": 1.8212141427618413, "grad_norm": 3.6993865966796875, "step": 5460 }, { "epoch": 1.8212141427618413, "learning_rate": 0.00027030593359916383, "step": 5460 }, { "epoch": 1.8212141427618413, "loss": 0.41134822368621826, "step": 5460 }, { "ce_loss": 0.1384688913822174, "epoch": 1.8212141427618413, "step": 5460 }, { "distill_loss": 0.1681678742170334, "epoch": 1.8212141427618413, "step": 5460 }, { "epoch": 1.8212141427618413, "ref_ce_loss": 0.10444191843271255, "step": 5460 }, { "epoch": 1.8212141427618413, "loss": 0.6750077605247498, "step": 5460 }, { "ce_loss": 0.15935517847537994, "epoch": 1.8212141427618413, "step": 5460 }, { "distill_loss": 0.27100151777267456, "epoch": 1.8212141427618413, "step": 5460 }, { "epoch": 1.8212141427618413, "ref_ce_loss": 0.16393183171749115, "step": 5460 }, { "epoch": 1.8212141427618413, "loss": 0.6315606832504272, "step": 5460 }, { "ce_loss": 0.14299316704273224, "epoch": 1.8212141427618413, "step": 5460 }, { "distill_loss": 0.17834961414337158, "epoch": 1.8212141427618413, "step": 5460 }, { "epoch": 1.8212141427618413, "ref_ce_loss": 0.08231250941753387, "step": 5460 }, { "epoch": 1.8212141427618413, "loss": 0.6965258121490479, "step": 5460 }, { "ce_loss": 0.31687676906585693, "epoch": 1.8212141427618413, "step": 5460 }, { "distill_loss": 0.20971833169460297, "epoch": 1.8212141427618413, "step": 5460 }, { "epoch": 1.8212141427618413, "ref_ce_loss": 0.16972923278808594, "step": 5460 }, { "epoch": 1.8245496997998667, "loss": 0.6912, "step": 5470 }, { "epoch": 1.8245496997998667, "grad_norm": 4.046206474304199, "step": 5470 }, { "epoch": 1.8245496997998667, "learning_rate": 0.0002701848398928393, "step": 5470 }, { "epoch": 1.8245496997998667, "loss": 0.6588386297225952, "step": 5470 }, { "ce_loss": 0.24764564633369446, "epoch": 1.8245496997998667, "step": 5470 }, { "distill_loss": 0.16088978946208954, "epoch": 1.8245496997998667, "step": 5470 }, { "epoch": 1.8245496997998667, "ref_ce_loss": 0.19515611231327057, "step": 5470 }, { "epoch": 1.8245496997998667, "loss": 0.5033345222473145, "step": 5470 }, { "ce_loss": 0.22179868817329407, "epoch": 1.8245496997998667, "step": 5470 }, { "distill_loss": 0.16051587462425232, "epoch": 1.8245496997998667, "step": 5470 }, { "epoch": 1.8245496997998667, "ref_ce_loss": 0.08434831351041794, "step": 5470 }, { "epoch": 1.8245496997998667, "loss": 0.6149681210517883, "step": 5470 }, { "ce_loss": 0.25773754715919495, "epoch": 1.8245496997998667, "step": 5470 }, { "distill_loss": 0.1539229452610016, "epoch": 1.8245496997998667, "step": 5470 }, { "epoch": 1.8245496997998667, "ref_ce_loss": 0.1424413025379181, "step": 5470 }, { "epoch": 1.8245496997998667, "loss": 0.44749850034713745, "step": 5470 }, { "ce_loss": 0.1712462604045868, "epoch": 1.8245496997998667, "step": 5470 }, { "distill_loss": 0.14961741864681244, "epoch": 1.8245496997998667, "step": 5470 }, { "epoch": 1.8245496997998667, "ref_ce_loss": 0.08178859949111938, "step": 5470 }, { "epoch": 1.827885256837892, "loss": 0.5976, "step": 5480 }, { "epoch": 1.827885256837892, "grad_norm": 2.5611064434051514, "step": 5480 }, { "epoch": 1.827885256837892, "learning_rate": 0.00027006352701677583, "step": 5480 }, { "epoch": 1.827885256837892, "loss": 1.1137487888336182, "step": 5480 }, { "ce_loss": 0.10859647393226624, "epoch": 1.827885256837892, "step": 5480 }, { "distill_loss": 0.11339569091796875, "epoch": 1.827885256837892, "step": 5480 }, { "epoch": 1.827885256837892, "ref_ce_loss": 0.0940646305680275, "step": 5480 }, { "epoch": 1.827885256837892, "loss": 0.6393406987190247, "step": 5480 }, { "ce_loss": 0.20108860731124878, "epoch": 1.827885256837892, "step": 5480 }, { "distill_loss": 0.13915398716926575, "epoch": 1.827885256837892, "step": 5480 }, { "epoch": 1.827885256837892, "ref_ce_loss": 0.12006493657827377, "step": 5480 }, { "epoch": 1.827885256837892, "loss": 0.6720687747001648, "step": 5480 }, { "ce_loss": 0.23451204597949982, "epoch": 1.827885256837892, "step": 5480 }, { "distill_loss": 0.19288182258605957, "epoch": 1.827885256837892, "step": 5480 }, { "epoch": 1.827885256837892, "ref_ce_loss": 0.17443564534187317, "step": 5480 }, { "epoch": 1.827885256837892, "loss": 0.5880801677703857, "step": 5480 }, { "ce_loss": 0.20224609971046448, "epoch": 1.827885256837892, "step": 5480 }, { "distill_loss": 0.17057499289512634, "epoch": 1.827885256837892, "step": 5480 }, { "epoch": 1.827885256837892, "ref_ce_loss": 0.13367359340190887, "step": 5480 }, { "epoch": 1.8312208138759174, "loss": 0.6369, "step": 5490 }, { "epoch": 1.8312208138759174, "grad_norm": 3.403841495513916, "step": 5490 }, { "epoch": 1.8312208138759174, "learning_rate": 0.0002699419951922003, "step": 5490 }, { "epoch": 1.8312208138759174, "loss": 0.5935007333755493, "step": 5490 }, { "ce_loss": 0.13772651553153992, "epoch": 1.8312208138759174, "step": 5490 }, { "distill_loss": 0.10074299573898315, "epoch": 1.8312208138759174, "step": 5490 }, { "epoch": 1.8312208138759174, "ref_ce_loss": 0.1018882468342781, "step": 5490 }, { "epoch": 1.8312208138759174, "loss": 0.6726405024528503, "step": 5490 }, { "ce_loss": 0.19621394574642181, "epoch": 1.8312208138759174, "step": 5490 }, { "distill_loss": 0.11780223250389099, "epoch": 1.8312208138759174, "step": 5490 }, { "epoch": 1.8312208138759174, "ref_ce_loss": 0.15497581660747528, "step": 5490 }, { "epoch": 1.8312208138759174, "loss": 1.072763442993164, "step": 5490 }, { "ce_loss": 0.23977363109588623, "epoch": 1.8312208138759174, "step": 5490 }, { "distill_loss": 0.13159865140914917, "epoch": 1.8312208138759174, "step": 5490 }, { "epoch": 1.8312208138759174, "ref_ce_loss": 0.14965175092220306, "step": 5490 }, { "epoch": 1.8312208138759174, "loss": 0.3933233320713043, "step": 5490 }, { "ce_loss": 0.16734004020690918, "epoch": 1.8312208138759174, "step": 5490 }, { "distill_loss": 0.11768080294132233, "epoch": 1.8312208138759174, "step": 5490 }, { "epoch": 1.8312208138759174, "ref_ce_loss": 0.10801305621862411, "step": 5490 }, { "epoch": 1.8345563709139427, "loss": 0.552, "step": 5500 }, { "epoch": 1.8345563709139427, "grad_norm": 3.3497848510742188, "step": 5500 }, { "epoch": 1.8345563709139427, "learning_rate": 0.0002698202446407388, "step": 5500 }, { "epoch": 1.8345563709139427, "loss": 0.4976787269115448, "step": 5500 }, { "ce_loss": 0.1326054334640503, "epoch": 1.8345563709139427, "step": 5500 }, { "distill_loss": 0.08439873158931732, "epoch": 1.8345563709139427, "step": 5500 }, { "epoch": 1.8345563709139427, "ref_ce_loss": 0.13565798103809357, "step": 5500 }, { "epoch": 1.8345563709139427, "loss": 0.6742888689041138, "step": 5500 }, { "ce_loss": 0.2550393342971802, "epoch": 1.8345563709139427, "step": 5500 }, { "distill_loss": 0.12310586869716644, "epoch": 1.8345563709139427, "step": 5500 }, { "epoch": 1.8345563709139427, "ref_ce_loss": 0.2118934988975525, "step": 5500 }, { "epoch": 1.8345563709139427, "loss": 1.2174590826034546, "step": 5500 }, { "ce_loss": 0.22371555864810944, "epoch": 1.8345563709139427, "step": 5500 }, { "distill_loss": 0.1468636691570282, "epoch": 1.8345563709139427, "step": 5500 }, { "epoch": 1.8345563709139427, "ref_ce_loss": 0.09248271584510803, "step": 5500 }, { "epoch": 1.8345563709139427, "loss": 0.3218141198158264, "step": 5500 }, { "ce_loss": 0.11735627055168152, "epoch": 1.8345563709139427, "step": 5500 }, { "distill_loss": 0.09057797491550446, "epoch": 1.8345563709139427, "step": 5500 }, { "epoch": 1.8345563709139427, "ref_ce_loss": 0.11361885070800781, "step": 5500 }, { "epoch": 1.837891927951968, "loss": 0.5876, "step": 5510 }, { "epoch": 1.837891927951968, "grad_norm": 3.7035536766052246, "step": 5510 }, { "epoch": 1.837891927951968, "learning_rate": 0.0002696982755844163, "step": 5510 }, { "epoch": 1.837891927951968, "loss": 0.48578375577926636, "step": 5510 }, { "ce_loss": 0.20499315857887268, "epoch": 1.837891927951968, "step": 5510 }, { "distill_loss": 0.13509148359298706, "epoch": 1.837891927951968, "step": 5510 }, { "epoch": 1.837891927951968, "ref_ce_loss": 0.14547577500343323, "step": 5510 }, { "epoch": 1.837891927951968, "loss": 0.5209768414497375, "step": 5510 }, { "ce_loss": 0.12706170976161957, "epoch": 1.837891927951968, "step": 5510 }, { "distill_loss": 0.11170224845409393, "epoch": 1.837891927951968, "step": 5510 }, { "epoch": 1.837891927951968, "ref_ce_loss": 0.11799715459346771, "step": 5510 }, { "epoch": 1.837891927951968, "loss": 0.4726123809814453, "step": 5510 }, { "ce_loss": 0.13671162724494934, "epoch": 1.837891927951968, "step": 5510 }, { "distill_loss": 0.1430138349533081, "epoch": 1.837891927951968, "step": 5510 }, { "epoch": 1.837891927951968, "ref_ce_loss": 0.14614635705947876, "step": 5510 }, { "epoch": 1.837891927951968, "loss": 0.4414920210838318, "step": 5510 }, { "ce_loss": 0.14068323373794556, "epoch": 1.837891927951968, "step": 5510 }, { "distill_loss": 0.1583692580461502, "epoch": 1.837891927951968, "step": 5510 }, { "epoch": 1.837891927951968, "ref_ce_loss": 0.10938449949026108, "step": 5510 }, { "epoch": 1.8412274849899934, "loss": 0.5865, "step": 5520 }, { "epoch": 1.8412274849899934, "grad_norm": 10.9885835647583, "step": 5520 }, { "epoch": 1.8412274849899934, "learning_rate": 0.0002695760882456563, "step": 5520 }, { "epoch": 1.8412274849899934, "loss": 0.49264997243881226, "step": 5520 }, { "ce_loss": 0.15945468842983246, "epoch": 1.8412274849899934, "step": 5520 }, { "distill_loss": 0.11915487051010132, "epoch": 1.8412274849899934, "step": 5520 }, { "epoch": 1.8412274849899934, "ref_ce_loss": 0.10155269503593445, "step": 5520 }, { "epoch": 1.8412274849899934, "loss": 0.4440663754940033, "step": 5520 }, { "ce_loss": 0.1484590470790863, "epoch": 1.8412274849899934, "step": 5520 }, { "distill_loss": 0.11816152930259705, "epoch": 1.8412274849899934, "step": 5520 }, { "epoch": 1.8412274849899934, "ref_ce_loss": 0.1403661072254181, "step": 5520 }, { "epoch": 1.8412274849899934, "loss": 0.5174403190612793, "step": 5520 }, { "ce_loss": 0.12795519828796387, "epoch": 1.8412274849899934, "step": 5520 }, { "distill_loss": 0.12109988927841187, "epoch": 1.8412274849899934, "step": 5520 }, { "epoch": 1.8412274849899934, "ref_ce_loss": 0.09429611265659332, "step": 5520 }, { "epoch": 1.8412274849899934, "loss": 0.6381317377090454, "step": 5520 }, { "ce_loss": 0.17537306249141693, "epoch": 1.8412274849899934, "step": 5520 }, { "distill_loss": 0.12010416388511658, "epoch": 1.8412274849899934, "step": 5520 }, { "epoch": 1.8412274849899934, "ref_ce_loss": 0.1554737389087677, "step": 5520 }, { "epoch": 1.8445630420280188, "loss": 0.588, "step": 5530 }, { "epoch": 1.8445630420280188, "grad_norm": 2.1283841133117676, "step": 5530 }, { "epoch": 1.8445630420280188, "learning_rate": 0.00026945368284728014, "step": 5530 }, { "epoch": 1.8445630420280188, "loss": 0.4387381374835968, "step": 5530 }, { "ce_loss": 0.20292159914970398, "epoch": 1.8445630420280188, "step": 5530 }, { "distill_loss": 0.10617952048778534, "epoch": 1.8445630420280188, "step": 5530 }, { "epoch": 1.8445630420280188, "ref_ce_loss": 0.12931837141513824, "step": 5530 }, { "epoch": 1.8445630420280188, "loss": 0.7084087133407593, "step": 5530 }, { "ce_loss": 0.2622057795524597, "epoch": 1.8445630420280188, "step": 5530 }, { "distill_loss": 0.11426561325788498, "epoch": 1.8445630420280188, "step": 5530 }, { "epoch": 1.8445630420280188, "ref_ce_loss": 0.1607864797115326, "step": 5530 }, { "epoch": 1.8445630420280188, "loss": 0.523222804069519, "step": 5530 }, { "ce_loss": 0.24114282429218292, "epoch": 1.8445630420280188, "step": 5530 }, { "distill_loss": 0.09182117134332657, "epoch": 1.8445630420280188, "step": 5530 }, { "epoch": 1.8445630420280188, "ref_ce_loss": 0.12573231756687164, "step": 5530 }, { "epoch": 1.8445630420280188, "loss": 0.5910396575927734, "step": 5530 }, { "ce_loss": 0.16739298403263092, "epoch": 1.8445630420280188, "step": 5530 }, { "distill_loss": 0.09659696370363235, "epoch": 1.8445630420280188, "step": 5530 }, { "epoch": 1.8445630420280188, "ref_ce_loss": 0.1586391180753708, "step": 5530 }, { "epoch": 1.8478985990660441, "loss": 0.5762, "step": 5540 }, { "epoch": 1.8478985990660441, "grad_norm": 3.0936765670776367, "step": 5540 }, { "epoch": 1.8478985990660441, "learning_rate": 0.0002693310596125072, "step": 5540 }, { "epoch": 1.8478985990660441, "loss": 0.5124553442001343, "step": 5540 }, { "ce_loss": 0.14690622687339783, "epoch": 1.8478985990660441, "step": 5540 }, { "distill_loss": 0.07775744795799255, "epoch": 1.8478985990660441, "step": 5540 }, { "epoch": 1.8478985990660441, "ref_ce_loss": 0.11736592650413513, "step": 5540 }, { "epoch": 1.8478985990660441, "loss": 0.5300059914588928, "step": 5540 }, { "ce_loss": 0.20057353377342224, "epoch": 1.8478985990660441, "step": 5540 }, { "distill_loss": 0.09820909053087234, "epoch": 1.8478985990660441, "step": 5540 }, { "epoch": 1.8478985990660441, "ref_ce_loss": 0.18187911808490753, "step": 5540 }, { "epoch": 1.8478985990660441, "loss": 0.4994697570800781, "step": 5540 }, { "ce_loss": 0.22162176668643951, "epoch": 1.8478985990660441, "step": 5540 }, { "distill_loss": 0.1110476404428482, "epoch": 1.8478985990660441, "step": 5540 }, { "epoch": 1.8478985990660441, "ref_ce_loss": 0.16640129685401917, "step": 5540 }, { "epoch": 1.8478985990660441, "loss": 0.4754143953323364, "step": 5540 }, { "ce_loss": 0.22003041207790375, "epoch": 1.8478985990660441, "step": 5540 }, { "distill_loss": 0.09422904998064041, "epoch": 1.8478985990660441, "step": 5540 }, { "epoch": 1.8478985990660441, "ref_ce_loss": 0.1313227266073227, "step": 5540 }, { "epoch": 1.8512341561040695, "loss": 0.5599, "step": 5550 }, { "epoch": 1.8512341561040695, "grad_norm": 2.980518102645874, "step": 5550 }, { "epoch": 1.8512341561040695, "learning_rate": 0.00026920821876495374, "step": 5550 }, { "epoch": 1.8512341561040695, "loss": 0.8818994760513306, "step": 5550 }, { "ce_loss": 0.2660450339317322, "epoch": 1.8512341561040695, "step": 5550 }, { "distill_loss": 0.12561841309070587, "epoch": 1.8512341561040695, "step": 5550 }, { "epoch": 1.8512341561040695, "ref_ce_loss": 0.23656994104385376, "step": 5550 }, { "epoch": 1.8512341561040695, "loss": 0.40436068177223206, "step": 5550 }, { "ce_loss": 0.16543236374855042, "epoch": 1.8512341561040695, "step": 5550 }, { "distill_loss": 0.1111280545592308, "epoch": 1.8512341561040695, "step": 5550 }, { "epoch": 1.8512341561040695, "ref_ce_loss": 0.12749433517456055, "step": 5550 }, { "epoch": 1.8512341561040695, "loss": 0.610494077205658, "step": 5550 }, { "ce_loss": 0.18998795747756958, "epoch": 1.8512341561040695, "step": 5550 }, { "distill_loss": 0.12114927917718887, "epoch": 1.8512341561040695, "step": 5550 }, { "epoch": 1.8512341561040695, "ref_ce_loss": 0.13568200170993805, "step": 5550 }, { "epoch": 1.8512341561040695, "loss": 0.3950880169868469, "step": 5550 }, { "ce_loss": 0.1385015845298767, "epoch": 1.8512341561040695, "step": 5550 }, { "distill_loss": 0.10792937129735947, "epoch": 1.8512341561040695, "step": 5550 }, { "epoch": 1.8512341561040695, "ref_ce_loss": 0.09195967018604279, "step": 5550 }, { "epoch": 1.8545697131420948, "loss": 0.6336, "step": 5560 }, { "epoch": 1.8545697131420948, "grad_norm": 3.8770010471343994, "step": 5560 }, { "epoch": 1.8545697131420948, "learning_rate": 0.00026908516052863305, "step": 5560 }, { "epoch": 1.8545697131420948, "loss": 0.5988621115684509, "step": 5560 }, { "ce_loss": 0.275103896856308, "epoch": 1.8545697131420948, "step": 5560 }, { "distill_loss": 0.13239721953868866, "epoch": 1.8545697131420948, "step": 5560 }, { "epoch": 1.8545697131420948, "ref_ce_loss": 0.1453436315059662, "step": 5560 }, { "epoch": 1.8545697131420948, "loss": 0.5324811339378357, "step": 5560 }, { "ce_loss": 0.20064669847488403, "epoch": 1.8545697131420948, "step": 5560 }, { "distill_loss": 0.16439953446388245, "epoch": 1.8545697131420948, "step": 5560 }, { "epoch": 1.8545697131420948, "ref_ce_loss": 0.12346375733613968, "step": 5560 }, { "epoch": 1.8545697131420948, "loss": 0.9219769239425659, "step": 5560 }, { "ce_loss": 0.25965479016304016, "epoch": 1.8545697131420948, "step": 5560 }, { "distill_loss": 0.15143707394599915, "epoch": 1.8545697131420948, "step": 5560 }, { "epoch": 1.8545697131420948, "ref_ce_loss": 0.19766446948051453, "step": 5560 }, { "epoch": 1.8545697131420948, "loss": 0.7507549524307251, "step": 5560 }, { "ce_loss": 0.23712469637393951, "epoch": 1.8545697131420948, "step": 5560 }, { "distill_loss": 0.1717062145471573, "epoch": 1.8545697131420948, "step": 5560 }, { "epoch": 1.8545697131420948, "ref_ce_loss": 0.18241867423057556, "step": 5560 }, { "epoch": 1.8579052701801202, "loss": 0.6086, "step": 5570 }, { "epoch": 1.8579052701801202, "grad_norm": 2.5605103969573975, "step": 5570 }, { "epoch": 1.8579052701801202, "learning_rate": 0.0002689618851279549, "step": 5570 }, { "epoch": 1.8579052701801202, "loss": 0.6851871013641357, "step": 5570 }, { "ce_loss": 0.2842769920825958, "epoch": 1.8579052701801202, "step": 5570 }, { "distill_loss": 0.15424621105194092, "epoch": 1.8579052701801202, "step": 5570 }, { "epoch": 1.8579052701801202, "ref_ce_loss": 0.18643635511398315, "step": 5570 }, { "epoch": 1.8579052701801202, "loss": 0.386451780796051, "step": 5570 }, { "ce_loss": 0.15897487103939056, "epoch": 1.8579052701801202, "step": 5570 }, { "distill_loss": 0.11713258177042007, "epoch": 1.8579052701801202, "step": 5570 }, { "epoch": 1.8579052701801202, "ref_ce_loss": 0.10942494124174118, "step": 5570 }, { "epoch": 1.8579052701801202, "loss": 0.7646214962005615, "step": 5570 }, { "ce_loss": 0.18946322798728943, "epoch": 1.8579052701801202, "step": 5570 }, { "distill_loss": 0.15247109532356262, "epoch": 1.8579052701801202, "step": 5570 }, { "epoch": 1.8579052701801202, "ref_ce_loss": 0.12387377768754959, "step": 5570 }, { "epoch": 1.8579052701801202, "loss": 0.3443436920642853, "step": 5570 }, { "ce_loss": 0.05924952030181885, "epoch": 1.8579052701801202, "step": 5570 }, { "distill_loss": 0.10812821984291077, "epoch": 1.8579052701801202, "step": 5570 }, { "epoch": 1.8579052701801202, "ref_ce_loss": 0.07667309790849686, "step": 5570 }, { "epoch": 1.8612408272181455, "loss": 0.5919, "step": 5580 }, { "epoch": 1.8612408272181455, "grad_norm": 4.265673637390137, "step": 5580 }, { "epoch": 1.8612408272181455, "learning_rate": 0.0002688383927877248, "step": 5580 }, { "epoch": 1.8612408272181455, "loss": 0.4172850251197815, "step": 5580 }, { "ce_loss": 0.1882062405347824, "epoch": 1.8612408272181455, "step": 5580 }, { "distill_loss": 0.09461406618356705, "epoch": 1.8612408272181455, "step": 5580 }, { "epoch": 1.8612408272181455, "ref_ce_loss": 0.09314027428627014, "step": 5580 }, { "epoch": 1.8612408272181455, "loss": 0.6765507459640503, "step": 5580 }, { "ce_loss": 0.1747109591960907, "epoch": 1.8612408272181455, "step": 5580 }, { "distill_loss": 0.06782689690589905, "epoch": 1.8612408272181455, "step": 5580 }, { "epoch": 1.8612408272181455, "ref_ce_loss": 0.12354877591133118, "step": 5580 }, { "epoch": 1.8612408272181455, "loss": 0.6376481056213379, "step": 5580 }, { "ce_loss": 0.16509592533111572, "epoch": 1.8612408272181455, "step": 5580 }, { "distill_loss": 0.10020028054714203, "epoch": 1.8612408272181455, "step": 5580 }, { "epoch": 1.8612408272181455, "ref_ce_loss": 0.09290022403001785, "step": 5580 }, { "epoch": 1.8612408272181455, "loss": 0.540982186794281, "step": 5580 }, { "ce_loss": 0.19303928315639496, "epoch": 1.8612408272181455, "step": 5580 }, { "distill_loss": 0.11706292629241943, "epoch": 1.8612408272181455, "step": 5580 }, { "epoch": 1.8612408272181455, "ref_ce_loss": 0.15270400047302246, "step": 5580 }, { "epoch": 1.864576384256171, "loss": 0.5862, "step": 5590 }, { "epoch": 1.864576384256171, "grad_norm": 2.270423650741577, "step": 5590 }, { "epoch": 1.864576384256171, "learning_rate": 0.00026871468373314424, "step": 5590 }, { "epoch": 1.864576384256171, "loss": 0.55879807472229, "step": 5590 }, { "ce_loss": 0.2468176931142807, "epoch": 1.864576384256171, "step": 5590 }, { "distill_loss": 0.09950075298547745, "epoch": 1.864576384256171, "step": 5590 }, { "epoch": 1.864576384256171, "ref_ce_loss": 0.15365587174892426, "step": 5590 }, { "epoch": 1.864576384256171, "loss": 0.6020499467849731, "step": 5590 }, { "ce_loss": 0.18916167318820953, "epoch": 1.864576384256171, "step": 5590 }, { "distill_loss": 0.11123539507389069, "epoch": 1.864576384256171, "step": 5590 }, { "epoch": 1.864576384256171, "ref_ce_loss": 0.214087575674057, "step": 5590 }, { "epoch": 1.864576384256171, "loss": 0.740506112575531, "step": 5590 }, { "ce_loss": 0.053918082267045975, "epoch": 1.864576384256171, "step": 5590 }, { "distill_loss": 0.07949318736791611, "epoch": 1.864576384256171, "step": 5590 }, { "epoch": 1.864576384256171, "ref_ce_loss": 0.08161577582359314, "step": 5590 }, { "epoch": 1.864576384256171, "loss": 0.5570569038391113, "step": 5590 }, { "ce_loss": 0.2485281080007553, "epoch": 1.864576384256171, "step": 5590 }, { "distill_loss": 0.13526736199855804, "epoch": 1.864576384256171, "step": 5590 }, { "epoch": 1.864576384256171, "ref_ce_loss": 0.1323298215866089, "step": 5590 }, { "epoch": 1.8679119412941962, "loss": 0.612, "step": 5600 }, { "epoch": 1.8679119412941962, "grad_norm": 2.4391157627105713, "step": 5600 }, { "epoch": 1.8679119412941962, "learning_rate": 0.0002685907581898097, "step": 5600 }, { "epoch": 1.8679119412941962, "loss": 0.40929847955703735, "step": 5600 }, { "ce_loss": 0.145944282412529, "epoch": 1.8679119412941962, "step": 5600 }, { "distill_loss": 0.1584329605102539, "epoch": 1.8679119412941962, "step": 5600 }, { "epoch": 1.8679119412941962, "ref_ce_loss": 0.10474085062742233, "step": 5600 }, { "epoch": 1.8679119412941962, "loss": 0.5263919830322266, "step": 5600 }, { "ce_loss": 0.1532854288816452, "epoch": 1.8679119412941962, "step": 5600 }, { "distill_loss": 0.13151933252811432, "epoch": 1.8679119412941962, "step": 5600 }, { "epoch": 1.8679119412941962, "ref_ce_loss": 0.14014704525470734, "step": 5600 }, { "epoch": 1.8679119412941962, "loss": 0.509942889213562, "step": 5600 }, { "ce_loss": 0.1427435576915741, "epoch": 1.8679119412941962, "step": 5600 }, { "distill_loss": 0.13369926810264587, "epoch": 1.8679119412941962, "step": 5600 }, { "epoch": 1.8679119412941962, "ref_ce_loss": 0.1754363775253296, "step": 5600 }, { "epoch": 1.8679119412941962, "loss": 0.4154976010322571, "step": 5600 }, { "ce_loss": 0.16734033823013306, "epoch": 1.8679119412941962, "step": 5600 }, { "distill_loss": 0.11521608382463455, "epoch": 1.8679119412941962, "step": 5600 }, { "epoch": 1.8679119412941962, "ref_ce_loss": 0.09764881432056427, "step": 5600 }, { "epoch": 1.8712474983322216, "loss": 0.6108, "step": 5610 }, { "epoch": 1.8712474983322216, "grad_norm": 2.0599684715270996, "step": 5610 }, { "epoch": 1.8712474983322216, "learning_rate": 0.0002684666163837124, "step": 5610 }, { "epoch": 1.8712474983322216, "loss": 0.45766517519950867, "step": 5610 }, { "ce_loss": 0.1805308759212494, "epoch": 1.8712474983322216, "step": 5610 }, { "distill_loss": 0.11159662157297134, "epoch": 1.8712474983322216, "step": 5610 }, { "epoch": 1.8712474983322216, "ref_ce_loss": 0.1650044173002243, "step": 5610 }, { "epoch": 1.8712474983322216, "loss": 0.4901134967803955, "step": 5610 }, { "ce_loss": 0.20846788585186005, "epoch": 1.8712474983322216, "step": 5610 }, { "distill_loss": 0.1440722793340683, "epoch": 1.8712474983322216, "step": 5610 }, { "epoch": 1.8712474983322216, "ref_ce_loss": 0.13724854588508606, "step": 5610 }, { "epoch": 1.8712474983322216, "loss": 0.5946434140205383, "step": 5610 }, { "ce_loss": 0.2532452642917633, "epoch": 1.8712474983322216, "step": 5610 }, { "distill_loss": 0.16743330657482147, "epoch": 1.8712474983322216, "step": 5610 }, { "epoch": 1.8712474983322216, "ref_ce_loss": 0.10788458585739136, "step": 5610 }, { "epoch": 1.8712474983322216, "loss": 0.8087218403816223, "step": 5610 }, { "ce_loss": 0.22373901307582855, "epoch": 1.8712474983322216, "step": 5610 }, { "distill_loss": 0.11417116969823837, "epoch": 1.8712474983322216, "step": 5610 }, { "epoch": 1.8712474983322216, "ref_ce_loss": 0.1280953735113144, "step": 5610 }, { "epoch": 1.874583055370247, "loss": 0.585, "step": 5620 }, { "epoch": 1.874583055370247, "grad_norm": 2.23083758354187, "step": 5620 }, { "epoch": 1.874583055370247, "learning_rate": 0.0002683422585412381, "step": 5620 }, { "epoch": 1.874583055370247, "loss": 0.41577935218811035, "step": 5620 }, { "ce_loss": 0.15030327439308167, "epoch": 1.874583055370247, "step": 5620 }, { "distill_loss": 0.11963702738285065, "epoch": 1.874583055370247, "step": 5620 }, { "epoch": 1.874583055370247, "ref_ce_loss": 0.09877686202526093, "step": 5620 }, { "epoch": 1.874583055370247, "loss": 0.4805477559566498, "step": 5620 }, { "ce_loss": 0.19180341064929962, "epoch": 1.874583055370247, "step": 5620 }, { "distill_loss": 0.10885943472385406, "epoch": 1.874583055370247, "step": 5620 }, { "epoch": 1.874583055370247, "ref_ce_loss": 0.11150091886520386, "step": 5620 }, { "epoch": 1.874583055370247, "loss": 0.9445975422859192, "step": 5620 }, { "ce_loss": 0.1629776507616043, "epoch": 1.874583055370247, "step": 5620 }, { "distill_loss": 0.12794573605060577, "epoch": 1.874583055370247, "step": 5620 }, { "epoch": 1.874583055370247, "ref_ce_loss": 0.13277025520801544, "step": 5620 }, { "epoch": 1.874583055370247, "loss": 0.9632389545440674, "step": 5620 }, { "ce_loss": 0.30103176832199097, "epoch": 1.874583055370247, "step": 5620 }, { "distill_loss": 0.15930506587028503, "epoch": 1.874583055370247, "step": 5620 }, { "epoch": 1.874583055370247, "ref_ce_loss": 0.15233488380908966, "step": 5620 }, { "epoch": 1.8779186124082723, "loss": 0.6478, "step": 5630 }, { "epoch": 1.8779186124082723, "grad_norm": 2.3699145317077637, "step": 5630 }, { "epoch": 1.8779186124082723, "learning_rate": 0.00026821768488916644, "step": 5630 }, { "epoch": 1.8779186124082723, "loss": 0.551582932472229, "step": 5630 }, { "ce_loss": 0.09384066611528397, "epoch": 1.8779186124082723, "step": 5630 }, { "distill_loss": 0.06880702078342438, "epoch": 1.8779186124082723, "step": 5630 }, { "epoch": 1.8779186124082723, "ref_ce_loss": 0.09983465075492859, "step": 5630 }, { "epoch": 1.8779186124082723, "loss": 0.856539249420166, "step": 5630 }, { "ce_loss": 0.2709778845310211, "epoch": 1.8779186124082723, "step": 5630 }, { "distill_loss": 0.15140986442565918, "epoch": 1.8779186124082723, "step": 5630 }, { "epoch": 1.8779186124082723, "ref_ce_loss": 0.15218645334243774, "step": 5630 }, { "epoch": 1.8779186124082723, "loss": 0.26097142696380615, "step": 5630 }, { "ce_loss": 0.06970866024494171, "epoch": 1.8779186124082723, "step": 5630 }, { "distill_loss": 0.09343140572309494, "epoch": 1.8779186124082723, "step": 5630 }, { "epoch": 1.8779186124082723, "ref_ce_loss": 0.0976763591170311, "step": 5630 }, { "epoch": 1.8779186124082723, "loss": 0.6135655045509338, "step": 5630 }, { "ce_loss": 0.33973413705825806, "epoch": 1.8779186124082723, "step": 5630 }, { "distill_loss": 0.13375625014305115, "epoch": 1.8779186124082723, "step": 5630 }, { "epoch": 1.8779186124082723, "ref_ce_loss": 0.13988807797431946, "step": 5630 }, { "epoch": 1.8812541694462976, "loss": 0.5563, "step": 5640 }, { "epoch": 1.8812541694462976, "grad_norm": 2.838254451751709, "step": 5640 }, { "epoch": 1.8812541694462976, "learning_rate": 0.0002680928956546706, "step": 5640 }, { "epoch": 1.8812541694462976, "loss": 0.5370069146156311, "step": 5640 }, { "ce_loss": 0.2182268649339676, "epoch": 1.8812541694462976, "step": 5640 }, { "distill_loss": 0.13402530550956726, "epoch": 1.8812541694462976, "step": 5640 }, { "epoch": 1.8812541694462976, "ref_ce_loss": 0.11913847923278809, "step": 5640 }, { "epoch": 1.8812541694462976, "loss": 0.6514466404914856, "step": 5640 }, { "ce_loss": 0.2067224681377411, "epoch": 1.8812541694462976, "step": 5640 }, { "distill_loss": 0.1199726015329361, "epoch": 1.8812541694462976, "step": 5640 }, { "epoch": 1.8812541694462976, "ref_ce_loss": 0.1521792709827423, "step": 5640 }, { "epoch": 1.8812541694462976, "loss": 0.4596925377845764, "step": 5640 }, { "ce_loss": 0.16043129563331604, "epoch": 1.8812541694462976, "step": 5640 }, { "distill_loss": 0.10075315088033676, "epoch": 1.8812541694462976, "step": 5640 }, { "epoch": 1.8812541694462976, "ref_ce_loss": 0.15545539557933807, "step": 5640 }, { "epoch": 1.8812541694462976, "loss": 0.6942105889320374, "step": 5640 }, { "ce_loss": 0.2646586298942566, "epoch": 1.8812541694462976, "step": 5640 }, { "distill_loss": 0.10686776787042618, "epoch": 1.8812541694462976, "step": 5640 }, { "epoch": 1.8812541694462976, "ref_ce_loss": 0.14801819622516632, "step": 5640 }, { "epoch": 1.884589726484323, "loss": 0.5193, "step": 5650 }, { "epoch": 1.884589726484323, "grad_norm": 2.476203203201294, "step": 5650 }, { "epoch": 1.884589726484323, "learning_rate": 0.00026796789106531694, "step": 5650 }, { "epoch": 1.884589726484323, "loss": 0.5218818783760071, "step": 5650 }, { "ce_loss": 0.2017936408519745, "epoch": 1.884589726484323, "step": 5650 }, { "distill_loss": 0.09580246359109879, "epoch": 1.884589726484323, "step": 5650 }, { "epoch": 1.884589726484323, "ref_ce_loss": 0.15740014612674713, "step": 5650 }, { "epoch": 1.884589726484323, "loss": 0.7491849064826965, "step": 5650 }, { "ce_loss": 0.3006199896335602, "epoch": 1.884589726484323, "step": 5650 }, { "distill_loss": 0.14339643716812134, "epoch": 1.884589726484323, "step": 5650 }, { "epoch": 1.884589726484323, "ref_ce_loss": 0.14143864810466766, "step": 5650 }, { "epoch": 1.884589726484323, "loss": 0.5106698274612427, "step": 5650 }, { "ce_loss": 0.22800149023532867, "epoch": 1.884589726484323, "step": 5650 }, { "distill_loss": 0.10951074957847595, "epoch": 1.884589726484323, "step": 5650 }, { "epoch": 1.884589726484323, "ref_ce_loss": 0.17092949151992798, "step": 5650 }, { "epoch": 1.884589726484323, "loss": 0.35551875829696655, "step": 5650 }, { "ce_loss": 0.09727875888347626, "epoch": 1.884589726484323, "step": 5650 }, { "distill_loss": 0.0897751897573471, "epoch": 1.884589726484323, "step": 5650 }, { "epoch": 1.884589726484323, "ref_ce_loss": 0.11061455309391022, "step": 5650 }, { "epoch": 1.8879252835223483, "loss": 0.5365, "step": 5660 }, { "epoch": 1.8879252835223483, "grad_norm": 2.804887533187866, "step": 5660 }, { "epoch": 1.8879252835223483, "learning_rate": 0.0002678426713490645, "step": 5660 }, { "epoch": 1.8879252835223483, "loss": 0.7548896670341492, "step": 5660 }, { "ce_loss": 0.2367536723613739, "epoch": 1.8879252835223483, "step": 5660 }, { "distill_loss": 0.08415091782808304, "epoch": 1.8879252835223483, "step": 5660 }, { "epoch": 1.8879252835223483, "ref_ce_loss": 0.13766418397426605, "step": 5660 }, { "epoch": 1.8879252835223483, "loss": 0.35744622349739075, "step": 5660 }, { "ce_loss": 0.15886476635932922, "epoch": 1.8879252835223483, "step": 5660 }, { "distill_loss": 0.06705141812562943, "epoch": 1.8879252835223483, "step": 5660 }, { "epoch": 1.8879252835223483, "ref_ce_loss": 0.13137438893318176, "step": 5660 }, { "epoch": 1.8879252835223483, "loss": 0.49181342124938965, "step": 5660 }, { "ce_loss": 0.18229323625564575, "epoch": 1.8879252835223483, "step": 5660 }, { "distill_loss": 0.06833770871162415, "epoch": 1.8879252835223483, "step": 5660 }, { "epoch": 1.8879252835223483, "ref_ce_loss": 0.12497011572122574, "step": 5660 }, { "epoch": 1.8879252835223483, "loss": 0.45327556133270264, "step": 5660 }, { "ce_loss": 0.14888149499893188, "epoch": 1.8879252835223483, "step": 5660 }, { "distill_loss": 0.07899749279022217, "epoch": 1.8879252835223483, "step": 5660 }, { "epoch": 1.8879252835223483, "ref_ce_loss": 0.12956686317920685, "step": 5660 }, { "epoch": 1.8912608405603737, "loss": 0.6139, "step": 5670 }, { "epoch": 1.8912608405603737, "grad_norm": 3.510312557220459, "step": 5670 }, { "epoch": 1.8912608405603737, "learning_rate": 0.0002677172367342646, "step": 5670 }, { "epoch": 1.8912608405603737, "loss": 0.6080343127250671, "step": 5670 }, { "ce_loss": 0.18910494446754456, "epoch": 1.8912608405603737, "step": 5670 }, { "distill_loss": 0.10964290797710419, "epoch": 1.8912608405603737, "step": 5670 }, { "epoch": 1.8912608405603737, "ref_ce_loss": 0.19770650565624237, "step": 5670 }, { "epoch": 1.8912608405603737, "loss": 0.3534941077232361, "step": 5670 }, { "ce_loss": 0.1675875186920166, "epoch": 1.8912608405603737, "step": 5670 }, { "distill_loss": 0.10437571257352829, "epoch": 1.8912608405603737, "step": 5670 }, { "epoch": 1.8912608405603737, "ref_ce_loss": 0.08139434456825256, "step": 5670 }, { "epoch": 1.8912608405603737, "loss": 0.34793180227279663, "step": 5670 }, { "ce_loss": 0.11697270721197128, "epoch": 1.8912608405603737, "step": 5670 }, { "distill_loss": 0.09839901328086853, "epoch": 1.8912608405603737, "step": 5670 }, { "epoch": 1.8912608405603737, "ref_ce_loss": 0.13236558437347412, "step": 5670 }, { "epoch": 1.8912608405603737, "loss": 0.9213211536407471, "step": 5670 }, { "ce_loss": 0.31074386835098267, "epoch": 1.8912608405603737, "step": 5670 }, { "distill_loss": 0.19469662010669708, "epoch": 1.8912608405603737, "step": 5670 }, { "epoch": 1.8912608405603737, "ref_ce_loss": 0.14483579993247986, "step": 5670 }, { "epoch": 1.894596397598399, "loss": 0.6004, "step": 5680 }, { "epoch": 1.894596397598399, "grad_norm": 2.557516098022461, "step": 5680 }, { "epoch": 1.894596397598399, "learning_rate": 0.00026759158744966066, "step": 5680 }, { "epoch": 1.894596397598399, "loss": 0.41085487604141235, "step": 5680 }, { "ce_loss": 0.14269503951072693, "epoch": 1.894596397598399, "step": 5680 }, { "distill_loss": 0.12160893529653549, "epoch": 1.894596397598399, "step": 5680 }, { "epoch": 1.894596397598399, "ref_ce_loss": 0.10581735521554947, "step": 5680 }, { "epoch": 1.894596397598399, "loss": 0.49807649850845337, "step": 5680 }, { "ce_loss": 0.14981544017791748, "epoch": 1.894596397598399, "step": 5680 }, { "distill_loss": 0.17303360998630524, "epoch": 1.894596397598399, "step": 5680 }, { "epoch": 1.894596397598399, "ref_ce_loss": 0.11994165182113647, "step": 5680 }, { "epoch": 1.894596397598399, "loss": 0.7087249159812927, "step": 5680 }, { "ce_loss": 0.20051364600658417, "epoch": 1.894596397598399, "step": 5680 }, { "distill_loss": 0.2588937282562256, "epoch": 1.894596397598399, "step": 5680 }, { "epoch": 1.894596397598399, "ref_ce_loss": 0.14541538059711456, "step": 5680 }, { "epoch": 1.894596397598399, "loss": 0.4271368384361267, "step": 5680 }, { "ce_loss": 0.16366565227508545, "epoch": 1.894596397598399, "step": 5680 }, { "distill_loss": 0.1520344465970993, "epoch": 1.894596397598399, "step": 5680 }, { "epoch": 1.894596397598399, "ref_ce_loss": 0.11106879264116287, "step": 5680 }, { "epoch": 1.8979319546364244, "loss": 0.6308, "step": 5690 }, { "epoch": 1.8979319546364244, "grad_norm": 2.758974075317383, "step": 5690 }, { "epoch": 1.8979319546364244, "learning_rate": 0.0002674657237243873, "step": 5690 }, { "epoch": 1.8979319546364244, "loss": 0.5080286264419556, "step": 5690 }, { "ce_loss": 0.13973647356033325, "epoch": 1.8979319546364244, "step": 5690 }, { "distill_loss": 0.21855957806110382, "epoch": 1.8979319546364244, "step": 5690 }, { "epoch": 1.8979319546364244, "ref_ce_loss": 0.10131456702947617, "step": 5690 }, { "epoch": 1.8979319546364244, "loss": 0.7637765407562256, "step": 5690 }, { "ce_loss": 0.1351289600133896, "epoch": 1.8979319546364244, "step": 5690 }, { "distill_loss": 0.14129924774169922, "epoch": 1.8979319546364244, "step": 5690 }, { "epoch": 1.8979319546364244, "ref_ce_loss": 0.06359421461820602, "step": 5690 }, { "epoch": 1.8979319546364244, "loss": 0.5765724182128906, "step": 5690 }, { "ce_loss": 0.24543757736682892, "epoch": 1.8979319546364244, "step": 5690 }, { "distill_loss": 0.1919766515493393, "epoch": 1.8979319546364244, "step": 5690 }, { "epoch": 1.8979319546364244, "ref_ce_loss": 0.13904628157615662, "step": 5690 }, { "epoch": 1.8979319546364244, "loss": 0.4055544435977936, "step": 5690 }, { "ce_loss": 0.07628988474607468, "epoch": 1.8979319546364244, "step": 5690 }, { "distill_loss": 0.1454552859067917, "epoch": 1.8979319546364244, "step": 5690 }, { "epoch": 1.8979319546364244, "ref_ce_loss": 0.13771748542785645, "step": 5690 }, { "epoch": 1.9012675116744497, "loss": 0.6209, "step": 5700 }, { "epoch": 1.9012675116744497, "grad_norm": 5.222294807434082, "step": 5700 }, { "epoch": 1.9012675116744497, "learning_rate": 0.0002673396457879703, "step": 5700 }, { "epoch": 1.9012675116744497, "loss": 0.7928350567817688, "step": 5700 }, { "ce_loss": 0.18566617369651794, "epoch": 1.9012675116744497, "step": 5700 }, { "distill_loss": 0.1641082614660263, "epoch": 1.9012675116744497, "step": 5700 }, { "epoch": 1.9012675116744497, "ref_ce_loss": 0.11963324248790741, "step": 5700 }, { "epoch": 1.9012675116744497, "loss": 0.9268467426300049, "step": 5700 }, { "ce_loss": 0.2279396653175354, "epoch": 1.9012675116744497, "step": 5700 }, { "distill_loss": 0.17557619512081146, "epoch": 1.9012675116744497, "step": 5700 }, { "epoch": 1.9012675116744497, "ref_ce_loss": 0.18660259246826172, "step": 5700 }, { "epoch": 1.9012675116744497, "loss": 0.3865606188774109, "step": 5700 }, { "ce_loss": 0.1271536648273468, "epoch": 1.9012675116744497, "step": 5700 }, { "distill_loss": 0.1433785855770111, "epoch": 1.9012675116744497, "step": 5700 }, { "epoch": 1.9012675116744497, "ref_ce_loss": 0.11587008833885193, "step": 5700 }, { "epoch": 1.9012675116744497, "loss": 0.5936480760574341, "step": 5700 }, { "ce_loss": 0.19039350748062134, "epoch": 1.9012675116744497, "step": 5700 }, { "distill_loss": 0.16909794509410858, "epoch": 1.9012675116744497, "step": 5700 }, { "epoch": 1.9012675116744497, "ref_ce_loss": 0.10260917991399765, "step": 5700 }, { "epoch": 1.904603068712475, "loss": 0.6287, "step": 5710 }, { "epoch": 1.904603068712475, "grad_norm": 3.323878526687622, "step": 5710 }, { "epoch": 1.904603068712475, "learning_rate": 0.00026721335387032603, "step": 5710 }, { "epoch": 1.904603068712475, "loss": 0.5773212909698486, "step": 5710 }, { "ce_loss": 0.20355701446533203, "epoch": 1.904603068712475, "step": 5710 }, { "distill_loss": 0.13835524022579193, "epoch": 1.904603068712475, "step": 5710 }, { "epoch": 1.904603068712475, "ref_ce_loss": 0.13054907321929932, "step": 5710 }, { "epoch": 1.904603068712475, "loss": 0.6119946241378784, "step": 5710 }, { "ce_loss": 0.08676505833864212, "epoch": 1.904603068712475, "step": 5710 }, { "distill_loss": 0.12342211604118347, "epoch": 1.904603068712475, "step": 5710 }, { "epoch": 1.904603068712475, "ref_ce_loss": 0.15018554031848907, "step": 5710 }, { "epoch": 1.904603068712475, "loss": 0.4971517324447632, "step": 5710 }, { "ce_loss": 0.11750080436468124, "epoch": 1.904603068712475, "step": 5710 }, { "distill_loss": 0.09639900177717209, "epoch": 1.904603068712475, "step": 5710 }, { "epoch": 1.904603068712475, "ref_ce_loss": 0.13823343813419342, "step": 5710 }, { "epoch": 1.904603068712475, "loss": 0.5045558214187622, "step": 5710 }, { "ce_loss": 0.10029848664999008, "epoch": 1.904603068712475, "step": 5710 }, { "distill_loss": 0.11299914866685867, "epoch": 1.904603068712475, "step": 5710 }, { "epoch": 1.904603068712475, "ref_ce_loss": 0.10818342119455338, "step": 5710 }, { "epoch": 1.9079386257505004, "loss": 0.5606, "step": 5720 }, { "epoch": 1.9079386257505004, "grad_norm": 4.714890003204346, "step": 5720 }, { "epoch": 1.9079386257505004, "learning_rate": 0.0002670868482017613, "step": 5720 }, { "epoch": 1.9079386257505004, "loss": 0.5226252675056458, "step": 5720 }, { "ce_loss": 0.2095170021057129, "epoch": 1.9079386257505004, "step": 5720 }, { "distill_loss": 0.11923301219940186, "epoch": 1.9079386257505004, "step": 5720 }, { "epoch": 1.9079386257505004, "ref_ce_loss": 0.16285623610019684, "step": 5720 }, { "epoch": 1.9079386257505004, "loss": 0.2856457829475403, "step": 5720 }, { "ce_loss": 0.09019511193037033, "epoch": 1.9079386257505004, "step": 5720 }, { "distill_loss": 0.10442131757736206, "epoch": 1.9079386257505004, "step": 5720 }, { "epoch": 1.9079386257505004, "ref_ce_loss": 0.09080447256565094, "step": 5720 }, { "epoch": 1.9079386257505004, "loss": 1.5851621627807617, "step": 5720 }, { "ce_loss": 0.34080591797828674, "epoch": 1.9079386257505004, "step": 5720 }, { "distill_loss": 0.1635332554578781, "epoch": 1.9079386257505004, "step": 5720 }, { "epoch": 1.9079386257505004, "ref_ce_loss": 0.24340665340423584, "step": 5720 }, { "epoch": 1.9079386257505004, "loss": 0.760962963104248, "step": 5720 }, { "ce_loss": 0.33830568194389343, "epoch": 1.9079386257505004, "step": 5720 }, { "distill_loss": 0.18809860944747925, "epoch": 1.9079386257505004, "step": 5720 }, { "epoch": 1.9079386257505004, "ref_ce_loss": 0.19881245493888855, "step": 5720 }, { "epoch": 1.9112741827885258, "loss": 0.691, "step": 5730 }, { "epoch": 1.9112741827885258, "grad_norm": 1.771593451499939, "step": 5730 }, { "epoch": 1.9112741827885258, "learning_rate": 0.0002669601290129724, "step": 5730 }, { "epoch": 1.9112741827885258, "loss": 1.1627140045166016, "step": 5730 }, { "ce_loss": 0.25697022676467896, "epoch": 1.9112741827885258, "step": 5730 }, { "distill_loss": 0.12009838223457336, "epoch": 1.9112741827885258, "step": 5730 }, { "epoch": 1.9112741827885258, "ref_ce_loss": 0.1639402210712433, "step": 5730 }, { "epoch": 1.9112741827885258, "loss": 0.9581549167633057, "step": 5730 }, { "ce_loss": 0.19211870431900024, "epoch": 1.9112741827885258, "step": 5730 }, { "distill_loss": 0.14240829646587372, "epoch": 1.9112741827885258, "step": 5730 }, { "epoch": 1.9112741827885258, "ref_ce_loss": 0.13938498497009277, "step": 5730 }, { "epoch": 1.9112741827885258, "loss": 0.6060691475868225, "step": 5730 }, { "ce_loss": 0.2377042919397354, "epoch": 1.9112741827885258, "step": 5730 }, { "distill_loss": 0.128531351685524, "epoch": 1.9112741827885258, "step": 5730 }, { "epoch": 1.9112741827885258, "ref_ce_loss": 0.17777924239635468, "step": 5730 }, { "epoch": 1.9112741827885258, "loss": 0.5381349921226501, "step": 5730 }, { "ce_loss": 0.2451322376728058, "epoch": 1.9112741827885258, "step": 5730 }, { "distill_loss": 0.11923262476921082, "epoch": 1.9112741827885258, "step": 5730 }, { "epoch": 1.9112741827885258, "ref_ce_loss": 0.13032066822052002, "step": 5730 }, { "epoch": 1.9146097398265511, "loss": 0.6032, "step": 5740 }, { "epoch": 1.9146097398265511, "grad_norm": 3.885390281677246, "step": 5740 }, { "epoch": 1.9146097398265511, "learning_rate": 0.00026683319653504514, "step": 5740 }, { "epoch": 1.9146097398265511, "loss": 0.662714958190918, "step": 5740 }, { "ce_loss": 0.1848512589931488, "epoch": 1.9146097398265511, "step": 5740 }, { "distill_loss": 0.11129119992256165, "epoch": 1.9146097398265511, "step": 5740 }, { "epoch": 1.9146097398265511, "ref_ce_loss": 0.13736315071582794, "step": 5740 }, { "epoch": 1.9146097398265511, "loss": 0.7138192653656006, "step": 5740 }, { "ce_loss": 0.14085586369037628, "epoch": 1.9146097398265511, "step": 5740 }, { "distill_loss": 0.09094604104757309, "epoch": 1.9146097398265511, "step": 5740 }, { "epoch": 1.9146097398265511, "ref_ce_loss": 0.11834074556827545, "step": 5740 }, { "epoch": 1.9146097398265511, "loss": 0.6987002491950989, "step": 5740 }, { "ce_loss": 0.1906113177537918, "epoch": 1.9146097398265511, "step": 5740 }, { "distill_loss": 0.11060132831335068, "epoch": 1.9146097398265511, "step": 5740 }, { "epoch": 1.9146097398265511, "ref_ce_loss": 0.16047611832618713, "step": 5740 }, { "epoch": 1.9146097398265511, "loss": 0.8356293439865112, "step": 5740 }, { "ce_loss": 0.20562797784805298, "epoch": 1.9146097398265511, "step": 5740 }, { "distill_loss": 0.09776540845632553, "epoch": 1.9146097398265511, "step": 5740 }, { "epoch": 1.9146097398265511, "ref_ce_loss": 0.1992952972650528, "step": 5740 }, { "epoch": 1.9179452968645765, "loss": 0.556, "step": 5750 }, { "epoch": 1.9179452968645765, "grad_norm": 2.8669350147247314, "step": 5750 }, { "epoch": 1.9179452968645765, "learning_rate": 0.0002667060509994544, "step": 5750 }, { "epoch": 1.9179452968645765, "loss": 0.4409118890762329, "step": 5750 }, { "ce_loss": 0.14581716060638428, "epoch": 1.9179452968645765, "step": 5750 }, { "distill_loss": 0.10838485509157181, "epoch": 1.9179452968645765, "step": 5750 }, { "epoch": 1.9179452968645765, "ref_ce_loss": 0.11615348607301712, "step": 5750 }, { "epoch": 1.9179452968645765, "loss": 0.6378897428512573, "step": 5750 }, { "ce_loss": 0.20877128839492798, "epoch": 1.9179452968645765, "step": 5750 }, { "distill_loss": 0.1472756564617157, "epoch": 1.9179452968645765, "step": 5750 }, { "epoch": 1.9179452968645765, "ref_ce_loss": 0.12192574888467789, "step": 5750 }, { "epoch": 1.9179452968645765, "loss": 0.5242109298706055, "step": 5750 }, { "ce_loss": 0.20813210308551788, "epoch": 1.9179452968645765, "step": 5750 }, { "distill_loss": 0.1383836269378662, "epoch": 1.9179452968645765, "step": 5750 }, { "epoch": 1.9179452968645765, "ref_ce_loss": 0.11328185349702835, "step": 5750 }, { "epoch": 1.9179452968645765, "loss": 0.34768980741500854, "step": 5750 }, { "ce_loss": 0.09965898841619492, "epoch": 1.9179452968645765, "step": 5750 }, { "distill_loss": 0.0946233868598938, "epoch": 1.9179452968645765, "step": 5750 }, { "epoch": 1.9179452968645765, "ref_ce_loss": 0.10129421949386597, "step": 5750 }, { "epoch": 1.9212808539026018, "loss": 0.5762, "step": 5760 }, { "epoch": 1.9212808539026018, "grad_norm": 1.8706265687942505, "step": 5760 }, { "epoch": 1.9212808539026018, "learning_rate": 0.0002665786926380634, "step": 5760 }, { "epoch": 1.9212808539026018, "loss": 0.3974435329437256, "step": 5760 }, { "ce_loss": 0.11973472684621811, "epoch": 1.9212808539026018, "step": 5760 }, { "distill_loss": 0.10423251986503601, "epoch": 1.9212808539026018, "step": 5760 }, { "epoch": 1.9212808539026018, "ref_ce_loss": 0.11439381539821625, "step": 5760 }, { "epoch": 1.9212808539026018, "loss": 0.6238061785697937, "step": 5760 }, { "ce_loss": 0.2051391750574112, "epoch": 1.9212808539026018, "step": 5760 }, { "distill_loss": 0.13982662558555603, "epoch": 1.9212808539026018, "step": 5760 }, { "epoch": 1.9212808539026018, "ref_ce_loss": 0.14438097178936005, "step": 5760 }, { "epoch": 1.9212808539026018, "loss": 0.7971160411834717, "step": 5760 }, { "ce_loss": 0.280038446187973, "epoch": 1.9212808539026018, "step": 5760 }, { "distill_loss": 0.11909227073192596, "epoch": 1.9212808539026018, "step": 5760 }, { "epoch": 1.9212808539026018, "ref_ce_loss": 0.1876353621482849, "step": 5760 }, { "epoch": 1.9212808539026018, "loss": 0.7649089694023132, "step": 5760 }, { "ce_loss": 0.3388735055923462, "epoch": 1.9212808539026018, "step": 5760 }, { "distill_loss": 0.16161924600601196, "epoch": 1.9212808539026018, "step": 5760 }, { "epoch": 1.9212808539026018, "ref_ce_loss": 0.20831725001335144, "step": 5760 }, { "epoch": 1.9246164109406272, "loss": 0.6259, "step": 5770 }, { "epoch": 1.9246164109406272, "grad_norm": 2.445451498031616, "step": 5770 }, { "epoch": 1.9246164109406272, "learning_rate": 0.0002664511216831235, "step": 5770 }, { "epoch": 1.9246164109406272, "loss": 0.5543986558914185, "step": 5770 }, { "ce_loss": 0.25721198320388794, "epoch": 1.9246164109406272, "step": 5770 }, { "distill_loss": 0.14469127357006073, "epoch": 1.9246164109406272, "step": 5770 }, { "epoch": 1.9246164109406272, "ref_ce_loss": 0.11064379662275314, "step": 5770 }, { "epoch": 1.9246164109406272, "loss": 0.42987683415412903, "step": 5770 }, { "ce_loss": 0.19463425874710083, "epoch": 1.9246164109406272, "step": 5770 }, { "distill_loss": 0.10757134109735489, "epoch": 1.9246164109406272, "step": 5770 }, { "epoch": 1.9246164109406272, "ref_ce_loss": 0.10230542719364166, "step": 5770 }, { "epoch": 1.9246164109406272, "loss": 0.44369128346443176, "step": 5770 }, { "ce_loss": 0.16867774724960327, "epoch": 1.9246164109406272, "step": 5770 }, { "distill_loss": 0.11673156917095184, "epoch": 1.9246164109406272, "step": 5770 }, { "epoch": 1.9246164109406272, "ref_ce_loss": 0.11092256754636765, "step": 5770 }, { "epoch": 1.9246164109406272, "loss": 0.9410002827644348, "step": 5770 }, { "ce_loss": 0.24478532373905182, "epoch": 1.9246164109406272, "step": 5770 }, { "distill_loss": 0.14264214038848877, "epoch": 1.9246164109406272, "step": 5770 }, { "epoch": 1.9246164109406272, "ref_ce_loss": 0.1397172510623932, "step": 5770 }, { "epoch": 1.9279519679786525, "loss": 0.5818, "step": 5780 }, { "epoch": 1.9279519679786525, "grad_norm": 2.4682679176330566, "step": 5780 }, { "epoch": 1.9279519679786525, "learning_rate": 0.000266323338367274, "step": 5780 }, { "epoch": 1.9279519679786525, "loss": 0.45416784286499023, "step": 5780 }, { "ce_loss": 0.11269327998161316, "epoch": 1.9279519679786525, "step": 5780 }, { "distill_loss": 0.11082468926906586, "epoch": 1.9279519679786525, "step": 5780 }, { "epoch": 1.9279519679786525, "ref_ce_loss": 0.09867662936449051, "step": 5780 }, { "epoch": 1.9279519679786525, "loss": 0.4797813296318054, "step": 5780 }, { "ce_loss": 0.13305124640464783, "epoch": 1.9279519679786525, "step": 5780 }, { "distill_loss": 0.14016100764274597, "epoch": 1.9279519679786525, "step": 5780 }, { "epoch": 1.9279519679786525, "ref_ce_loss": 0.10984981060028076, "step": 5780 }, { "epoch": 1.9279519679786525, "loss": 0.4248768091201782, "step": 5780 }, { "ce_loss": 0.14768195152282715, "epoch": 1.9279519679786525, "step": 5780 }, { "distill_loss": 0.11138048022985458, "epoch": 1.9279519679786525, "step": 5780 }, { "epoch": 1.9279519679786525, "ref_ce_loss": 0.11871366202831268, "step": 5780 }, { "epoch": 1.9279519679786525, "loss": 0.5333288311958313, "step": 5780 }, { "ce_loss": 0.1860743910074234, "epoch": 1.9279519679786525, "step": 5780 }, { "distill_loss": 0.13695748150348663, "epoch": 1.9279519679786525, "step": 5780 }, { "epoch": 1.9279519679786525, "ref_ce_loss": 0.11091623455286026, "step": 5780 }, { "epoch": 1.9312875250166779, "loss": 0.5371, "step": 5790 }, { "epoch": 1.9312875250166779, "grad_norm": 3.2531745433807373, "step": 5790 }, { "epoch": 1.9312875250166779, "learning_rate": 0.000266195342923541, "step": 5790 }, { "epoch": 1.9312875250166779, "loss": 0.36490774154663086, "step": 5790 }, { "ce_loss": 0.10847225040197372, "epoch": 1.9312875250166779, "step": 5790 }, { "distill_loss": 0.07613347470760345, "epoch": 1.9312875250166779, "step": 5790 }, { "epoch": 1.9312875250166779, "ref_ce_loss": 0.1256321221590042, "step": 5790 }, { "epoch": 1.9312875250166779, "loss": 0.3799107074737549, "step": 5790 }, { "ce_loss": 0.17202770709991455, "epoch": 1.9312875250166779, "step": 5790 }, { "distill_loss": 0.12261956185102463, "epoch": 1.9312875250166779, "step": 5790 }, { "epoch": 1.9312875250166779, "ref_ce_loss": 0.08407846093177795, "step": 5790 }, { "epoch": 1.9312875250166779, "loss": 0.4411643445491791, "step": 5790 }, { "ce_loss": 0.08297307789325714, "epoch": 1.9312875250166779, "step": 5790 }, { "distill_loss": 0.07652588933706284, "epoch": 1.9312875250166779, "step": 5790 }, { "epoch": 1.9312875250166779, "ref_ce_loss": 0.12075881659984589, "step": 5790 }, { "epoch": 1.9312875250166779, "loss": 0.5024446845054626, "step": 5790 }, { "ce_loss": 0.16614662110805511, "epoch": 1.9312875250166779, "step": 5790 }, { "distill_loss": 0.10244978964328766, "epoch": 1.9312875250166779, "step": 5790 }, { "epoch": 1.9312875250166779, "ref_ce_loss": 0.13249625265598297, "step": 5790 }, { "epoch": 1.9346230820547032, "loss": 0.6017, "step": 5800 }, { "epoch": 1.9346230820547032, "grad_norm": 2.583327054977417, "step": 5800 }, { "epoch": 1.9346230820547032, "learning_rate": 0.0002660671355853379, "step": 5800 }, { "epoch": 1.9346230820547032, "loss": 0.8501856327056885, "step": 5800 }, { "ce_loss": 0.2730255424976349, "epoch": 1.9346230820547032, "step": 5800 }, { "distill_loss": 0.13773846626281738, "epoch": 1.9346230820547032, "step": 5800 }, { "epoch": 1.9346230820547032, "ref_ce_loss": 0.11032268404960632, "step": 5800 }, { "epoch": 1.9346230820547032, "loss": 0.3907512426376343, "step": 5800 }, { "ce_loss": 0.1428876668214798, "epoch": 1.9346230820547032, "step": 5800 }, { "distill_loss": 0.1206461638212204, "epoch": 1.9346230820547032, "step": 5800 }, { "epoch": 1.9346230820547032, "ref_ce_loss": 0.12690801918506622, "step": 5800 }, { "epoch": 1.9346230820547032, "loss": 1.1583483219146729, "step": 5800 }, { "ce_loss": 0.3016435205936432, "epoch": 1.9346230820547032, "step": 5800 }, { "distill_loss": 0.1703663468360901, "epoch": 1.9346230820547032, "step": 5800 }, { "epoch": 1.9346230820547032, "ref_ce_loss": 0.12071062624454498, "step": 5800 }, { "epoch": 1.9346230820547032, "loss": 0.7512863874435425, "step": 5800 }, { "ce_loss": 0.3120938241481781, "epoch": 1.9346230820547032, "step": 5800 }, { "distill_loss": 0.1428443342447281, "epoch": 1.9346230820547032, "step": 5800 }, { "epoch": 1.9346230820547032, "ref_ce_loss": 0.2358296513557434, "step": 5800 }, { "epoch": 1.9379586390927286, "loss": 0.6324, "step": 5810 }, { "epoch": 1.9379586390927286, "grad_norm": 4.115167617797852, "step": 5810 }, { "epoch": 1.9379586390927286, "learning_rate": 0.0002659387165864642, "step": 5810 }, { "epoch": 1.9379586390927286, "loss": 0.48370733857154846, "step": 5810 }, { "ce_loss": 0.2041088342666626, "epoch": 1.9379586390927286, "step": 5810 }, { "distill_loss": 0.11525950580835342, "epoch": 1.9379586390927286, "step": 5810 }, { "epoch": 1.9379586390927286, "ref_ce_loss": 0.10774882882833481, "step": 5810 }, { "epoch": 1.9379586390927286, "loss": 0.39353325963020325, "step": 5810 }, { "ce_loss": 0.09266683459281921, "epoch": 1.9379586390927286, "step": 5810 }, { "distill_loss": 0.10596026480197906, "epoch": 1.9379586390927286, "step": 5810 }, { "epoch": 1.9379586390927286, "ref_ce_loss": 0.12899544835090637, "step": 5810 }, { "epoch": 1.9379586390927286, "loss": 0.5259385108947754, "step": 5810 }, { "ce_loss": 0.17804045975208282, "epoch": 1.9379586390927286, "step": 5810 }, { "distill_loss": 0.12347061187028885, "epoch": 1.9379586390927286, "step": 5810 }, { "epoch": 1.9379586390927286, "ref_ce_loss": 0.08819334208965302, "step": 5810 }, { "epoch": 1.9379586390927286, "loss": 0.5755244493484497, "step": 5810 }, { "ce_loss": 0.22214099764823914, "epoch": 1.9379586390927286, "step": 5810 }, { "distill_loss": 0.0878373458981514, "epoch": 1.9379586390927286, "step": 5810 }, { "epoch": 1.9379586390927286, "ref_ce_loss": 0.13193178176879883, "step": 5810 }, { "epoch": 1.941294196130754, "loss": 0.5549, "step": 5820 }, { "epoch": 1.941294196130754, "grad_norm": 3.8637335300445557, "step": 5820 }, { "epoch": 1.941294196130754, "learning_rate": 0.0002658100861611056, "step": 5820 }, { "epoch": 1.941294196130754, "loss": 0.40220752358436584, "step": 5820 }, { "ce_loss": 0.09550049155950546, "epoch": 1.941294196130754, "step": 5820 }, { "distill_loss": 0.08620162308216095, "epoch": 1.941294196130754, "step": 5820 }, { "epoch": 1.941294196130754, "ref_ce_loss": 0.12067662179470062, "step": 5820 }, { "epoch": 1.941294196130754, "loss": 0.3592372536659241, "step": 5820 }, { "ce_loss": 0.1292707771062851, "epoch": 1.941294196130754, "step": 5820 }, { "distill_loss": 0.09745273739099503, "epoch": 1.941294196130754, "step": 5820 }, { "epoch": 1.941294196130754, "ref_ce_loss": 0.09250719100236893, "step": 5820 }, { "epoch": 1.941294196130754, "loss": 0.6222105622291565, "step": 5820 }, { "ce_loss": 0.17958803474903107, "epoch": 1.941294196130754, "step": 5820 }, { "distill_loss": 0.10309149324893951, "epoch": 1.941294196130754, "step": 5820 }, { "epoch": 1.941294196130754, "ref_ce_loss": 0.16316074132919312, "step": 5820 }, { "epoch": 1.941294196130754, "loss": 0.8499335646629333, "step": 5820 }, { "ce_loss": 0.19285449385643005, "epoch": 1.941294196130754, "step": 5820 }, { "distill_loss": 0.07820558547973633, "epoch": 1.941294196130754, "step": 5820 }, { "epoch": 1.941294196130754, "ref_ce_loss": 0.1462705433368683, "step": 5820 }, { "epoch": 1.9446297531687793, "loss": 0.5729, "step": 5830 }, { "epoch": 1.9446297531687793, "grad_norm": 1.8970744609832764, "step": 5830 }, { "epoch": 1.9446297531687793, "learning_rate": 0.0002656812445438332, "step": 5830 }, { "epoch": 1.9446297531687793, "loss": 0.5597825050354004, "step": 5830 }, { "ce_loss": 0.232600137591362, "epoch": 1.9446297531687793, "step": 5830 }, { "distill_loss": 0.1474754512310028, "epoch": 1.9446297531687793, "step": 5830 }, { "epoch": 1.9446297531687793, "ref_ce_loss": 0.09885487705469131, "step": 5830 }, { "epoch": 1.9446297531687793, "loss": 0.43907564878463745, "step": 5830 }, { "ce_loss": 0.18678975105285645, "epoch": 1.9446297531687793, "step": 5830 }, { "distill_loss": 0.10709083825349808, "epoch": 1.9446297531687793, "step": 5830 }, { "epoch": 1.9446297531687793, "ref_ce_loss": 0.10085967928171158, "step": 5830 }, { "epoch": 1.9446297531687793, "loss": 0.43083420395851135, "step": 5830 }, { "ce_loss": 0.1751118153333664, "epoch": 1.9446297531687793, "step": 5830 }, { "distill_loss": 0.12244075536727905, "epoch": 1.9446297531687793, "step": 5830 }, { "epoch": 1.9446297531687793, "ref_ce_loss": 0.08682743459939957, "step": 5830 }, { "epoch": 1.9446297531687793, "loss": 0.7343491911888123, "step": 5830 }, { "ce_loss": 0.2072955071926117, "epoch": 1.9446297531687793, "step": 5830 }, { "distill_loss": 0.10877859592437744, "epoch": 1.9446297531687793, "step": 5830 }, { "epoch": 1.9446297531687793, "ref_ce_loss": 0.14639054238796234, "step": 5830 }, { "epoch": 1.9479653102068046, "loss": 0.5426, "step": 5840 }, { "epoch": 1.9479653102068046, "grad_norm": 1.6072348356246948, "step": 5840 }, { "epoch": 1.9479653102068046, "learning_rate": 0.0002655521919696032, "step": 5840 }, { "epoch": 1.9479653102068046, "loss": 0.8225134611129761, "step": 5840 }, { "ce_loss": 0.31686344742774963, "epoch": 1.9479653102068046, "step": 5840 }, { "distill_loss": 0.14142484962940216, "epoch": 1.9479653102068046, "step": 5840 }, { "epoch": 1.9479653102068046, "ref_ce_loss": 0.14611388742923737, "step": 5840 }, { "epoch": 1.9479653102068046, "loss": 0.36344388127326965, "step": 5840 }, { "ce_loss": 0.1438729166984558, "epoch": 1.9479653102068046, "step": 5840 }, { "distill_loss": 0.12257090955972672, "epoch": 1.9479653102068046, "step": 5840 }, { "epoch": 1.9479653102068046, "ref_ce_loss": 0.09651162475347519, "step": 5840 }, { "epoch": 1.9479653102068046, "loss": 0.5657672882080078, "step": 5840 }, { "ce_loss": 0.14495164155960083, "epoch": 1.9479653102068046, "step": 5840 }, { "distill_loss": 0.09493519365787506, "epoch": 1.9479653102068046, "step": 5840 }, { "epoch": 1.9479653102068046, "ref_ce_loss": 0.10763535648584366, "step": 5840 }, { "epoch": 1.9479653102068046, "loss": 0.49626821279525757, "step": 5840 }, { "ce_loss": 0.25028860569000244, "epoch": 1.9479653102068046, "step": 5840 }, { "distill_loss": 0.1078210175037384, "epoch": 1.9479653102068046, "step": 5840 }, { "epoch": 1.9479653102068046, "ref_ce_loss": 0.13762708008289337, "step": 5840 }, { "epoch": 1.95130086724483, "loss": 0.6354, "step": 5850 }, { "epoch": 1.95130086724483, "grad_norm": 3.0269176959991455, "step": 5850 }, { "epoch": 1.95130086724483, "learning_rate": 0.0002654229286737567, "step": 5850 }, { "epoch": 1.95130086724483, "loss": 0.512065589427948, "step": 5850 }, { "ce_loss": 0.1600334197282791, "epoch": 1.95130086724483, "step": 5850 }, { "distill_loss": 0.09044967591762543, "epoch": 1.95130086724483, "step": 5850 }, { "epoch": 1.95130086724483, "ref_ce_loss": 0.11078616976737976, "step": 5850 }, { "epoch": 1.95130086724483, "loss": 0.7213431596755981, "step": 5850 }, { "ce_loss": 0.16717001795768738, "epoch": 1.95130086724483, "step": 5850 }, { "distill_loss": 0.12851962447166443, "epoch": 1.95130086724483, "step": 5850 }, { "epoch": 1.95130086724483, "ref_ce_loss": 0.1945911943912506, "step": 5850 }, { "epoch": 1.95130086724483, "loss": 0.5515893697738647, "step": 5850 }, { "ce_loss": 0.10928317159414291, "epoch": 1.95130086724483, "step": 5850 }, { "distill_loss": 0.09205825626850128, "epoch": 1.95130086724483, "step": 5850 }, { "epoch": 1.95130086724483, "ref_ce_loss": 0.11985401064157486, "step": 5850 }, { "epoch": 1.95130086724483, "loss": 0.8128691911697388, "step": 5850 }, { "ce_loss": 0.15571244060993195, "epoch": 1.95130086724483, "step": 5850 }, { "distill_loss": 0.12451403588056564, "epoch": 1.95130086724483, "step": 5850 }, { "epoch": 1.95130086724483, "ref_ce_loss": 0.10983094573020935, "step": 5850 }, { "epoch": 1.9546364242828553, "loss": 0.7005, "step": 5860 }, { "epoch": 1.9546364242828553, "grad_norm": 3.6902854442596436, "step": 5860 }, { "epoch": 1.9546364242828553, "learning_rate": 0.00026529345489201896, "step": 5860 }, { "epoch": 1.9546364242828553, "loss": 0.6848353147506714, "step": 5860 }, { "ce_loss": 0.14601679146289825, "epoch": 1.9546364242828553, "step": 5860 }, { "distill_loss": 0.11411919444799423, "epoch": 1.9546364242828553, "step": 5860 }, { "epoch": 1.9546364242828553, "ref_ce_loss": 0.12418550252914429, "step": 5860 }, { "epoch": 1.9546364242828553, "loss": 0.46468979120254517, "step": 5860 }, { "ce_loss": 0.21989013254642487, "epoch": 1.9546364242828553, "step": 5860 }, { "distill_loss": 0.11546780169010162, "epoch": 1.9546364242828553, "step": 5860 }, { "epoch": 1.9546364242828553, "ref_ce_loss": 0.12887084484100342, "step": 5860 }, { "epoch": 1.9546364242828553, "loss": 0.6564695835113525, "step": 5860 }, { "ce_loss": 0.3337359130382538, "epoch": 1.9546364242828553, "step": 5860 }, { "distill_loss": 0.13268591463565826, "epoch": 1.9546364242828553, "step": 5860 }, { "epoch": 1.9546364242828553, "ref_ce_loss": 0.1438097208738327, "step": 5860 }, { "epoch": 1.9546364242828553, "loss": 0.9603589177131653, "step": 5860 }, { "ce_loss": 0.13805945217609406, "epoch": 1.9546364242828553, "step": 5860 }, { "distill_loss": 0.12165968120098114, "epoch": 1.9546364242828553, "step": 5860 }, { "epoch": 1.9546364242828553, "ref_ce_loss": 0.12685857713222504, "step": 5860 }, { "epoch": 1.9579719813208807, "loss": 0.5612, "step": 5870 }, { "epoch": 1.9579719813208807, "grad_norm": 2.58427095413208, "step": 5870 }, { "epoch": 1.9579719813208807, "learning_rate": 0.000265163770860499, "step": 5870 }, { "epoch": 1.9579719813208807, "loss": 0.42864927649497986, "step": 5870 }, { "ce_loss": 0.14903698861598969, "epoch": 1.9579719813208807, "step": 5870 }, { "distill_loss": 0.12617306411266327, "epoch": 1.9579719813208807, "step": 5870 }, { "epoch": 1.9579719813208807, "ref_ce_loss": 0.10953165590763092, "step": 5870 }, { "epoch": 1.9579719813208807, "loss": 0.4737485349178314, "step": 5870 }, { "ce_loss": 0.19709208607673645, "epoch": 1.9579719813208807, "step": 5870 }, { "distill_loss": 0.11354006826877594, "epoch": 1.9579719813208807, "step": 5870 }, { "epoch": 1.9579719813208807, "ref_ce_loss": 0.1605563461780548, "step": 5870 }, { "epoch": 1.9579719813208807, "loss": 0.7547450065612793, "step": 5870 }, { "ce_loss": 0.19148224592208862, "epoch": 1.9579719813208807, "step": 5870 }, { "distill_loss": 0.1140814945101738, "epoch": 1.9579719813208807, "step": 5870 }, { "epoch": 1.9579719813208807, "ref_ce_loss": 0.14812114834785461, "step": 5870 }, { "epoch": 1.9579719813208807, "loss": 0.4125843048095703, "step": 5870 }, { "ce_loss": 0.1644628345966339, "epoch": 1.9579719813208807, "step": 5870 }, { "distill_loss": 0.08505094796419144, "epoch": 1.9579719813208807, "step": 5870 }, { "epoch": 1.9579719813208807, "ref_ce_loss": 0.10612877458333969, "step": 5870 }, { "epoch": 1.961307538358906, "loss": 0.5927, "step": 5880 }, { "epoch": 1.961307538358906, "grad_norm": 2.976412057876587, "step": 5880 }, { "epoch": 1.961307538358906, "learning_rate": 0.0002650338768156894, "step": 5880 }, { "epoch": 1.961307538358906, "loss": 0.6274206638336182, "step": 5880 }, { "ce_loss": 0.250379353761673, "epoch": 1.961307538358906, "step": 5880 }, { "distill_loss": 0.11398634314537048, "epoch": 1.961307538358906, "step": 5880 }, { "epoch": 1.961307538358906, "ref_ce_loss": 0.14656729996204376, "step": 5880 }, { "epoch": 1.961307538358906, "loss": 0.6905542612075806, "step": 5880 }, { "ce_loss": 0.22709456086158752, "epoch": 1.961307538358906, "step": 5880 }, { "distill_loss": 0.11130400002002716, "epoch": 1.961307538358906, "step": 5880 }, { "epoch": 1.961307538358906, "ref_ce_loss": 0.07458300143480301, "step": 5880 }, { "epoch": 1.961307538358906, "loss": 0.6171120405197144, "step": 5880 }, { "ce_loss": 0.11764512956142426, "epoch": 1.961307538358906, "step": 5880 }, { "distill_loss": 0.10441337525844574, "epoch": 1.961307538358906, "step": 5880 }, { "epoch": 1.961307538358906, "ref_ce_loss": 0.1329973042011261, "step": 5880 }, { "epoch": 1.961307538358906, "loss": 0.3864017128944397, "step": 5880 }, { "ce_loss": 0.1463531106710434, "epoch": 1.961307538358906, "step": 5880 }, { "distill_loss": 0.08627346903085709, "epoch": 1.961307538358906, "step": 5880 }, { "epoch": 1.961307538358906, "ref_ce_loss": 0.09403983503580093, "step": 5880 }, { "epoch": 1.9646430953969314, "loss": 0.5942, "step": 5890 }, { "epoch": 1.9646430953969314, "grad_norm": 3.624955177307129, "step": 5890 }, { "epoch": 1.9646430953969314, "learning_rate": 0.0002649037729944657, "step": 5890 }, { "epoch": 1.9646430953969314, "loss": 0.8614783883094788, "step": 5890 }, { "ce_loss": 0.23180143535137177, "epoch": 1.9646430953969314, "step": 5890 }, { "distill_loss": 0.10813884437084198, "epoch": 1.9646430953969314, "step": 5890 }, { "epoch": 1.9646430953969314, "ref_ce_loss": 0.16049298644065857, "step": 5890 }, { "epoch": 1.9646430953969314, "loss": 0.6531976461410522, "step": 5890 }, { "ce_loss": 0.22142180800437927, "epoch": 1.9646430953969314, "step": 5890 }, { "distill_loss": 0.11609365046024323, "epoch": 1.9646430953969314, "step": 5890 }, { "epoch": 1.9646430953969314, "ref_ce_loss": 0.2098771631717682, "step": 5890 }, { "epoch": 1.9646430953969314, "loss": 0.7346011996269226, "step": 5890 }, { "ce_loss": 0.22203612327575684, "epoch": 1.9646430953969314, "step": 5890 }, { "distill_loss": 0.13322067260742188, "epoch": 1.9646430953969314, "step": 5890 }, { "epoch": 1.9646430953969314, "ref_ce_loss": 0.11884419620037079, "step": 5890 }, { "epoch": 1.9646430953969314, "loss": 0.3451249301433563, "step": 5890 }, { "ce_loss": 0.09568674117326736, "epoch": 1.9646430953969314, "step": 5890 }, { "distill_loss": 0.11562363058328629, "epoch": 1.9646430953969314, "step": 5890 }, { "epoch": 1.9646430953969314, "ref_ce_loss": 0.13373398780822754, "step": 5890 }, { "epoch": 1.9679786524349567, "loss": 0.6328, "step": 5900 }, { "epoch": 1.9679786524349567, "grad_norm": 3.982816457748413, "step": 5900 }, { "epoch": 1.9679786524349567, "learning_rate": 0.0002647734596340859, "step": 5900 }, { "epoch": 1.9679786524349567, "loss": 0.5729438066482544, "step": 5900 }, { "ce_loss": 0.20175018906593323, "epoch": 1.9679786524349567, "step": 5900 }, { "distill_loss": 0.10930713266134262, "epoch": 1.9679786524349567, "step": 5900 }, { "epoch": 1.9679786524349567, "ref_ce_loss": 0.11769817024469376, "step": 5900 }, { "epoch": 1.9679786524349567, "loss": 0.2672497034072876, "step": 5900 }, { "ce_loss": 0.10306866466999054, "epoch": 1.9679786524349567, "step": 5900 }, { "distill_loss": 0.09915365278720856, "epoch": 1.9679786524349567, "step": 5900 }, { "epoch": 1.9679786524349567, "ref_ce_loss": 0.06501106917858124, "step": 5900 }, { "epoch": 1.9679786524349567, "loss": 0.5962284803390503, "step": 5900 }, { "ce_loss": 0.2861151099205017, "epoch": 1.9679786524349567, "step": 5900 }, { "distill_loss": 0.12377713620662689, "epoch": 1.9679786524349567, "step": 5900 }, { "epoch": 1.9679786524349567, "ref_ce_loss": 0.18407902121543884, "step": 5900 }, { "epoch": 1.9679786524349567, "loss": 0.5468906164169312, "step": 5900 }, { "ce_loss": 0.17363518476486206, "epoch": 1.9679786524349567, "step": 5900 }, { "distill_loss": 0.11055031418800354, "epoch": 1.9679786524349567, "step": 5900 }, { "epoch": 1.9679786524349567, "ref_ce_loss": 0.10016216337680817, "step": 5900 }, { "epoch": 1.971314209472982, "loss": 0.5764, "step": 5910 }, { "epoch": 1.971314209472982, "grad_norm": 3.0539867877960205, "step": 5910 }, { "epoch": 1.971314209472982, "learning_rate": 0.00026464293697219015, "step": 5910 }, { "epoch": 1.971314209472982, "loss": 0.5851742029190063, "step": 5910 }, { "ce_loss": 0.24227118492126465, "epoch": 1.971314209472982, "step": 5910 }, { "distill_loss": 0.11767081916332245, "epoch": 1.971314209472982, "step": 5910 }, { "epoch": 1.971314209472982, "ref_ce_loss": 0.1285375952720642, "step": 5910 }, { "epoch": 1.971314209472982, "loss": 0.5376459956169128, "step": 5910 }, { "ce_loss": 0.1833650916814804, "epoch": 1.971314209472982, "step": 5910 }, { "distill_loss": 0.12777496874332428, "epoch": 1.971314209472982, "step": 5910 }, { "epoch": 1.971314209472982, "ref_ce_loss": 0.09195635467767715, "step": 5910 }, { "epoch": 1.971314209472982, "loss": 0.4259001910686493, "step": 5910 }, { "ce_loss": 0.16761203110218048, "epoch": 1.971314209472982, "step": 5910 }, { "distill_loss": 0.1049601286649704, "epoch": 1.971314209472982, "step": 5910 }, { "epoch": 1.971314209472982, "ref_ce_loss": 0.08499225974082947, "step": 5910 }, { "epoch": 1.971314209472982, "loss": 0.46280694007873535, "step": 5910 }, { "ce_loss": 0.17035450041294098, "epoch": 1.971314209472982, "step": 5910 }, { "distill_loss": 0.100443035364151, "epoch": 1.971314209472982, "step": 5910 }, { "epoch": 1.971314209472982, "ref_ce_loss": 0.19197778403759003, "step": 5910 }, { "epoch": 1.9746497665110074, "loss": 0.5723, "step": 5920 }, { "epoch": 1.9746497665110074, "grad_norm": 3.073291063308716, "step": 5920 }, { "epoch": 1.9746497665110074, "learning_rate": 0.00026451220524680025, "step": 5920 }, { "epoch": 1.9746497665110074, "loss": 0.5005868077278137, "step": 5920 }, { "ce_loss": 0.1469360888004303, "epoch": 1.9746497665110074, "step": 5920 }, { "distill_loss": 0.11416827142238617, "epoch": 1.9746497665110074, "step": 5920 }, { "epoch": 1.9746497665110074, "ref_ce_loss": 0.1178518682718277, "step": 5920 }, { "epoch": 1.9746497665110074, "loss": 0.3899126648902893, "step": 5920 }, { "ce_loss": 0.12613821029663086, "epoch": 1.9746497665110074, "step": 5920 }, { "distill_loss": 0.09031940251588821, "epoch": 1.9746497665110074, "step": 5920 }, { "epoch": 1.9746497665110074, "ref_ce_loss": 0.12185925245285034, "step": 5920 }, { "epoch": 1.9746497665110074, "loss": 0.6879602074623108, "step": 5920 }, { "ce_loss": 0.24272885918617249, "epoch": 1.9746497665110074, "step": 5920 }, { "distill_loss": 0.1410408467054367, "epoch": 1.9746497665110074, "step": 5920 }, { "epoch": 1.9746497665110074, "ref_ce_loss": 0.13041582703590393, "step": 5920 }, { "epoch": 1.9746497665110074, "loss": 0.5270417332649231, "step": 5920 }, { "ce_loss": 0.18191879987716675, "epoch": 1.9746497665110074, "step": 5920 }, { "distill_loss": 0.11637015640735626, "epoch": 1.9746497665110074, "step": 5920 }, { "epoch": 1.9746497665110074, "ref_ce_loss": 0.09732194989919662, "step": 5920 }, { "epoch": 1.9779853235490328, "loss": 0.609, "step": 5930 }, { "epoch": 1.9779853235490328, "grad_norm": 2.8188343048095703, "step": 5930 }, { "epoch": 1.9779853235490328, "learning_rate": 0.0002643812646963194, "step": 5930 }, { "epoch": 1.9779853235490328, "loss": 2.293231725692749, "step": 5930 }, { "ce_loss": 1.2716293334960938, "epoch": 1.9779853235490328, "step": 5930 }, { "distill_loss": 0.10453981161117554, "epoch": 1.9779853235490328, "step": 5930 }, { "epoch": 1.9779853235490328, "ref_ce_loss": 0.5263649821281433, "step": 5930 }, { "epoch": 1.9779853235490328, "loss": 2.203127861022949, "step": 5930 }, { "ce_loss": 1.4657419919967651, "epoch": 1.9779853235490328, "step": 5930 }, { "distill_loss": 0.09050406515598297, "epoch": 1.9779853235490328, "step": 5930 }, { "epoch": 1.9779853235490328, "ref_ce_loss": 0.5690605640411377, "step": 5930 }, { "epoch": 1.9779853235490328, "loss": 2.1848506927490234, "step": 5930 }, { "ce_loss": 1.1554293632507324, "epoch": 1.9779853235490328, "step": 5930 }, { "distill_loss": 0.11221285909414291, "epoch": 1.9779853235490328, "step": 5930 }, { "epoch": 1.9779853235490328, "ref_ce_loss": 0.7525448203086853, "step": 5930 }, { "epoch": 1.9779853235490328, "loss": 1.8322076797485352, "step": 5930 }, { "ce_loss": 1.1817811727523804, "epoch": 1.9779853235490328, "step": 5930 }, { "distill_loss": 0.11352679133415222, "epoch": 1.9779853235490328, "step": 5930 }, { "epoch": 1.9779853235490328, "ref_ce_loss": 0.49499446153640747, "step": 5930 }, { "epoch": 1.9813208805870581, "loss": 0.8654, "step": 5940 }, { "epoch": 1.9813208805870581, "grad_norm": 27.285018920898438, "step": 5940 }, { "epoch": 1.9813208805870581, "learning_rate": 0.00026425011555953145, "step": 5940 }, { "epoch": 1.9813208805870581, "loss": 1.115926742553711, "step": 5940 }, { "ce_loss": 0.23986463248729706, "epoch": 1.9813208805870581, "step": 5940 }, { "distill_loss": 0.6295623779296875, "epoch": 1.9813208805870581, "step": 5940 }, { "epoch": 1.9813208805870581, "ref_ce_loss": 0.11818370968103409, "step": 5940 }, { "epoch": 1.9813208805870581, "loss": 0.8795012831687927, "step": 5940 }, { "ce_loss": 0.1840430498123169, "epoch": 1.9813208805870581, "step": 5940 }, { "distill_loss": 0.5369489789009094, "epoch": 1.9813208805870581, "step": 5940 }, { "epoch": 1.9813208805870581, "ref_ce_loss": 0.08884285390377045, "step": 5940 }, { "epoch": 1.9813208805870581, "loss": 1.0463898181915283, "step": 5940 }, { "ce_loss": 0.26496535539627075, "epoch": 1.9813208805870581, "step": 5940 }, { "distill_loss": 0.6432059407234192, "epoch": 1.9813208805870581, "step": 5940 }, { "epoch": 1.9813208805870581, "ref_ce_loss": 0.13793134689331055, "step": 5940 }, { "epoch": 1.9813208805870581, "loss": 1.0505305528640747, "step": 5940 }, { "ce_loss": 0.23481756448745728, "epoch": 1.9813208805870581, "step": 5940 }, { "distill_loss": 0.6528819799423218, "epoch": 1.9813208805870581, "step": 5940 }, { "epoch": 1.9813208805870581, "ref_ce_loss": 0.10958369076251984, "step": 5940 }, { "epoch": 1.9846564376250835, "loss": 0.8436, "step": 5950 }, { "epoch": 1.9846564376250835, "grad_norm": 3.0225157737731934, "step": 5950 }, { "epoch": 1.9846564376250835, "learning_rate": 0.00026411875807560075, "step": 5950 }, { "epoch": 1.9846564376250835, "loss": 0.6642615795135498, "step": 5950 }, { "ce_loss": 0.11888474971055984, "epoch": 1.9846564376250835, "step": 5950 }, { "distill_loss": 0.2751652002334595, "epoch": 1.9846564376250835, "step": 5950 }, { "epoch": 1.9846564376250835, "ref_ce_loss": 0.13492824137210846, "step": 5950 }, { "epoch": 1.9846564376250835, "loss": 0.7998644113540649, "step": 5950 }, { "ce_loss": 0.17532217502593994, "epoch": 1.9846564376250835, "step": 5950 }, { "distill_loss": 0.35970568656921387, "epoch": 1.9846564376250835, "step": 5950 }, { "epoch": 1.9846564376250835, "ref_ce_loss": 0.13275721669197083, "step": 5950 }, { "epoch": 1.9846564376250835, "loss": 0.8139857649803162, "step": 5950 }, { "ce_loss": 0.2516137659549713, "epoch": 1.9846564376250835, "step": 5950 }, { "distill_loss": 0.30513206124305725, "epoch": 1.9846564376250835, "step": 5950 }, { "epoch": 1.9846564376250835, "ref_ce_loss": 0.1743960827589035, "step": 5950 }, { "epoch": 1.9846564376250835, "loss": 0.5148938894271851, "step": 5950 }, { "ce_loss": 0.11828169226646423, "epoch": 1.9846564376250835, "step": 5950 }, { "distill_loss": 0.2190283238887787, "epoch": 1.9846564376250835, "step": 5950 }, { "epoch": 1.9846564376250835, "ref_ce_loss": 0.11594554036855698, "step": 5950 }, { "epoch": 1.9879919946631088, "loss": 0.7121, "step": 5960 }, { "epoch": 1.9879919946631088, "grad_norm": 3.4296367168426514, "step": 5960 }, { "epoch": 1.9879919946631088, "learning_rate": 0.00026398719248407147, "step": 5960 }, { "epoch": 1.9879919946631088, "loss": 0.818779706954956, "step": 5960 }, { "ce_loss": 0.2508390247821808, "epoch": 1.9879919946631088, "step": 5960 }, { "distill_loss": 0.18431255221366882, "epoch": 1.9879919946631088, "step": 5960 }, { "epoch": 1.9879919946631088, "ref_ce_loss": 0.1367606669664383, "step": 5960 }, { "epoch": 1.9879919946631088, "loss": 0.5594602823257446, "step": 5960 }, { "ce_loss": 0.20339643955230713, "epoch": 1.9879919946631088, "step": 5960 }, { "distill_loss": 0.18110495805740356, "epoch": 1.9879919946631088, "step": 5960 }, { "epoch": 1.9879919946631088, "ref_ce_loss": 0.13036209344863892, "step": 5960 }, { "epoch": 1.9879919946631088, "loss": 0.5207152366638184, "step": 5960 }, { "ce_loss": 0.15699785947799683, "epoch": 1.9879919946631088, "step": 5960 }, { "distill_loss": 0.18864737451076508, "epoch": 1.9879919946631088, "step": 5960 }, { "epoch": 1.9879919946631088, "ref_ce_loss": 0.12153347581624985, "step": 5960 }, { "epoch": 1.9879919946631088, "loss": 0.4976940453052521, "step": 5960 }, { "ce_loss": 0.17792610824108124, "epoch": 1.9879919946631088, "step": 5960 }, { "distill_loss": 0.14083155989646912, "epoch": 1.9879919946631088, "step": 5960 }, { "epoch": 1.9879919946631088, "ref_ce_loss": 0.0966520607471466, "step": 5960 }, { "epoch": 1.9913275517011342, "loss": 0.5663, "step": 5970 }, { "epoch": 1.9913275517011342, "grad_norm": 3.2900636196136475, "step": 5970 }, { "epoch": 1.9913275517011342, "learning_rate": 0.0002638554190248674, "step": 5970 }, { "epoch": 1.9913275517011342, "loss": 0.6181614995002747, "step": 5970 }, { "ce_loss": 0.1431739330291748, "epoch": 1.9913275517011342, "step": 5970 }, { "distill_loss": 0.20133964717388153, "epoch": 1.9913275517011342, "step": 5970 }, { "epoch": 1.9913275517011342, "ref_ce_loss": 0.1477632224559784, "step": 5970 }, { "epoch": 1.9913275517011342, "loss": 0.5945953130722046, "step": 5970 }, { "ce_loss": 0.17509131133556366, "epoch": 1.9913275517011342, "step": 5970 }, { "distill_loss": 0.13901633024215698, "epoch": 1.9913275517011342, "step": 5970 }, { "epoch": 1.9913275517011342, "ref_ce_loss": 0.13561907410621643, "step": 5970 }, { "epoch": 1.9913275517011342, "loss": 0.7570385336875916, "step": 5970 }, { "ce_loss": 0.19282585382461548, "epoch": 1.9913275517011342, "step": 5970 }, { "distill_loss": 0.2022811472415924, "epoch": 1.9913275517011342, "step": 5970 }, { "epoch": 1.9913275517011342, "ref_ce_loss": 0.19970786571502686, "step": 5970 }, { "epoch": 1.9913275517011342, "loss": 0.686042070388794, "step": 5970 }, { "ce_loss": 0.19220466911792755, "epoch": 1.9913275517011342, "step": 5970 }, { "distill_loss": 0.22768956422805786, "epoch": 1.9913275517011342, "step": 5970 }, { "epoch": 1.9913275517011342, "ref_ce_loss": 0.12446795403957367, "step": 5970 }, { "epoch": 1.9946631087391595, "loss": 0.7199, "step": 5980 }, { "epoch": 1.9946631087391595, "grad_norm": 4.222412109375, "step": 5980 }, { "epoch": 1.9946631087391595, "learning_rate": 0.0002637234379382913, "step": 5980 }, { "epoch": 1.9946631087391595, "loss": 0.5888592004776001, "step": 5980 }, { "ce_loss": 0.11838914453983307, "epoch": 1.9946631087391595, "step": 5980 }, { "distill_loss": 0.16122058033943176, "epoch": 1.9946631087391595, "step": 5980 }, { "epoch": 1.9946631087391595, "ref_ce_loss": 0.12328799068927765, "step": 5980 }, { "epoch": 1.9946631087391595, "loss": 0.8882757425308228, "step": 5980 }, { "ce_loss": 0.2505829334259033, "epoch": 1.9946631087391595, "step": 5980 }, { "distill_loss": 0.33013272285461426, "epoch": 1.9946631087391595, "step": 5980 }, { "epoch": 1.9946631087391595, "ref_ce_loss": 0.184593066573143, "step": 5980 }, { "epoch": 1.9946631087391595, "loss": 0.6736380457878113, "step": 5980 }, { "ce_loss": 0.16651169955730438, "epoch": 1.9946631087391595, "step": 5980 }, { "distill_loss": 0.18379709124565125, "epoch": 1.9946631087391595, "step": 5980 }, { "epoch": 1.9946631087391595, "ref_ce_loss": 0.13057413697242737, "step": 5980 }, { "epoch": 1.9946631087391595, "loss": 1.0884795188903809, "step": 5980 }, { "ce_loss": 0.18364958465099335, "epoch": 1.9946631087391595, "step": 5980 }, { "distill_loss": 0.2071857750415802, "epoch": 1.9946631087391595, "step": 5980 }, { "epoch": 1.9946631087391595, "ref_ce_loss": 0.12370967864990234, "step": 5980 }, { "epoch": 1.9979986657771849, "loss": 0.6674, "step": 5990 }, { "epoch": 1.9979986657771849, "grad_norm": 2.313319683074951, "step": 5990 }, { "epoch": 1.9979986657771849, "learning_rate": 0.0002635912494650246, "step": 5990 }, { "epoch": 1.9979986657771849, "loss": 0.6827449202537537, "step": 5990 }, { "ce_loss": 0.18689732253551483, "epoch": 1.9979986657771849, "step": 5990 }, { "distill_loss": 0.19142132997512817, "epoch": 1.9979986657771849, "step": 5990 }, { "epoch": 1.9979986657771849, "ref_ce_loss": 0.08761729300022125, "step": 5990 }, { "epoch": 1.9979986657771849, "loss": 0.7562812566757202, "step": 5990 }, { "ce_loss": 0.2163257747888565, "epoch": 1.9979986657771849, "step": 5990 }, { "distill_loss": 0.2145807296037674, "epoch": 1.9979986657771849, "step": 5990 }, { "epoch": 1.9979986657771849, "ref_ce_loss": 0.1543344110250473, "step": 5990 }, { "epoch": 1.9979986657771849, "loss": 0.48378419876098633, "step": 5990 }, { "ce_loss": 0.1646513193845749, "epoch": 1.9979986657771849, "step": 5990 }, { "distill_loss": 0.19733795523643494, "epoch": 1.9979986657771849, "step": 5990 }, { "epoch": 1.9979986657771849, "ref_ce_loss": 0.12141157686710358, "step": 5990 }, { "epoch": 1.9979986657771849, "loss": 0.5468843579292297, "step": 5990 }, { "ce_loss": 0.18428729474544525, "epoch": 1.9979986657771849, "step": 5990 }, { "distill_loss": 0.2126753032207489, "epoch": 1.9979986657771849, "step": 5990 }, { "epoch": 1.9979986657771849, "ref_ce_loss": 0.11773700267076492, "step": 5990 }, { "epoch": 2.0013342228152102, "loss": 0.672, "step": 6000 }, { "epoch": 2.0013342228152102, "grad_norm": 3.492262840270996, "step": 6000 }, { "epoch": 2.0013342228152102, "learning_rate": 0.00026345885384612705, "step": 6000 }, { "epoch": 2.0013342228152102, "loss": 0.6723750233650208, "step": 6000 }, { "ce_loss": 0.18430662155151367, "epoch": 2.0013342228152102, "step": 6000 }, { "distill_loss": 0.2313898801803589, "epoch": 2.0013342228152102, "step": 6000 }, { "epoch": 2.0013342228152102, "ref_ce_loss": 0.08441024273633957, "step": 6000 }, { "epoch": 2.0013342228152102, "loss": 0.5557563304901123, "step": 6000 }, { "ce_loss": 0.11034717410802841, "epoch": 2.0013342228152102, "step": 6000 }, { "distill_loss": 0.15689697861671448, "epoch": 2.0013342228152102, "step": 6000 }, { "epoch": 2.0013342228152102, "ref_ce_loss": 0.11964371055364609, "step": 6000 }, { "epoch": 2.0013342228152102, "loss": 0.6583637595176697, "step": 6000 }, { "ce_loss": 0.20213940739631653, "epoch": 2.0013342228152102, "step": 6000 }, { "distill_loss": 0.16685333847999573, "epoch": 2.0013342228152102, "step": 6000 }, { "epoch": 2.0013342228152102, "ref_ce_loss": 0.12898676097393036, "step": 6000 }, { "epoch": 2.0013342228152102, "loss": 0.6852834820747375, "step": 6000 }, { "ce_loss": 0.2616039216518402, "epoch": 2.0013342228152102, "step": 6000 }, { "distill_loss": 0.1978679597377777, "epoch": 2.0013342228152102, "step": 6000 }, { "epoch": 2.0013342228152102, "ref_ce_loss": 0.161850243806839, "step": 6000 }, { "epoch": 2.0046697798532356, "loss": 0.6376, "step": 6010 }, { "epoch": 2.0046697798532356, "grad_norm": 2.2328505516052246, "step": 6010 }, { "epoch": 2.0046697798532356, "learning_rate": 0.00026332625132303593, "step": 6010 }, { "epoch": 2.0046697798532356, "loss": 0.6028956174850464, "step": 6010 }, { "ce_loss": 0.09909051656723022, "epoch": 2.0046697798532356, "step": 6010 }, { "distill_loss": 0.12309257686138153, "epoch": 2.0046697798532356, "step": 6010 }, { "epoch": 2.0046697798532356, "ref_ce_loss": 0.11756174266338348, "step": 6010 }, { "epoch": 2.0046697798532356, "loss": 0.5768246054649353, "step": 6010 }, { "ce_loss": 0.21978306770324707, "epoch": 2.0046697798532356, "step": 6010 }, { "distill_loss": 0.1698399931192398, "epoch": 2.0046697798532356, "step": 6010 }, { "epoch": 2.0046697798532356, "ref_ce_loss": 0.14700907468795776, "step": 6010 }, { "epoch": 2.0046697798532356, "loss": 0.3713357448577881, "step": 6010 }, { "ce_loss": 0.08948267996311188, "epoch": 2.0046697798532356, "step": 6010 }, { "distill_loss": 0.12198400497436523, "epoch": 2.0046697798532356, "step": 6010 }, { "epoch": 2.0046697798532356, "ref_ce_loss": 0.10145905613899231, "step": 6010 }, { "epoch": 2.0046697798532356, "loss": 0.5854678750038147, "step": 6010 }, { "ce_loss": 0.1666143238544464, "epoch": 2.0046697798532356, "step": 6010 }, { "distill_loss": 0.19001466035842896, "epoch": 2.0046697798532356, "step": 6010 }, { "epoch": 2.0046697798532356, "ref_ce_loss": 0.1575281322002411, "step": 6010 }, { "epoch": 2.008005336891261, "loss": 0.5722, "step": 6020 }, { "epoch": 2.008005336891261, "grad_norm": 3.4229934215545654, "step": 6020 }, { "epoch": 2.008005336891261, "learning_rate": 0.0002631934421375659, "step": 6020 }, { "epoch": 2.008005336891261, "loss": 0.6702373027801514, "step": 6020 }, { "ce_loss": 0.2024572342634201, "epoch": 2.008005336891261, "step": 6020 }, { "distill_loss": 0.1616104245185852, "epoch": 2.008005336891261, "step": 6020 }, { "epoch": 2.008005336891261, "ref_ce_loss": 0.11883606016635895, "step": 6020 }, { "epoch": 2.008005336891261, "loss": 0.3642963171005249, "step": 6020 }, { "ce_loss": 0.08134083449840546, "epoch": 2.008005336891261, "step": 6020 }, { "distill_loss": 0.09039433300495148, "epoch": 2.008005336891261, "step": 6020 }, { "epoch": 2.008005336891261, "ref_ce_loss": 0.11555787175893784, "step": 6020 }, { "epoch": 2.008005336891261, "loss": 0.7688882946968079, "step": 6020 }, { "ce_loss": 0.22774671018123627, "epoch": 2.008005336891261, "step": 6020 }, { "distill_loss": 0.14509688317775726, "epoch": 2.008005336891261, "step": 6020 }, { "epoch": 2.008005336891261, "ref_ce_loss": 0.14968301355838776, "step": 6020 }, { "epoch": 2.008005336891261, "loss": 1.2047111988067627, "step": 6020 }, { "ce_loss": 0.17515689134597778, "epoch": 2.008005336891261, "step": 6020 }, { "distill_loss": 0.1401824802160263, "epoch": 2.008005336891261, "step": 6020 }, { "epoch": 2.008005336891261, "ref_ce_loss": 0.10154224187135696, "step": 6020 }, { "epoch": 2.0113408939292863, "loss": 0.5728, "step": 6030 }, { "epoch": 2.0113408939292863, "grad_norm": 2.178412437438965, "step": 6030 }, { "epoch": 2.0113408939292863, "learning_rate": 0.00026306042653190866, "step": 6030 }, { "epoch": 2.0113408939292863, "loss": 0.35371965169906616, "step": 6030 }, { "ce_loss": 0.11235601454973221, "epoch": 2.0113408939292863, "step": 6030 }, { "distill_loss": 0.09209266304969788, "epoch": 2.0113408939292863, "step": 6030 }, { "epoch": 2.0113408939292863, "ref_ce_loss": 0.09106268733739853, "step": 6030 }, { "epoch": 2.0113408939292863, "loss": 0.48361748456954956, "step": 6030 }, { "ce_loss": 0.13192753493785858, "epoch": 2.0113408939292863, "step": 6030 }, { "distill_loss": 0.12088339775800705, "epoch": 2.0113408939292863, "step": 6030 }, { "epoch": 2.0113408939292863, "ref_ce_loss": 0.18421778082847595, "step": 6030 }, { "epoch": 2.0113408939292863, "loss": 0.4473074674606323, "step": 6030 }, { "ce_loss": 0.1566929668188095, "epoch": 2.0113408939292863, "step": 6030 }, { "distill_loss": 0.10360036790370941, "epoch": 2.0113408939292863, "step": 6030 }, { "epoch": 2.0113408939292863, "ref_ce_loss": 0.14987316727638245, "step": 6030 }, { "epoch": 2.0113408939292863, "loss": 0.7593709230422974, "step": 6030 }, { "ce_loss": 0.2081209421157837, "epoch": 2.0113408939292863, "step": 6030 }, { "distill_loss": 0.11178325861692429, "epoch": 2.0113408939292863, "step": 6030 }, { "epoch": 2.0113408939292863, "ref_ce_loss": 0.10972130298614502, "step": 6030 }, { "epoch": 2.0146764509673116, "loss": 0.5387, "step": 6040 }, { "epoch": 2.0146764509673116, "grad_norm": 3.3054556846618652, "step": 6040 }, { "epoch": 2.0146764509673116, "learning_rate": 0.0002629272047486321, "step": 6040 }, { "epoch": 2.0146764509673116, "loss": 0.5572177767753601, "step": 6040 }, { "ce_loss": 0.2259761095046997, "epoch": 2.0146764509673116, "step": 6040 }, { "distill_loss": 0.12743355333805084, "epoch": 2.0146764509673116, "step": 6040 }, { "epoch": 2.0146764509673116, "ref_ce_loss": 0.12631645798683167, "step": 6040 }, { "epoch": 2.0146764509673116, "loss": 0.44857120513916016, "step": 6040 }, { "ce_loss": 0.16541236639022827, "epoch": 2.0146764509673116, "step": 6040 }, { "distill_loss": 0.09709923714399338, "epoch": 2.0146764509673116, "step": 6040 }, { "epoch": 2.0146764509673116, "ref_ce_loss": 0.13638761639595032, "step": 6040 }, { "epoch": 2.0146764509673116, "loss": 0.3600292205810547, "step": 6040 }, { "ce_loss": 0.17759163677692413, "epoch": 2.0146764509673116, "step": 6040 }, { "distill_loss": 0.09841334074735641, "epoch": 2.0146764509673116, "step": 6040 }, { "epoch": 2.0146764509673116, "ref_ce_loss": 0.08374278247356415, "step": 6040 }, { "epoch": 2.0146764509673116, "loss": 1.024361252784729, "step": 6040 }, { "ce_loss": 0.1911391019821167, "epoch": 2.0146764509673116, "step": 6040 }, { "distill_loss": 0.13310448825359344, "epoch": 2.0146764509673116, "step": 6040 }, { "epoch": 2.0146764509673116, "ref_ce_loss": 0.11508171260356903, "step": 6040 }, { "epoch": 2.018012008005337, "loss": 0.5694, "step": 6050 }, { "epoch": 2.018012008005337, "grad_norm": 2.28583025932312, "step": 6050 }, { "epoch": 2.018012008005337, "learning_rate": 0.0002627937770306802, "step": 6050 }, { "epoch": 2.018012008005337, "loss": 0.4060389995574951, "step": 6050 }, { "ce_loss": 0.1576002985239029, "epoch": 2.018012008005337, "step": 6050 }, { "distill_loss": 0.11558900773525238, "epoch": 2.018012008005337, "step": 6050 }, { "epoch": 2.018012008005337, "ref_ce_loss": 0.09974104911088943, "step": 6050 }, { "epoch": 2.018012008005337, "loss": 0.5782639384269714, "step": 6050 }, { "ce_loss": 0.2112276256084442, "epoch": 2.018012008005337, "step": 6050 }, { "distill_loss": 0.1430872231721878, "epoch": 2.018012008005337, "step": 6050 }, { "epoch": 2.018012008005337, "ref_ce_loss": 0.1537550836801529, "step": 6050 }, { "epoch": 2.018012008005337, "loss": 0.6270675659179688, "step": 6050 }, { "ce_loss": 0.20577941834926605, "epoch": 2.018012008005337, "step": 6050 }, { "distill_loss": 0.15070682764053345, "epoch": 2.018012008005337, "step": 6050 }, { "epoch": 2.018012008005337, "ref_ce_loss": 0.15376164019107819, "step": 6050 }, { "epoch": 2.018012008005337, "loss": 0.42507204413414, "step": 6050 }, { "ce_loss": 0.1491844207048416, "epoch": 2.018012008005337, "step": 6050 }, { "distill_loss": 0.12194012105464935, "epoch": 2.018012008005337, "step": 6050 }, { "epoch": 2.018012008005337, "ref_ce_loss": 0.11052257567644119, "step": 6050 }, { "epoch": 2.0213475650433623, "loss": 0.5665, "step": 6060 }, { "epoch": 2.0213475650433623, "grad_norm": 2.7430214881896973, "step": 6060 }, { "epoch": 2.0213475650433623, "learning_rate": 0.0002626601436213725, "step": 6060 }, { "epoch": 2.0213475650433623, "loss": 0.39292192459106445, "step": 6060 }, { "ce_loss": 0.13748258352279663, "epoch": 2.0213475650433623, "step": 6060 }, { "distill_loss": 0.1628316044807434, "epoch": 2.0213475650433623, "step": 6060 }, { "epoch": 2.0213475650433623, "ref_ce_loss": 0.09246788918972015, "step": 6060 }, { "epoch": 2.0213475650433623, "loss": 0.6393857002258301, "step": 6060 }, { "ce_loss": 0.2324545830488205, "epoch": 2.0213475650433623, "step": 6060 }, { "distill_loss": 0.22455164790153503, "epoch": 2.0213475650433623, "step": 6060 }, { "epoch": 2.0213475650433623, "ref_ce_loss": 0.13880237936973572, "step": 6060 }, { "epoch": 2.0213475650433623, "loss": 0.3977337181568146, "step": 6060 }, { "ce_loss": 0.11556189507246017, "epoch": 2.0213475650433623, "step": 6060 }, { "distill_loss": 0.16048678755760193, "epoch": 2.0213475650433623, "step": 6060 }, { "epoch": 2.0213475650433623, "ref_ce_loss": 0.09150765091180801, "step": 6060 }, { "epoch": 2.0213475650433623, "loss": 0.6872441172599792, "step": 6060 }, { "ce_loss": 0.15894901752471924, "epoch": 2.0213475650433623, "step": 6060 }, { "distill_loss": 0.1782384216785431, "epoch": 2.0213475650433623, "step": 6060 }, { "epoch": 2.0213475650433623, "ref_ce_loss": 0.12021970003843307, "step": 6060 }, { "epoch": 2.0246831220813877, "loss": 0.6173, "step": 6070 }, { "epoch": 2.0246831220813877, "grad_norm": 3.887601137161255, "step": 6070 }, { "epoch": 2.0246831220813877, "learning_rate": 0.00026252630476440367, "step": 6070 }, { "epoch": 2.0246831220813877, "loss": 0.47213730216026306, "step": 6070 }, { "ce_loss": 0.12705136835575104, "epoch": 2.0246831220813877, "step": 6070 }, { "distill_loss": 0.16267943382263184, "epoch": 2.0246831220813877, "step": 6070 }, { "epoch": 2.0246831220813877, "ref_ce_loss": 0.062675841152668, "step": 6070 }, { "epoch": 2.0246831220813877, "loss": 0.5571383833885193, "step": 6070 }, { "ce_loss": 0.12326113879680634, "epoch": 2.0246831220813877, "step": 6070 }, { "distill_loss": 0.1477968990802765, "epoch": 2.0246831220813877, "step": 6070 }, { "epoch": 2.0246831220813877, "ref_ce_loss": 0.13798700273036957, "step": 6070 }, { "epoch": 2.0246831220813877, "loss": 0.5846275687217712, "step": 6070 }, { "ce_loss": 0.16185100376605988, "epoch": 2.0246831220813877, "step": 6070 }, { "distill_loss": 0.18354424834251404, "epoch": 2.0246831220813877, "step": 6070 }, { "epoch": 2.0246831220813877, "ref_ce_loss": 0.13381104171276093, "step": 6070 }, { "epoch": 2.0246831220813877, "loss": 0.863525390625, "step": 6070 }, { "ce_loss": 0.319327712059021, "epoch": 2.0246831220813877, "step": 6070 }, { "distill_loss": 0.27865153551101685, "epoch": 2.0246831220813877, "step": 6070 }, { "epoch": 2.0246831220813877, "ref_ce_loss": 0.19892120361328125, "step": 6070 }, { "epoch": 2.028018679119413, "loss": 0.6443, "step": 6080 }, { "epoch": 2.028018679119413, "grad_norm": 1.872769832611084, "step": 6080 }, { "epoch": 2.028018679119413, "learning_rate": 0.0002623922607038429, "step": 6080 }, { "epoch": 2.028018679119413, "loss": 0.41863390803337097, "step": 6080 }, { "ce_loss": 0.0914531722664833, "epoch": 2.028018679119413, "step": 6080 }, { "distill_loss": 0.22104503214359283, "epoch": 2.028018679119413, "step": 6080 }, { "epoch": 2.028018679119413, "ref_ce_loss": 0.10593371838331223, "step": 6080 }, { "epoch": 2.028018679119413, "loss": 0.44628605246543884, "step": 6080 }, { "ce_loss": 0.13750892877578735, "epoch": 2.028018679119413, "step": 6080 }, { "distill_loss": 0.18723967671394348, "epoch": 2.028018679119413, "step": 6080 }, { "epoch": 2.028018679119413, "ref_ce_loss": 0.08156903088092804, "step": 6080 }, { "epoch": 2.028018679119413, "loss": 0.6728360652923584, "step": 6080 }, { "ce_loss": 0.19420503079891205, "epoch": 2.028018679119413, "step": 6080 }, { "distill_loss": 0.20174559950828552, "epoch": 2.028018679119413, "step": 6080 }, { "epoch": 2.028018679119413, "ref_ce_loss": 0.13595017790794373, "step": 6080 }, { "epoch": 2.028018679119413, "loss": 1.2898201942443848, "step": 6080 }, { "ce_loss": 0.36071357131004333, "epoch": 2.028018679119413, "step": 6080 }, { "distill_loss": 0.251817524433136, "epoch": 2.028018679119413, "step": 6080 }, { "epoch": 2.028018679119413, "ref_ce_loss": 0.1717768758535385, "step": 6080 }, { "epoch": 2.0313542361574384, "loss": 0.6395, "step": 6090 }, { "epoch": 2.0313542361574384, "grad_norm": 3.3039674758911133, "step": 6090 }, { "epoch": 2.0313542361574384, "learning_rate": 0.00026225801168413377, "step": 6090 }, { "epoch": 2.0313542361574384, "loss": 0.5026826858520508, "step": 6090 }, { "ce_loss": 0.13728295266628265, "epoch": 2.0313542361574384, "step": 6090 }, { "distill_loss": 0.1531429886817932, "epoch": 2.0313542361574384, "step": 6090 }, { "epoch": 2.0313542361574384, "ref_ce_loss": 0.1067405492067337, "step": 6090 }, { "epoch": 2.0313542361574384, "loss": 0.552277684211731, "step": 6090 }, { "ce_loss": 0.17691931128501892, "epoch": 2.0313542361574384, "step": 6090 }, { "distill_loss": 0.16204652190208435, "epoch": 2.0313542361574384, "step": 6090 }, { "epoch": 2.0313542361574384, "ref_ce_loss": 0.11657890677452087, "step": 6090 }, { "epoch": 2.0313542361574384, "loss": 0.32724007964134216, "step": 6090 }, { "ce_loss": 0.05284838378429413, "epoch": 2.0313542361574384, "step": 6090 }, { "distill_loss": 0.12443891167640686, "epoch": 2.0313542361574384, "step": 6090 }, { "epoch": 2.0313542361574384, "ref_ce_loss": 0.149708092212677, "step": 6090 }, { "epoch": 2.0313542361574384, "loss": 0.3944059908390045, "step": 6090 }, { "ce_loss": 0.137527734041214, "epoch": 2.0313542361574384, "step": 6090 }, { "distill_loss": 0.16900943219661713, "epoch": 2.0313542361574384, "step": 6090 }, { "epoch": 2.0313542361574384, "ref_ce_loss": 0.08755436539649963, "step": 6090 }, { "epoch": 2.0346897931954637, "loss": 0.552, "step": 6100 }, { "epoch": 2.0346897931954637, "grad_norm": 4.813361167907715, "step": 6100 }, { "epoch": 2.0346897931954637, "learning_rate": 0.00026212355795009353, "step": 6100 }, { "epoch": 2.0346897931954637, "loss": 0.4744175970554352, "step": 6100 }, { "ce_loss": 0.14979687333106995, "epoch": 2.0346897931954637, "step": 6100 }, { "distill_loss": 0.1268942803144455, "epoch": 2.0346897931954637, "step": 6100 }, { "epoch": 2.0346897931954637, "ref_ce_loss": 0.14003252983093262, "step": 6100 }, { "epoch": 2.0346897931954637, "loss": 0.5676864981651306, "step": 6100 }, { "ce_loss": 0.16010522842407227, "epoch": 2.0346897931954637, "step": 6100 }, { "distill_loss": 0.11075478047132492, "epoch": 2.0346897931954637, "step": 6100 }, { "epoch": 2.0346897931954637, "ref_ce_loss": 0.13726557791233063, "step": 6100 }, { "epoch": 2.0346897931954637, "loss": 0.5984420776367188, "step": 6100 }, { "ce_loss": 0.1581433117389679, "epoch": 2.0346897931954637, "step": 6100 }, { "distill_loss": 0.15865829586982727, "epoch": 2.0346897931954637, "step": 6100 }, { "epoch": 2.0346897931954637, "ref_ce_loss": 0.1082833856344223, "step": 6100 }, { "epoch": 2.0346897931954637, "loss": 0.5503508448600769, "step": 6100 }, { "ce_loss": 0.16133596003055573, "epoch": 2.0346897931954637, "step": 6100 }, { "distill_loss": 0.11832837760448456, "epoch": 2.0346897931954637, "step": 6100 }, { "epoch": 2.0346897931954637, "ref_ce_loss": 0.08712395280599594, "step": 6100 }, { "epoch": 2.038025350233489, "loss": 0.5482, "step": 6110 }, { "epoch": 2.038025350233489, "grad_norm": 5.132610321044922, "step": 6110 }, { "epoch": 2.038025350233489, "learning_rate": 0.00026198889974691266, "step": 6110 }, { "epoch": 2.038025350233489, "loss": 0.6514615416526794, "step": 6110 }, { "ce_loss": 0.16826602816581726, "epoch": 2.038025350233489, "step": 6110 }, { "distill_loss": 0.1439044177532196, "epoch": 2.038025350233489, "step": 6110 }, { "epoch": 2.038025350233489, "ref_ce_loss": 0.1011396050453186, "step": 6110 }, { "epoch": 2.038025350233489, "loss": 0.5035021901130676, "step": 6110 }, { "ce_loss": 0.17770890891551971, "epoch": 2.038025350233489, "step": 6110 }, { "distill_loss": 0.1277397722005844, "epoch": 2.038025350233489, "step": 6110 }, { "epoch": 2.038025350233489, "ref_ce_loss": 0.14741405844688416, "step": 6110 }, { "epoch": 2.038025350233489, "loss": 0.7693830728530884, "step": 6110 }, { "ce_loss": 0.20457173883914948, "epoch": 2.038025350233489, "step": 6110 }, { "distill_loss": 0.11660545319318771, "epoch": 2.038025350233489, "step": 6110 }, { "epoch": 2.038025350233489, "ref_ce_loss": 0.16994109749794006, "step": 6110 }, { "epoch": 2.038025350233489, "loss": 0.36567243933677673, "step": 6110 }, { "ce_loss": 0.11716575175523758, "epoch": 2.038025350233489, "step": 6110 }, { "distill_loss": 0.09368336945772171, "epoch": 2.038025350233489, "step": 6110 }, { "epoch": 2.038025350233489, "ref_ce_loss": 0.10542420297861099, "step": 6110 }, { "epoch": 2.0413609072715144, "loss": 0.5739, "step": 6120 }, { "epoch": 2.0413609072715144, "grad_norm": 2.7173047065734863, "step": 6120 }, { "epoch": 2.0413609072715144, "learning_rate": 0.00026185403732015473, "step": 6120 }, { "epoch": 2.0413609072715144, "loss": 0.5491402745246887, "step": 6120 }, { "ce_loss": 0.2195517122745514, "epoch": 2.0413609072715144, "step": 6120 }, { "distill_loss": 0.11219502240419388, "epoch": 2.0413609072715144, "step": 6120 }, { "epoch": 2.0413609072715144, "ref_ce_loss": 0.12330888211727142, "step": 6120 }, { "epoch": 2.0413609072715144, "loss": 0.6513844132423401, "step": 6120 }, { "ce_loss": 0.2567315697669983, "epoch": 2.0413609072715144, "step": 6120 }, { "distill_loss": 0.11513234674930573, "epoch": 2.0413609072715144, "step": 6120 }, { "epoch": 2.0413609072715144, "ref_ce_loss": 0.17506708204746246, "step": 6120 }, { "epoch": 2.0413609072715144, "loss": 0.6120648384094238, "step": 6120 }, { "ce_loss": 0.19124412536621094, "epoch": 2.0413609072715144, "step": 6120 }, { "distill_loss": 0.10679781436920166, "epoch": 2.0413609072715144, "step": 6120 }, { "epoch": 2.0413609072715144, "ref_ce_loss": 0.1378684788942337, "step": 6120 }, { "epoch": 2.0413609072715144, "loss": 0.3801417648792267, "step": 6120 }, { "ce_loss": 0.149167001247406, "epoch": 2.0413609072715144, "step": 6120 }, { "distill_loss": 0.12198339402675629, "epoch": 2.0413609072715144, "step": 6120 }, { "epoch": 2.0413609072715144, "ref_ce_loss": 0.08929223567247391, "step": 6120 }, { "epoch": 2.0446964643095398, "loss": 0.5375, "step": 6130 }, { "epoch": 2.0446964643095398, "grad_norm": 2.9968857765197754, "step": 6130 }, { "epoch": 2.0446964643095398, "learning_rate": 0.0002617189709157555, "step": 6130 }, { "epoch": 2.0446964643095398, "loss": 0.5337057113647461, "step": 6130 }, { "ce_loss": 0.216720849275589, "epoch": 2.0446964643095398, "step": 6130 }, { "distill_loss": 0.13213863968849182, "epoch": 2.0446964643095398, "step": 6130 }, { "epoch": 2.0446964643095398, "ref_ce_loss": 0.14722806215286255, "step": 6130 }, { "epoch": 2.0446964643095398, "loss": 0.4620002508163452, "step": 6130 }, { "ce_loss": 0.13662521541118622, "epoch": 2.0446964643095398, "step": 6130 }, { "distill_loss": 0.0959618091583252, "epoch": 2.0446964643095398, "step": 6130 }, { "epoch": 2.0446964643095398, "ref_ce_loss": 0.12215406447649002, "step": 6130 }, { "epoch": 2.0446964643095398, "loss": 0.4098561406135559, "step": 6130 }, { "ce_loss": 0.12565305829048157, "epoch": 2.0446964643095398, "step": 6130 }, { "distill_loss": 0.10984811186790466, "epoch": 2.0446964643095398, "step": 6130 }, { "epoch": 2.0446964643095398, "ref_ce_loss": 0.13379180431365967, "step": 6130 }, { "epoch": 2.0446964643095398, "loss": 0.4601963758468628, "step": 6130 }, { "ce_loss": 0.11863856762647629, "epoch": 2.0446964643095398, "step": 6130 }, { "distill_loss": 0.08010222762823105, "epoch": 2.0446964643095398, "step": 6130 }, { "epoch": 2.0446964643095398, "ref_ce_loss": 0.12990163266658783, "step": 6130 }, { "epoch": 2.048032021347565, "loss": 0.5323, "step": 6140 }, { "epoch": 2.048032021347565, "grad_norm": 2.794053077697754, "step": 6140 }, { "epoch": 2.048032021347565, "learning_rate": 0.0002615837007800229, "step": 6140 }, { "epoch": 2.048032021347565, "loss": 0.7886110544204712, "step": 6140 }, { "ce_loss": 0.15798018872737885, "epoch": 2.048032021347565, "step": 6140 }, { "distill_loss": 0.17019721865653992, "epoch": 2.048032021347565, "step": 6140 }, { "epoch": 2.048032021347565, "ref_ce_loss": 0.13794007897377014, "step": 6140 }, { "epoch": 2.048032021347565, "loss": 0.4465550482273102, "step": 6140 }, { "ce_loss": 0.17327573895454407, "epoch": 2.048032021347565, "step": 6140 }, { "distill_loss": 0.12840725481510162, "epoch": 2.048032021347565, "step": 6140 }, { "epoch": 2.048032021347565, "ref_ce_loss": 0.10248830169439316, "step": 6140 }, { "epoch": 2.048032021347565, "loss": 0.5394412279129028, "step": 6140 }, { "ce_loss": 0.14059796929359436, "epoch": 2.048032021347565, "step": 6140 }, { "distill_loss": 0.1589897871017456, "epoch": 2.048032021347565, "step": 6140 }, { "epoch": 2.048032021347565, "ref_ce_loss": 0.12873795628547668, "step": 6140 }, { "epoch": 2.048032021347565, "loss": 0.653471827507019, "step": 6140 }, { "ce_loss": 0.2507249712944031, "epoch": 2.048032021347565, "step": 6140 }, { "distill_loss": 0.15221871435642242, "epoch": 2.048032021347565, "step": 6140 }, { "epoch": 2.048032021347565, "ref_ce_loss": 0.12897253036499023, "step": 6140 }, { "epoch": 2.0513675783855905, "loss": 0.6209, "step": 6150 }, { "epoch": 2.0513675783855905, "grad_norm": 16.627849578857422, "step": 6150 }, { "epoch": 2.0513675783855905, "learning_rate": 0.00026144822715963627, "step": 6150 }, { "epoch": 2.0513675783855905, "loss": 0.5434788465499878, "step": 6150 }, { "ce_loss": 0.1266670972108841, "epoch": 2.0513675783855905, "step": 6150 }, { "distill_loss": 0.25511863827705383, "epoch": 2.0513675783855905, "step": 6150 }, { "epoch": 2.0513675783855905, "ref_ce_loss": 0.09655553847551346, "step": 6150 }, { "epoch": 2.0513675783855905, "loss": 0.7020452618598938, "step": 6150 }, { "ce_loss": 0.1957281082868576, "epoch": 2.0513675783855905, "step": 6150 }, { "distill_loss": 0.279159814119339, "epoch": 2.0513675783855905, "step": 6150 }, { "epoch": 2.0513675783855905, "ref_ce_loss": 0.09773007780313492, "step": 6150 }, { "epoch": 2.0513675783855905, "loss": 0.589733898639679, "step": 6150 }, { "ce_loss": 0.2290581315755844, "epoch": 2.0513675783855905, "step": 6150 }, { "distill_loss": 0.2273220419883728, "epoch": 2.0513675783855905, "step": 6150 }, { "epoch": 2.0513675783855905, "ref_ce_loss": 0.13324572145938873, "step": 6150 }, { "epoch": 2.0513675783855905, "loss": 0.6807817816734314, "step": 6150 }, { "ce_loss": 0.21881437301635742, "epoch": 2.0513675783855905, "step": 6150 }, { "distill_loss": 0.31649279594421387, "epoch": 2.0513675783855905, "step": 6150 }, { "epoch": 2.0513675783855905, "ref_ce_loss": 0.14447903633117676, "step": 6150 }, { "epoch": 2.054703135423616, "loss": 0.6239, "step": 6160 }, { "epoch": 2.054703135423616, "grad_norm": 3.414583206176758, "step": 6160 }, { "epoch": 2.054703135423616, "learning_rate": 0.000261312550301646, "step": 6160 }, { "epoch": 2.054703135423616, "loss": 0.7235752940177917, "step": 6160 }, { "ce_loss": 0.26676416397094727, "epoch": 2.054703135423616, "step": 6160 }, { "distill_loss": 0.2171621322631836, "epoch": 2.054703135423616, "step": 6160 }, { "epoch": 2.054703135423616, "ref_ce_loss": 0.1268942803144455, "step": 6160 }, { "epoch": 2.054703135423616, "loss": 0.5398198962211609, "step": 6160 }, { "ce_loss": 0.17983555793762207, "epoch": 2.054703135423616, "step": 6160 }, { "distill_loss": 0.22419527173042297, "epoch": 2.054703135423616, "step": 6160 }, { "epoch": 2.054703135423616, "ref_ce_loss": 0.08115441352128983, "step": 6160 }, { "epoch": 2.054703135423616, "loss": 0.5893262624740601, "step": 6160 }, { "ce_loss": 0.16456808149814606, "epoch": 2.054703135423616, "step": 6160 }, { "distill_loss": 0.13162600994110107, "epoch": 2.054703135423616, "step": 6160 }, { "epoch": 2.054703135423616, "ref_ce_loss": 0.13405293226242065, "step": 6160 }, { "epoch": 2.054703135423616, "loss": 0.5403462052345276, "step": 6160 }, { "ce_loss": 0.19443656504154205, "epoch": 2.054703135423616, "step": 6160 }, { "distill_loss": 0.17818742990493774, "epoch": 2.054703135423616, "step": 6160 }, { "epoch": 2.054703135423616, "ref_ce_loss": 0.13629718124866486, "step": 6160 }, { "epoch": 2.058038692461641, "loss": 0.5894, "step": 6170 }, { "epoch": 2.058038692461641, "grad_norm": 2.5601234436035156, "step": 6170 }, { "epoch": 2.058038692461641, "learning_rate": 0.0002611766704534732, "step": 6170 }, { "epoch": 2.058038692461641, "loss": 0.49508172273635864, "step": 6170 }, { "ce_loss": 0.14286686480045319, "epoch": 2.058038692461641, "step": 6170 }, { "distill_loss": 0.14650699496269226, "epoch": 2.058038692461641, "step": 6170 }, { "epoch": 2.058038692461641, "ref_ce_loss": 0.15913353860378265, "step": 6170 }, { "epoch": 2.058038692461641, "loss": 0.3224010467529297, "step": 6170 }, { "ce_loss": 0.11369533091783524, "epoch": 2.058038692461641, "step": 6170 }, { "distill_loss": 0.13524405658245087, "epoch": 2.058038692461641, "step": 6170 }, { "epoch": 2.058038692461641, "ref_ce_loss": 0.07325857877731323, "step": 6170 }, { "epoch": 2.058038692461641, "loss": 0.43581756949424744, "step": 6170 }, { "ce_loss": 0.13941310346126556, "epoch": 2.058038692461641, "step": 6170 }, { "distill_loss": 0.17731405794620514, "epoch": 2.058038692461641, "step": 6170 }, { "epoch": 2.058038692461641, "ref_ce_loss": 0.08746254444122314, "step": 6170 }, { "epoch": 2.058038692461641, "loss": 0.5797979235649109, "step": 6170 }, { "ce_loss": 0.15138761699199677, "epoch": 2.058038692461641, "step": 6170 }, { "distill_loss": 0.1986589878797531, "epoch": 2.058038692461641, "step": 6170 }, { "epoch": 2.058038692461641, "ref_ce_loss": 0.11812014132738113, "step": 6170 }, { "epoch": 2.0613742494996665, "loss": 0.5313, "step": 6180 }, { "epoch": 2.0613742494996665, "grad_norm": 2.2964236736297607, "step": 6180 }, { "epoch": 2.0613742494996665, "learning_rate": 0.00026104058786290905, "step": 6180 }, { "epoch": 2.0613742494996665, "loss": 0.44557082653045654, "step": 6180 }, { "ce_loss": 0.12464036047458649, "epoch": 2.0613742494996665, "step": 6180 }, { "distill_loss": 0.10614349693059921, "epoch": 2.0613742494996665, "step": 6180 }, { "epoch": 2.0613742494996665, "ref_ce_loss": 0.09893155843019485, "step": 6180 }, { "epoch": 2.0613742494996665, "loss": 0.522710919380188, "step": 6180 }, { "ce_loss": 0.16048184037208557, "epoch": 2.0613742494996665, "step": 6180 }, { "distill_loss": 0.13187257945537567, "epoch": 2.0613742494996665, "step": 6180 }, { "epoch": 2.0613742494996665, "ref_ce_loss": 0.1442248374223709, "step": 6180 }, { "epoch": 2.0613742494996665, "loss": 0.6206411123275757, "step": 6180 }, { "ce_loss": 0.1626727133989334, "epoch": 2.0613742494996665, "step": 6180 }, { "distill_loss": 0.14576083421707153, "epoch": 2.0613742494996665, "step": 6180 }, { "epoch": 2.0613742494996665, "ref_ce_loss": 0.12461519986391068, "step": 6180 }, { "epoch": 2.0613742494996665, "loss": 0.7486047744750977, "step": 6180 }, { "ce_loss": 0.2025829553604126, "epoch": 2.0613742494996665, "step": 6180 }, { "distill_loss": 0.13131900131702423, "epoch": 2.0613742494996665, "step": 6180 }, { "epoch": 2.0613742494996665, "ref_ce_loss": 0.13520632684230804, "step": 6180 }, { "epoch": 2.064709806537692, "loss": 0.6143, "step": 6190 }, { "epoch": 2.064709806537692, "grad_norm": 2.8212482929229736, "step": 6190 }, { "epoch": 2.064709806537692, "learning_rate": 0.0002609043027781146, "step": 6190 }, { "epoch": 2.064709806537692, "loss": 0.5236290097236633, "step": 6190 }, { "ce_loss": 0.19522197544574738, "epoch": 2.064709806537692, "step": 6190 }, { "distill_loss": 0.10594858974218369, "epoch": 2.064709806537692, "step": 6190 }, { "epoch": 2.064709806537692, "ref_ce_loss": 0.11298041045665741, "step": 6190 }, { "epoch": 2.064709806537692, "loss": 0.42183634638786316, "step": 6190 }, { "ce_loss": 0.15197670459747314, "epoch": 2.064709806537692, "step": 6190 }, { "distill_loss": 0.11152088642120361, "epoch": 2.064709806537692, "step": 6190 }, { "epoch": 2.064709806537692, "ref_ce_loss": 0.15807226300239563, "step": 6190 }, { "epoch": 2.064709806537692, "loss": 0.671714186668396, "step": 6190 }, { "ce_loss": 0.26314622163772583, "epoch": 2.064709806537692, "step": 6190 }, { "distill_loss": 0.13445574045181274, "epoch": 2.064709806537692, "step": 6190 }, { "epoch": 2.064709806537692, "ref_ce_loss": 0.18136006593704224, "step": 6190 }, { "epoch": 2.064709806537692, "loss": 0.40071842074394226, "step": 6190 }, { "ce_loss": 0.19335207343101501, "epoch": 2.064709806537692, "step": 6190 }, { "distill_loss": 0.09611434489488602, "epoch": 2.064709806537692, "step": 6190 }, { "epoch": 2.064709806537692, "ref_ce_loss": 0.07939335703849792, "step": 6190 }, { "epoch": 2.068045363575717, "loss": 0.5268, "step": 6200 }, { "epoch": 2.068045363575717, "grad_norm": 4.135841369628906, "step": 6200 }, { "epoch": 2.068045363575717, "learning_rate": 0.00026076781544762015, "step": 6200 }, { "epoch": 2.068045363575717, "loss": 0.5495902299880981, "step": 6200 }, { "ce_loss": 0.2173391878604889, "epoch": 2.068045363575717, "step": 6200 }, { "distill_loss": 0.10663158446550369, "epoch": 2.068045363575717, "step": 6200 }, { "epoch": 2.068045363575717, "ref_ce_loss": 0.14941971004009247, "step": 6200 }, { "epoch": 2.068045363575717, "loss": 0.8383069038391113, "step": 6200 }, { "ce_loss": 0.12289828807115555, "epoch": 2.068045363575717, "step": 6200 }, { "distill_loss": 0.171888530254364, "epoch": 2.068045363575717, "step": 6200 }, { "epoch": 2.068045363575717, "ref_ce_loss": 0.13419800996780396, "step": 6200 }, { "epoch": 2.068045363575717, "loss": 0.5588772892951965, "step": 6200 }, { "ce_loss": 0.18625810742378235, "epoch": 2.068045363575717, "step": 6200 }, { "distill_loss": 0.14179158210754395, "epoch": 2.068045363575717, "step": 6200 }, { "epoch": 2.068045363575717, "ref_ce_loss": 0.11892954260110855, "step": 6200 }, { "epoch": 2.068045363575717, "loss": 0.808592677116394, "step": 6200 }, { "ce_loss": 0.20481625199317932, "epoch": 2.068045363575717, "step": 6200 }, { "distill_loss": 0.17151331901550293, "epoch": 2.068045363575717, "step": 6200 }, { "epoch": 2.068045363575717, "ref_ce_loss": 0.06582659482955933, "step": 6200 }, { "epoch": 2.0713809206137426, "loss": 0.6267, "step": 6210 }, { "epoch": 2.0713809206137426, "grad_norm": 3.014910936355591, "step": 6210 }, { "epoch": 2.0713809206137426, "learning_rate": 0.00026063112612032457, "step": 6210 }, { "epoch": 2.0713809206137426, "loss": 0.3734577000141144, "step": 6210 }, { "ce_loss": 0.09334731101989746, "epoch": 2.0713809206137426, "step": 6210 }, { "distill_loss": 0.1371557116508484, "epoch": 2.0713809206137426, "step": 6210 }, { "epoch": 2.0713809206137426, "ref_ce_loss": 0.09609896689653397, "step": 6210 }, { "epoch": 2.0713809206137426, "loss": 0.7269737124443054, "step": 6210 }, { "ce_loss": 0.27129364013671875, "epoch": 2.0713809206137426, "step": 6210 }, { "distill_loss": 0.23927046358585358, "epoch": 2.0713809206137426, "step": 6210 }, { "epoch": 2.0713809206137426, "ref_ce_loss": 0.12874388694763184, "step": 6210 }, { "epoch": 2.0713809206137426, "loss": 0.68470299243927, "step": 6210 }, { "ce_loss": 0.11388979107141495, "epoch": 2.0713809206137426, "step": 6210 }, { "distill_loss": 0.15361973643302917, "epoch": 2.0713809206137426, "step": 6210 }, { "epoch": 2.0713809206137426, "ref_ce_loss": 0.1297324299812317, "step": 6210 }, { "epoch": 2.0713809206137426, "loss": 1.0848585367202759, "step": 6210 }, { "ce_loss": 0.14715316891670227, "epoch": 2.0713809206137426, "step": 6210 }, { "distill_loss": 0.18684878945350647, "epoch": 2.0713809206137426, "step": 6210 }, { "epoch": 2.0713809206137426, "ref_ce_loss": 0.08337806910276413, "step": 6210 }, { "epoch": 2.074716477651768, "loss": 0.5362, "step": 6220 }, { "epoch": 2.074716477651768, "grad_norm": 3.284393072128296, "step": 6220 }, { "epoch": 2.074716477651768, "learning_rate": 0.00026049423504549544, "step": 6220 }, { "epoch": 2.074716477651768, "loss": 0.31834161281585693, "step": 6220 }, { "ce_loss": 0.079567089676857, "epoch": 2.074716477651768, "step": 6220 }, { "distill_loss": 0.15186390280723572, "epoch": 2.074716477651768, "step": 6220 }, { "epoch": 2.074716477651768, "ref_ce_loss": 0.08671994507312775, "step": 6220 }, { "epoch": 2.074716477651768, "loss": 0.8450188636779785, "step": 6220 }, { "ce_loss": 0.31348034739494324, "epoch": 2.074716477651768, "step": 6220 }, { "distill_loss": 0.2693202793598175, "epoch": 2.074716477651768, "step": 6220 }, { "epoch": 2.074716477651768, "ref_ce_loss": 0.18057137727737427, "step": 6220 }, { "epoch": 2.074716477651768, "loss": 0.6984924077987671, "step": 6220 }, { "ce_loss": 0.18971113860607147, "epoch": 2.074716477651768, "step": 6220 }, { "distill_loss": 0.14513225853443146, "epoch": 2.074716477651768, "step": 6220 }, { "epoch": 2.074716477651768, "ref_ce_loss": 0.11545384675264359, "step": 6220 }, { "epoch": 2.074716477651768, "loss": 0.43207088112831116, "step": 6220 }, { "ce_loss": 0.10766131430864334, "epoch": 2.074716477651768, "step": 6220 }, { "distill_loss": 0.14692091941833496, "epoch": 2.074716477651768, "step": 6220 }, { "epoch": 2.074716477651768, "ref_ce_loss": 0.08508320897817612, "step": 6220 }, { "epoch": 2.0780520346897933, "loss": 0.5354, "step": 6230 }, { "epoch": 2.0780520346897933, "grad_norm": 3.0783700942993164, "step": 6230 }, { "epoch": 2.0780520346897933, "learning_rate": 0.0002603571424727679, "step": 6230 }, { "epoch": 2.0780520346897933, "loss": 0.5964272618293762, "step": 6230 }, { "ce_loss": 0.22434349358081818, "epoch": 2.0780520346897933, "step": 6230 }, { "distill_loss": 0.095759816467762, "epoch": 2.0780520346897933, "step": 6230 }, { "epoch": 2.0780520346897933, "ref_ce_loss": 0.1369304656982422, "step": 6230 }, { "epoch": 2.0780520346897933, "loss": 0.4027251601219177, "step": 6230 }, { "ce_loss": 0.0775301530957222, "epoch": 2.0780520346897933, "step": 6230 }, { "distill_loss": 0.10245171189308167, "epoch": 2.0780520346897933, "step": 6230 }, { "epoch": 2.0780520346897933, "ref_ce_loss": 0.12542954087257385, "step": 6230 }, { "epoch": 2.0780520346897933, "loss": 0.38007214665412903, "step": 6230 }, { "ce_loss": 0.15331995487213135, "epoch": 2.0780520346897933, "step": 6230 }, { "distill_loss": 0.11959885060787201, "epoch": 2.0780520346897933, "step": 6230 }, { "epoch": 2.0780520346897933, "ref_ce_loss": 0.10672856867313385, "step": 6230 }, { "epoch": 2.0780520346897933, "loss": 0.5621980428695679, "step": 6230 }, { "ce_loss": 0.26177355647087097, "epoch": 2.0780520346897933, "step": 6230 }, { "distill_loss": 0.15812274813652039, "epoch": 2.0780520346897933, "step": 6230 }, { "epoch": 2.0780520346897933, "ref_ce_loss": 0.09878107905387878, "step": 6230 }, { "epoch": 2.0813875917278186, "loss": 0.5493, "step": 6240 }, { "epoch": 2.0813875917278186, "grad_norm": 2.6283464431762695, "step": 6240 }, { "epoch": 2.0813875917278186, "learning_rate": 0.00026021984865214493, "step": 6240 }, { "epoch": 2.0813875917278186, "loss": 0.516473650932312, "step": 6240 }, { "ce_loss": 0.22216933965682983, "epoch": 2.0813875917278186, "step": 6240 }, { "distill_loss": 0.14236842095851898, "epoch": 2.0813875917278186, "step": 6240 }, { "epoch": 2.0813875917278186, "ref_ce_loss": 0.11821052432060242, "step": 6240 }, { "epoch": 2.0813875917278186, "loss": 1.0167242288589478, "step": 6240 }, { "ce_loss": 0.1872362345457077, "epoch": 2.0813875917278186, "step": 6240 }, { "distill_loss": 0.1137826144695282, "epoch": 2.0813875917278186, "step": 6240 }, { "epoch": 2.0813875917278186, "ref_ce_loss": 0.07063769549131393, "step": 6240 }, { "epoch": 2.0813875917278186, "loss": 0.49501004815101624, "step": 6240 }, { "ce_loss": 0.19312813878059387, "epoch": 2.0813875917278186, "step": 6240 }, { "distill_loss": 0.1220867857336998, "epoch": 2.0813875917278186, "step": 6240 }, { "epoch": 2.0813875917278186, "ref_ce_loss": 0.11821332573890686, "step": 6240 }, { "epoch": 2.0813875917278186, "loss": 0.5460554361343384, "step": 6240 }, { "ce_loss": 0.1809922456741333, "epoch": 2.0813875917278186, "step": 6240 }, { "distill_loss": 0.14729411900043488, "epoch": 2.0813875917278186, "step": 6240 }, { "epoch": 2.0813875917278186, "ref_ce_loss": 0.1758243888616562, "step": 6240 }, { "epoch": 2.084723148765844, "loss": 0.5744, "step": 6250 }, { "epoch": 2.084723148765844, "grad_norm": 2.9297170639038086, "step": 6250 }, { "epoch": 2.084723148765844, "learning_rate": 0.00026008235383399614, "step": 6250 }, { "epoch": 2.084723148765844, "loss": 0.5299870371818542, "step": 6250 }, { "ce_loss": 0.20783375203609467, "epoch": 2.084723148765844, "step": 6250 }, { "distill_loss": 0.18256841599941254, "epoch": 2.084723148765844, "step": 6250 }, { "epoch": 2.084723148765844, "ref_ce_loss": 0.10592147707939148, "step": 6250 }, { "epoch": 2.084723148765844, "loss": 0.5724793672561646, "step": 6250 }, { "ce_loss": 0.13250119984149933, "epoch": 2.084723148765844, "step": 6250 }, { "distill_loss": 0.1532224714756012, "epoch": 2.084723148765844, "step": 6250 }, { "epoch": 2.084723148765844, "ref_ce_loss": 0.12249592691659927, "step": 6250 }, { "epoch": 2.084723148765844, "loss": 0.601777195930481, "step": 6250 }, { "ce_loss": 0.26903092861175537, "epoch": 2.084723148765844, "step": 6250 }, { "distill_loss": 0.16014234721660614, "epoch": 2.084723148765844, "step": 6250 }, { "epoch": 2.084723148765844, "ref_ce_loss": 0.1334620863199234, "step": 6250 }, { "epoch": 2.084723148765844, "loss": 0.5935980081558228, "step": 6250 }, { "ce_loss": 0.1840442568063736, "epoch": 2.084723148765844, "step": 6250 }, { "distill_loss": 0.1481340080499649, "epoch": 2.084723148765844, "step": 6250 }, { "epoch": 2.084723148765844, "ref_ce_loss": 0.15468862652778625, "step": 6250 }, { "epoch": 2.0880587058038693, "loss": 0.5446, "step": 6260 }, { "epoch": 2.0880587058038693, "grad_norm": 2.8151493072509766, "step": 6260 }, { "epoch": 2.0880587058038693, "learning_rate": 0.00025994465826905793, "step": 6260 }, { "epoch": 2.0880587058038693, "loss": 0.3203030228614807, "step": 6260 }, { "ce_loss": 0.12304674088954926, "epoch": 2.0880587058038693, "step": 6260 }, { "distill_loss": 0.11692545562982559, "epoch": 2.0880587058038693, "step": 6260 }, { "epoch": 2.0880587058038693, "ref_ce_loss": 0.08023111522197723, "step": 6260 }, { "epoch": 2.0880587058038693, "loss": 0.5208563804626465, "step": 6260 }, { "ce_loss": 0.19634287059307098, "epoch": 2.0880587058038693, "step": 6260 }, { "distill_loss": 0.11992079019546509, "epoch": 2.0880587058038693, "step": 6260 }, { "epoch": 2.0880587058038693, "ref_ce_loss": 0.15068615972995758, "step": 6260 }, { "epoch": 2.0880587058038693, "loss": 0.4007379114627838, "step": 6260 }, { "ce_loss": 0.13138967752456665, "epoch": 2.0880587058038693, "step": 6260 }, { "distill_loss": 0.10143236815929413, "epoch": 2.0880587058038693, "step": 6260 }, { "epoch": 2.0880587058038693, "ref_ce_loss": 0.11003472656011581, "step": 6260 }, { "epoch": 2.0880587058038693, "loss": 0.44156375527381897, "step": 6260 }, { "ce_loss": 0.15144360065460205, "epoch": 2.0880587058038693, "step": 6260 }, { "distill_loss": 0.10493350028991699, "epoch": 2.0880587058038693, "step": 6260 }, { "epoch": 2.0880587058038693, "ref_ce_loss": 0.12747961282730103, "step": 6260 }, { "epoch": 2.0913942628418947, "loss": 0.5177, "step": 6270 }, { "epoch": 2.0913942628418947, "grad_norm": 3.0881059169769287, "step": 6270 }, { "epoch": 2.0913942628418947, "learning_rate": 0.00025980676220843267, "step": 6270 }, { "epoch": 2.0913942628418947, "loss": 0.46956267952919006, "step": 6270 }, { "ce_loss": 0.18205973505973816, "epoch": 2.0913942628418947, "step": 6270 }, { "distill_loss": 0.1669866442680359, "epoch": 2.0913942628418947, "step": 6270 }, { "epoch": 2.0913942628418947, "ref_ce_loss": 0.11989451199769974, "step": 6270 }, { "epoch": 2.0913942628418947, "loss": 0.4681810140609741, "step": 6270 }, { "ce_loss": 0.16584455966949463, "epoch": 2.0913942628418947, "step": 6270 }, { "distill_loss": 0.1511474847793579, "epoch": 2.0913942628418947, "step": 6270 }, { "epoch": 2.0913942628418947, "ref_ce_loss": 0.10125041007995605, "step": 6270 }, { "epoch": 2.0913942628418947, "loss": 0.6433732509613037, "step": 6270 }, { "ce_loss": 0.23725828528404236, "epoch": 2.0913942628418947, "step": 6270 }, { "distill_loss": 0.12847734987735748, "epoch": 2.0913942628418947, "step": 6270 }, { "epoch": 2.0913942628418947, "ref_ce_loss": 0.1302829384803772, "step": 6270 }, { "epoch": 2.0913942628418947, "loss": 0.7147667407989502, "step": 6270 }, { "ce_loss": 0.3272174298763275, "epoch": 2.0913942628418947, "step": 6270 }, { "distill_loss": 0.18335974216461182, "epoch": 2.0913942628418947, "step": 6270 }, { "epoch": 2.0913942628418947, "ref_ce_loss": 0.1645212322473526, "step": 6270 }, { "epoch": 2.09472981987992, "loss": 0.5341, "step": 6280 }, { "epoch": 2.09472981987992, "grad_norm": 2.9068071842193604, "step": 6280 }, { "epoch": 2.09472981987992, "learning_rate": 0.0002596686659035884, "step": 6280 }, { "epoch": 2.09472981987992, "loss": 0.6974978446960449, "step": 6280 }, { "ce_loss": 0.2411918044090271, "epoch": 2.09472981987992, "step": 6280 }, { "distill_loss": 0.19004279375076294, "epoch": 2.09472981987992, "step": 6280 }, { "epoch": 2.09472981987992, "ref_ce_loss": 0.19078131020069122, "step": 6280 }, { "epoch": 2.09472981987992, "loss": 0.8441246151924133, "step": 6280 }, { "ce_loss": 0.1914375275373459, "epoch": 2.09472981987992, "step": 6280 }, { "distill_loss": 0.18291591107845306, "epoch": 2.09472981987992, "step": 6280 }, { "epoch": 2.09472981987992, "ref_ce_loss": 0.1954183131456375, "step": 6280 }, { "epoch": 2.09472981987992, "loss": 0.700549304485321, "step": 6280 }, { "ce_loss": 0.17326299846172333, "epoch": 2.09472981987992, "step": 6280 }, { "distill_loss": 0.22238874435424805, "epoch": 2.09472981987992, "step": 6280 }, { "epoch": 2.09472981987992, "ref_ce_loss": 0.1731567084789276, "step": 6280 }, { "epoch": 2.09472981987992, "loss": 0.5677235126495361, "step": 6280 }, { "ce_loss": 0.2340429276227951, "epoch": 2.09472981987992, "step": 6280 }, { "distill_loss": 0.13371285796165466, "epoch": 2.09472981987992, "step": 6280 }, { "epoch": 2.09472981987992, "ref_ce_loss": 0.14143046736717224, "step": 6280 }, { "epoch": 2.0980653769179454, "loss": 0.5686, "step": 6290 }, { "epoch": 2.0980653769179454, "grad_norm": 2.8393349647521973, "step": 6290 }, { "epoch": 2.0980653769179454, "learning_rate": 0.0002595303696063582, "step": 6290 }, { "epoch": 2.0980653769179454, "loss": 0.4551650583744049, "step": 6290 }, { "ce_loss": 0.1547398418188095, "epoch": 2.0980653769179454, "step": 6290 }, { "distill_loss": 0.11198326200246811, "epoch": 2.0980653769179454, "step": 6290 }, { "epoch": 2.0980653769179454, "ref_ce_loss": 0.1279253214597702, "step": 6290 }, { "epoch": 2.0980653769179454, "loss": 0.6482787132263184, "step": 6290 }, { "ce_loss": 0.25222861766815186, "epoch": 2.0980653769179454, "step": 6290 }, { "distill_loss": 0.13030463457107544, "epoch": 2.0980653769179454, "step": 6290 }, { "epoch": 2.0980653769179454, "ref_ce_loss": 0.14368154108524323, "step": 6290 }, { "epoch": 2.0980653769179454, "loss": 0.662798285484314, "step": 6290 }, { "ce_loss": 0.20530973374843597, "epoch": 2.0980653769179454, "step": 6290 }, { "distill_loss": 0.1648654192686081, "epoch": 2.0980653769179454, "step": 6290 }, { "epoch": 2.0980653769179454, "ref_ce_loss": 0.09448906034231186, "step": 6290 }, { "epoch": 2.0980653769179454, "loss": 0.5949462652206421, "step": 6290 }, { "ce_loss": 0.2017897069454193, "epoch": 2.0980653769179454, "step": 6290 }, { "distill_loss": 0.14443297684192657, "epoch": 2.0980653769179454, "step": 6290 }, { "epoch": 2.0980653769179454, "ref_ce_loss": 0.1371644288301468, "step": 6290 }, { "epoch": 2.1014009339559707, "loss": 0.5499, "step": 6300 }, { "epoch": 2.1014009339559707, "grad_norm": 3.2759594917297363, "step": 6300 }, { "epoch": 2.1014009339559707, "learning_rate": 0.0002593918735689401, "step": 6300 }, { "epoch": 2.1014009339559707, "loss": 0.4449702501296997, "step": 6300 }, { "ce_loss": 0.12942036986351013, "epoch": 2.1014009339559707, "step": 6300 }, { "distill_loss": 0.11134645342826843, "epoch": 2.1014009339559707, "step": 6300 }, { "epoch": 2.1014009339559707, "ref_ce_loss": 0.08583880960941315, "step": 6300 }, { "epoch": 2.1014009339559707, "loss": 0.6386305093765259, "step": 6300 }, { "ce_loss": 0.14869295060634613, "epoch": 2.1014009339559707, "step": 6300 }, { "distill_loss": 0.1496465802192688, "epoch": 2.1014009339559707, "step": 6300 }, { "epoch": 2.1014009339559707, "ref_ce_loss": 0.1405908614397049, "step": 6300 }, { "epoch": 2.1014009339559707, "loss": 1.184739351272583, "step": 6300 }, { "ce_loss": 0.17178437113761902, "epoch": 2.1014009339559707, "step": 6300 }, { "distill_loss": 0.10091239213943481, "epoch": 2.1014009339559707, "step": 6300 }, { "epoch": 2.1014009339559707, "ref_ce_loss": 0.12160031497478485, "step": 6300 }, { "epoch": 2.1014009339559707, "loss": 0.4029628336429596, "step": 6300 }, { "ce_loss": 0.09635565429925919, "epoch": 2.1014009339559707, "step": 6300 }, { "distill_loss": 0.10692538321018219, "epoch": 2.1014009339559707, "step": 6300 }, { "epoch": 2.1014009339559707, "ref_ce_loss": 0.12069247663021088, "step": 6300 }, { "epoch": 2.104736490993996, "loss": 0.5475, "step": 6310 }, { "epoch": 2.104736490993996, "grad_norm": 2.3113832473754883, "step": 6310 }, { "epoch": 2.104736490993996, "learning_rate": 0.0002592531780438962, "step": 6310 }, { "epoch": 2.104736490993996, "loss": 0.704979658126831, "step": 6310 }, { "ce_loss": 0.18846255540847778, "epoch": 2.104736490993996, "step": 6310 }, { "distill_loss": 0.10475876927375793, "epoch": 2.104736490993996, "step": 6310 }, { "epoch": 2.104736490993996, "ref_ce_loss": 0.20854370296001434, "step": 6310 }, { "epoch": 2.104736490993996, "loss": 0.6280515193939209, "step": 6310 }, { "ce_loss": 0.13968916237354279, "epoch": 2.104736490993996, "step": 6310 }, { "distill_loss": 0.10155282914638519, "epoch": 2.104736490993996, "step": 6310 }, { "epoch": 2.104736490993996, "ref_ce_loss": 0.10801265388727188, "step": 6310 }, { "epoch": 2.104736490993996, "loss": 0.4487094283103943, "step": 6310 }, { "ce_loss": 0.05168895050883293, "epoch": 2.104736490993996, "step": 6310 }, { "distill_loss": 0.09908200800418854, "epoch": 2.104736490993996, "step": 6310 }, { "epoch": 2.104736490993996, "ref_ce_loss": 0.07915835827589035, "step": 6310 }, { "epoch": 2.104736490993996, "loss": 0.5438628196716309, "step": 6310 }, { "ce_loss": 0.15742728114128113, "epoch": 2.104736490993996, "step": 6310 }, { "distill_loss": 0.1507430076599121, "epoch": 2.104736490993996, "step": 6310 }, { "epoch": 2.104736490993996, "ref_ce_loss": 0.1463308483362198, "step": 6310 }, { "epoch": 2.1080720480320214, "loss": 0.5094, "step": 6320 }, { "epoch": 2.1080720480320214, "grad_norm": 3.62620210647583, "step": 6320 }, { "epoch": 2.1080720480320214, "learning_rate": 0.0002591142832841524, "step": 6320 }, { "epoch": 2.1080720480320214, "loss": 0.5473156571388245, "step": 6320 }, { "ce_loss": 0.2167169153690338, "epoch": 2.1080720480320214, "step": 6320 }, { "distill_loss": 0.12036389857530594, "epoch": 2.1080720480320214, "step": 6320 }, { "epoch": 2.1080720480320214, "ref_ce_loss": 0.14263157546520233, "step": 6320 }, { "epoch": 2.1080720480320214, "loss": 0.5363081693649292, "step": 6320 }, { "ce_loss": 0.18843142688274384, "epoch": 2.1080720480320214, "step": 6320 }, { "distill_loss": 0.11299119144678116, "epoch": 2.1080720480320214, "step": 6320 }, { "epoch": 2.1080720480320214, "ref_ce_loss": 0.1542125791311264, "step": 6320 }, { "epoch": 2.1080720480320214, "loss": 0.5071948170661926, "step": 6320 }, { "ce_loss": 0.1818801909685135, "epoch": 2.1080720480320214, "step": 6320 }, { "distill_loss": 0.10943155735731125, "epoch": 2.1080720480320214, "step": 6320 }, { "epoch": 2.1080720480320214, "ref_ce_loss": 0.10278405249118805, "step": 6320 }, { "epoch": 2.1080720480320214, "loss": 0.5227207541465759, "step": 6320 }, { "ce_loss": 0.2548081874847412, "epoch": 2.1080720480320214, "step": 6320 }, { "distill_loss": 0.10464075952768326, "epoch": 2.1080720480320214, "step": 6320 }, { "epoch": 2.1080720480320214, "ref_ce_loss": 0.16315731406211853, "step": 6320 }, { "epoch": 2.1114076050700468, "loss": 0.5356, "step": 6330 }, { "epoch": 2.1114076050700468, "grad_norm": 2.8032443523406982, "step": 6330 }, { "epoch": 2.1114076050700468, "learning_rate": 0.0002589751895429979, "step": 6330 }, { "epoch": 2.1114076050700468, "loss": 0.5398804545402527, "step": 6330 }, { "ce_loss": 0.23482723534107208, "epoch": 2.1114076050700468, "step": 6330 }, { "distill_loss": 0.12033455818891525, "epoch": 2.1114076050700468, "step": 6330 }, { "epoch": 2.1114076050700468, "ref_ce_loss": 0.13513240218162537, "step": 6330 }, { "epoch": 2.1114076050700468, "loss": 0.5462663769721985, "step": 6330 }, { "ce_loss": 0.16122372448444366, "epoch": 2.1114076050700468, "step": 6330 }, { "distill_loss": 0.0924907997250557, "epoch": 2.1114076050700468, "step": 6330 }, { "epoch": 2.1114076050700468, "ref_ce_loss": 0.1783560961484909, "step": 6330 }, { "epoch": 2.1114076050700468, "loss": 0.7903030514717102, "step": 6330 }, { "ce_loss": 0.13481366634368896, "epoch": 2.1114076050700468, "step": 6330 }, { "distill_loss": 0.10490374267101288, "epoch": 2.1114076050700468, "step": 6330 }, { "epoch": 2.1114076050700468, "ref_ce_loss": 0.144536554813385, "step": 6330 }, { "epoch": 2.1114076050700468, "loss": 0.7620557546615601, "step": 6330 }, { "ce_loss": 0.2825464606285095, "epoch": 2.1114076050700468, "step": 6330 }, { "distill_loss": 0.11635222285985947, "epoch": 2.1114076050700468, "step": 6330 }, { "epoch": 2.1114076050700468, "ref_ce_loss": 0.1272297352552414, "step": 6330 }, { "epoch": 2.114743162108072, "loss": 0.5484, "step": 6340 }, { "epoch": 2.114743162108072, "grad_norm": 3.093912124633789, "step": 6340 }, { "epoch": 2.114743162108072, "learning_rate": 0.00025883589707408495, "step": 6340 }, { "epoch": 2.114743162108072, "loss": 0.44437944889068604, "step": 6340 }, { "ce_loss": 0.10807617753744125, "epoch": 2.114743162108072, "step": 6340 }, { "distill_loss": 0.08297376334667206, "epoch": 2.114743162108072, "step": 6340 }, { "epoch": 2.114743162108072, "ref_ce_loss": 0.12267817556858063, "step": 6340 }, { "epoch": 2.114743162108072, "loss": 0.4830673336982727, "step": 6340 }, { "ce_loss": 0.21065731346607208, "epoch": 2.114743162108072, "step": 6340 }, { "distill_loss": 0.13647867739200592, "epoch": 2.114743162108072, "step": 6340 }, { "epoch": 2.114743162108072, "ref_ce_loss": 0.10502800345420837, "step": 6340 }, { "epoch": 2.114743162108072, "loss": 0.6829445362091064, "step": 6340 }, { "ce_loss": 0.20690180361270905, "epoch": 2.114743162108072, "step": 6340 }, { "distill_loss": 0.10831039398908615, "epoch": 2.114743162108072, "step": 6340 }, { "epoch": 2.114743162108072, "ref_ce_loss": 0.14528189599514008, "step": 6340 }, { "epoch": 2.114743162108072, "loss": 0.39419299364089966, "step": 6340 }, { "ce_loss": 0.17007343471050262, "epoch": 2.114743162108072, "step": 6340 }, { "distill_loss": 0.10755617171525955, "epoch": 2.114743162108072, "step": 6340 }, { "epoch": 2.114743162108072, "ref_ce_loss": 0.11648072302341461, "step": 6340 }, { "epoch": 2.1180787191460975, "loss": 0.5393, "step": 6350 }, { "epoch": 2.1180787191460975, "grad_norm": 2.7056565284729004, "step": 6350 }, { "epoch": 2.1180787191460975, "learning_rate": 0.00025869640613142796, "step": 6350 }, { "epoch": 2.1180787191460975, "loss": 0.4408019483089447, "step": 6350 }, { "ce_loss": 0.1383562535047531, "epoch": 2.1180787191460975, "step": 6350 }, { "distill_loss": 0.12275628000497818, "epoch": 2.1180787191460975, "step": 6350 }, { "epoch": 2.1180787191460975, "ref_ce_loss": 0.06599508970975876, "step": 6350 }, { "epoch": 2.1180787191460975, "loss": 0.30788254737854004, "step": 6350 }, { "ce_loss": 0.10194658488035202, "epoch": 2.1180787191460975, "step": 6350 }, { "distill_loss": 0.09040798991918564, "epoch": 2.1180787191460975, "step": 6350 }, { "epoch": 2.1180787191460975, "ref_ce_loss": 0.06405435502529144, "step": 6350 }, { "epoch": 2.1180787191460975, "loss": 0.41332122683525085, "step": 6350 }, { "ce_loss": 0.13060200214385986, "epoch": 2.1180787191460975, "step": 6350 }, { "distill_loss": 0.13377758860588074, "epoch": 2.1180787191460975, "step": 6350 }, { "epoch": 2.1180787191460975, "ref_ce_loss": 0.11965243518352509, "step": 6350 }, { "epoch": 2.1180787191460975, "loss": 0.43371546268463135, "step": 6350 }, { "ce_loss": 0.15489931404590607, "epoch": 2.1180787191460975, "step": 6350 }, { "distill_loss": 0.13392747938632965, "epoch": 2.1180787191460975, "step": 6350 }, { "epoch": 2.1180787191460975, "ref_ce_loss": 0.11093331128358841, "step": 6350 }, { "epoch": 2.121414276184123, "loss": 0.5057, "step": 6360 }, { "epoch": 2.121414276184123, "grad_norm": 1.7513467073440552, "step": 6360 }, { "epoch": 2.121414276184123, "learning_rate": 0.00025855671696940345, "step": 6360 }, { "epoch": 2.121414276184123, "loss": 0.318505197763443, "step": 6360 }, { "ce_loss": 0.08526570349931717, "epoch": 2.121414276184123, "step": 6360 }, { "distill_loss": 0.07790465652942657, "epoch": 2.121414276184123, "step": 6360 }, { "epoch": 2.121414276184123, "ref_ce_loss": 0.08237326890230179, "step": 6360 }, { "epoch": 2.121414276184123, "loss": 0.42279934883117676, "step": 6360 }, { "ce_loss": 0.15253883600234985, "epoch": 2.121414276184123, "step": 6360 }, { "distill_loss": 0.09407106041908264, "epoch": 2.121414276184123, "step": 6360 }, { "epoch": 2.121414276184123, "ref_ce_loss": 0.09206009656190872, "step": 6360 }, { "epoch": 2.121414276184123, "loss": 0.8274306058883667, "step": 6360 }, { "ce_loss": 0.36078494787216187, "epoch": 2.121414276184123, "step": 6360 }, { "distill_loss": 0.15812142193317413, "epoch": 2.121414276184123, "step": 6360 }, { "epoch": 2.121414276184123, "ref_ce_loss": 0.15409733355045319, "step": 6360 }, { "epoch": 2.121414276184123, "loss": 0.4437326490879059, "step": 6360 }, { "ce_loss": 0.14433549344539642, "epoch": 2.121414276184123, "step": 6360 }, { "distill_loss": 0.0919932946562767, "epoch": 2.121414276184123, "step": 6360 }, { "epoch": 2.121414276184123, "ref_ce_loss": 0.1464773565530777, "step": 6360 }, { "epoch": 2.124749833222148, "loss": 0.5228, "step": 6370 }, { "epoch": 2.124749833222148, "grad_norm": 2.1484248638153076, "step": 6370 }, { "epoch": 2.124749833222148, "learning_rate": 0.0002584168298427493, "step": 6370 }, { "epoch": 2.124749833222148, "loss": 0.43399181962013245, "step": 6370 }, { "ce_loss": 0.15099979937076569, "epoch": 2.124749833222148, "step": 6370 }, { "distill_loss": 0.1152973547577858, "epoch": 2.124749833222148, "step": 6370 }, { "epoch": 2.124749833222148, "ref_ce_loss": 0.11432129144668579, "step": 6370 }, { "epoch": 2.124749833222148, "loss": 0.6378722786903381, "step": 6370 }, { "ce_loss": 0.1807975023984909, "epoch": 2.124749833222148, "step": 6370 }, { "distill_loss": 0.08343417942523956, "epoch": 2.124749833222148, "step": 6370 }, { "epoch": 2.124749833222148, "ref_ce_loss": 0.17199712991714478, "step": 6370 }, { "epoch": 2.124749833222148, "loss": 0.2268414944410324, "step": 6370 }, { "ce_loss": 0.066691555082798, "epoch": 2.124749833222148, "step": 6370 }, { "distill_loss": 0.07913003861904144, "epoch": 2.124749833222148, "step": 6370 }, { "epoch": 2.124749833222148, "ref_ce_loss": 0.08088640868663788, "step": 6370 }, { "epoch": 2.124749833222148, "loss": 1.0655454397201538, "step": 6370 }, { "ce_loss": 0.12815611064434052, "epoch": 2.124749833222148, "step": 6370 }, { "distill_loss": 0.09509027004241943, "epoch": 2.124749833222148, "step": 6370 }, { "epoch": 2.124749833222148, "ref_ce_loss": 0.11971811205148697, "step": 6370 }, { "epoch": 2.1280853902601735, "loss": 0.5187, "step": 6380 }, { "epoch": 2.1280853902601735, "grad_norm": 2.40598464012146, "step": 6380 }, { "epoch": 2.1280853902601735, "learning_rate": 0.00025827674500656446, "step": 6380 }, { "epoch": 2.1280853902601735, "loss": 0.6937955617904663, "step": 6380 }, { "ce_loss": 0.16688190400600433, "epoch": 2.1280853902601735, "step": 6380 }, { "distill_loss": 0.10379371047019958, "epoch": 2.1280853902601735, "step": 6380 }, { "epoch": 2.1280853902601735, "ref_ce_loss": 0.15276721119880676, "step": 6380 }, { "epoch": 2.1280853902601735, "loss": 0.42396658658981323, "step": 6380 }, { "ce_loss": 0.08682675659656525, "epoch": 2.1280853902601735, "step": 6380 }, { "distill_loss": 0.10008202493190765, "epoch": 2.1280853902601735, "step": 6380 }, { "epoch": 2.1280853902601735, "ref_ce_loss": 0.10740143060684204, "step": 6380 }, { "epoch": 2.1280853902601735, "loss": 1.1396608352661133, "step": 6380 }, { "ce_loss": 0.22738853096961975, "epoch": 2.1280853902601735, "step": 6380 }, { "distill_loss": 0.14909625053405762, "epoch": 2.1280853902601735, "step": 6380 }, { "epoch": 2.1280853902601735, "ref_ce_loss": 0.20061470568180084, "step": 6380 }, { "epoch": 2.1280853902601735, "loss": 0.5962453484535217, "step": 6380 }, { "ce_loss": 0.1423494815826416, "epoch": 2.1280853902601735, "step": 6380 }, { "distill_loss": 0.09867765009403229, "epoch": 2.1280853902601735, "step": 6380 }, { "epoch": 2.1280853902601735, "ref_ce_loss": 0.16645397245883942, "step": 6380 }, { "epoch": 2.131420947298199, "loss": 0.6273, "step": 6390 }, { "epoch": 2.131420947298199, "grad_norm": 6.128455638885498, "step": 6390 }, { "epoch": 2.131420947298199, "learning_rate": 0.0002581364627163084, "step": 6390 }, { "epoch": 2.131420947298199, "loss": 0.45542195439338684, "step": 6390 }, { "ce_loss": 0.15142342448234558, "epoch": 2.131420947298199, "step": 6390 }, { "distill_loss": 0.09832875430583954, "epoch": 2.131420947298199, "step": 6390 }, { "epoch": 2.131420947298199, "ref_ce_loss": 0.08714140206575394, "step": 6390 }, { "epoch": 2.131420947298199, "loss": 0.4087405800819397, "step": 6390 }, { "ce_loss": 0.13376466929912567, "epoch": 2.131420947298199, "step": 6390 }, { "distill_loss": 0.09935194253921509, "epoch": 2.131420947298199, "step": 6390 }, { "epoch": 2.131420947298199, "ref_ce_loss": 0.12032655626535416, "step": 6390 }, { "epoch": 2.131420947298199, "loss": 0.4319303631782532, "step": 6390 }, { "ce_loss": 0.20519354939460754, "epoch": 2.131420947298199, "step": 6390 }, { "distill_loss": 0.13323552906513214, "epoch": 2.131420947298199, "step": 6390 }, { "epoch": 2.131420947298199, "ref_ce_loss": 0.09306564927101135, "step": 6390 }, { "epoch": 2.131420947298199, "loss": 0.540589451789856, "step": 6390 }, { "ce_loss": 0.1891559511423111, "epoch": 2.131420947298199, "step": 6390 }, { "distill_loss": 0.14564424753189087, "epoch": 2.131420947298199, "step": 6390 }, { "epoch": 2.131420947298199, "ref_ce_loss": 0.16008201241493225, "step": 6390 }, { "epoch": 2.134756504336224, "loss": 0.5359, "step": 6400 }, { "epoch": 2.134756504336224, "grad_norm": 2.3692429065704346, "step": 6400 }, { "epoch": 2.134756504336224, "learning_rate": 0.0002579959832278007, "step": 6400 }, { "epoch": 2.134756504336224, "loss": 0.8921303749084473, "step": 6400 }, { "ce_loss": 0.13139651715755463, "epoch": 2.134756504336224, "step": 6400 }, { "distill_loss": 0.11276914924383163, "epoch": 2.134756504336224, "step": 6400 }, { "epoch": 2.134756504336224, "ref_ce_loss": 0.18120615184307098, "step": 6400 }, { "epoch": 2.134756504336224, "loss": 0.4045407772064209, "step": 6400 }, { "ce_loss": 0.19195307791233063, "epoch": 2.134756504336224, "step": 6400 }, { "distill_loss": 0.10690625756978989, "epoch": 2.134756504336224, "step": 6400 }, { "epoch": 2.134756504336224, "ref_ce_loss": 0.10556533932685852, "step": 6400 }, { "epoch": 2.134756504336224, "loss": 0.46520286798477173, "step": 6400 }, { "ce_loss": 0.23439905047416687, "epoch": 2.134756504336224, "step": 6400 }, { "distill_loss": 0.10840963572263718, "epoch": 2.134756504336224, "step": 6400 }, { "epoch": 2.134756504336224, "ref_ce_loss": 0.08725638687610626, "step": 6400 }, { "epoch": 2.134756504336224, "loss": 0.7589808106422424, "step": 6400 }, { "ce_loss": 0.16259820759296417, "epoch": 2.134756504336224, "step": 6400 }, { "distill_loss": 0.1078663095831871, "epoch": 2.134756504336224, "step": 6400 }, { "epoch": 2.134756504336224, "ref_ce_loss": 0.1864003986120224, "step": 6400 }, { "epoch": 2.1380920613742496, "loss": 0.5514, "step": 6410 }, { "epoch": 2.1380920613742496, "grad_norm": 4.490102767944336, "step": 6410 }, { "epoch": 2.1380920613742496, "learning_rate": 0.0002578553067972205, "step": 6410 }, { "epoch": 2.1380920613742496, "loss": 0.46829667687416077, "step": 6410 }, { "ce_loss": 0.1818871796131134, "epoch": 2.1380920613742496, "step": 6410 }, { "distill_loss": 0.11202502250671387, "epoch": 2.1380920613742496, "step": 6410 }, { "epoch": 2.1380920613742496, "ref_ce_loss": 0.12546919286251068, "step": 6410 }, { "epoch": 2.1380920613742496, "loss": 0.48561179637908936, "step": 6410 }, { "ce_loss": 0.17383486032485962, "epoch": 2.1380920613742496, "step": 6410 }, { "distill_loss": 0.11871214956045151, "epoch": 2.1380920613742496, "step": 6410 }, { "epoch": 2.1380920613742496, "ref_ce_loss": 0.12499058246612549, "step": 6410 }, { "epoch": 2.1380920613742496, "loss": 0.4042794108390808, "step": 6410 }, { "ce_loss": 0.13534142076969147, "epoch": 2.1380920613742496, "step": 6410 }, { "distill_loss": 0.11494524031877518, "epoch": 2.1380920613742496, "step": 6410 }, { "epoch": 2.1380920613742496, "ref_ce_loss": 0.11009050160646439, "step": 6410 }, { "epoch": 2.1380920613742496, "loss": 0.4688982367515564, "step": 6410 }, { "ce_loss": 0.09592331945896149, "epoch": 2.1380920613742496, "step": 6410 }, { "distill_loss": 0.09164990484714508, "epoch": 2.1380920613742496, "step": 6410 }, { "epoch": 2.1380920613742496, "ref_ce_loss": 0.11540322005748749, "step": 6410 }, { "epoch": 2.141427618412275, "loss": 0.5349, "step": 6420 }, { "epoch": 2.141427618412275, "grad_norm": 6.115320682525635, "step": 6420 }, { "epoch": 2.141427618412275, "learning_rate": 0.00025771443368110625, "step": 6420 }, { "epoch": 2.141427618412275, "loss": 0.7359644174575806, "step": 6420 }, { "ce_loss": 0.24195481836795807, "epoch": 2.141427618412275, "step": 6420 }, { "distill_loss": 0.11840562522411346, "epoch": 2.141427618412275, "step": 6420 }, { "epoch": 2.141427618412275, "ref_ce_loss": 0.19802844524383545, "step": 6420 }, { "epoch": 2.141427618412275, "loss": 0.4115001857280731, "step": 6420 }, { "ce_loss": 0.17016026377677917, "epoch": 2.141427618412275, "step": 6420 }, { "distill_loss": 0.1165749728679657, "epoch": 2.141427618412275, "step": 6420 }, { "epoch": 2.141427618412275, "ref_ce_loss": 0.09531422704458237, "step": 6420 }, { "epoch": 2.141427618412275, "loss": 0.5578610301017761, "step": 6420 }, { "ce_loss": 0.20728155970573425, "epoch": 2.141427618412275, "step": 6420 }, { "distill_loss": 0.15174588561058044, "epoch": 2.141427618412275, "step": 6420 }, { "epoch": 2.141427618412275, "ref_ce_loss": 0.09736379981040955, "step": 6420 }, { "epoch": 2.141427618412275, "loss": 0.43680068850517273, "step": 6420 }, { "ce_loss": 0.17766030132770538, "epoch": 2.141427618412275, "step": 6420 }, { "distill_loss": 0.14595100283622742, "epoch": 2.141427618412275, "step": 6420 }, { "epoch": 2.141427618412275, "ref_ce_loss": 0.11311586946249008, "step": 6420 }, { "epoch": 2.1447631754503003, "loss": 0.5044, "step": 6430 }, { "epoch": 2.1447631754503003, "grad_norm": 3.4700722694396973, "step": 6430 }, { "epoch": 2.1447631754503003, "learning_rate": 0.0002575733641363548, "step": 6430 }, { "epoch": 2.1447631754503003, "loss": 0.40244314074516296, "step": 6430 }, { "ce_loss": 0.12530314922332764, "epoch": 2.1447631754503003, "step": 6430 }, { "distill_loss": 0.09940031170845032, "epoch": 2.1447631754503003, "step": 6430 }, { "epoch": 2.1447631754503003, "ref_ce_loss": 0.11356396228075027, "step": 6430 }, { "epoch": 2.1447631754503003, "loss": 0.5394932627677917, "step": 6430 }, { "ce_loss": 0.24598979949951172, "epoch": 2.1447631754503003, "step": 6430 }, { "distill_loss": 0.11309697479009628, "epoch": 2.1447631754503003, "step": 6430 }, { "epoch": 2.1447631754503003, "ref_ce_loss": 0.1273350715637207, "step": 6430 }, { "epoch": 2.1447631754503003, "loss": 0.5167526602745056, "step": 6430 }, { "ce_loss": 0.16347810626029968, "epoch": 2.1447631754503003, "step": 6430 }, { "distill_loss": 0.10503201186656952, "epoch": 2.1447631754503003, "step": 6430 }, { "epoch": 2.1447631754503003, "ref_ce_loss": 0.1429758071899414, "step": 6430 }, { "epoch": 2.1447631754503003, "loss": 0.6103532314300537, "step": 6430 }, { "ce_loss": 0.19520029425621033, "epoch": 2.1447631754503003, "step": 6430 }, { "distill_loss": 0.11522606760263443, "epoch": 2.1447631754503003, "step": 6430 }, { "epoch": 2.1447631754503003, "ref_ce_loss": 0.21002265810966492, "step": 6430 }, { "epoch": 2.1480987324883256, "loss": 0.5028, "step": 6440 }, { "epoch": 2.1480987324883256, "grad_norm": 2.7797205448150635, "step": 6440 }, { "epoch": 2.1480987324883256, "learning_rate": 0.0002574320984202214, "step": 6440 }, { "epoch": 2.1480987324883256, "loss": 0.352073073387146, "step": 6440 }, { "ce_loss": 0.14698846638202667, "epoch": 2.1480987324883256, "step": 6440 }, { "distill_loss": 0.08582793176174164, "epoch": 2.1480987324883256, "step": 6440 }, { "epoch": 2.1480987324883256, "ref_ce_loss": 0.11884086579084396, "step": 6440 }, { "epoch": 2.1480987324883256, "loss": 0.5487686991691589, "step": 6440 }, { "ce_loss": 0.16990971565246582, "epoch": 2.1480987324883256, "step": 6440 }, { "distill_loss": 0.107993483543396, "epoch": 2.1480987324883256, "step": 6440 }, { "epoch": 2.1480987324883256, "ref_ce_loss": 0.16219596564769745, "step": 6440 }, { "epoch": 2.1480987324883256, "loss": 0.30984169244766235, "step": 6440 }, { "ce_loss": 0.0969039648771286, "epoch": 2.1480987324883256, "step": 6440 }, { "distill_loss": 0.07845834642648697, "epoch": 2.1480987324883256, "step": 6440 }, { "epoch": 2.1480987324883256, "ref_ce_loss": 0.13372232019901276, "step": 6440 }, { "epoch": 2.1480987324883256, "loss": 0.3871748745441437, "step": 6440 }, { "ce_loss": 0.13498345017433167, "epoch": 2.1480987324883256, "step": 6440 }, { "distill_loss": 0.10390393435955048, "epoch": 2.1480987324883256, "step": 6440 }, { "epoch": 2.1480987324883256, "ref_ce_loss": 0.147451713681221, "step": 6440 }, { "epoch": 2.151434289526351, "loss": 0.5577, "step": 6450 }, { "epoch": 2.151434289526351, "grad_norm": 3.586613416671753, "step": 6450 }, { "epoch": 2.151434289526351, "learning_rate": 0.00025729063679031896, "step": 6450 }, { "epoch": 2.151434289526351, "loss": 0.8539671897888184, "step": 6450 }, { "ce_loss": 0.13899965584278107, "epoch": 2.151434289526351, "step": 6450 }, { "distill_loss": 0.12739282846450806, "epoch": 2.151434289526351, "step": 6450 }, { "epoch": 2.151434289526351, "ref_ce_loss": 0.13170544803142548, "step": 6450 }, { "epoch": 2.151434289526351, "loss": 0.43330442905426025, "step": 6450 }, { "ce_loss": 0.17675459384918213, "epoch": 2.151434289526351, "step": 6450 }, { "distill_loss": 0.1473657488822937, "epoch": 2.151434289526351, "step": 6450 }, { "epoch": 2.151434289526351, "ref_ce_loss": 0.10867439955472946, "step": 6450 }, { "epoch": 2.151434289526351, "loss": 0.34321853518486023, "step": 6450 }, { "ce_loss": 0.10612574219703674, "epoch": 2.151434289526351, "step": 6450 }, { "distill_loss": 0.1542721539735794, "epoch": 2.151434289526351, "step": 6450 }, { "epoch": 2.151434289526351, "ref_ce_loss": 0.07491093873977661, "step": 6450 }, { "epoch": 2.151434289526351, "loss": 0.876250147819519, "step": 6450 }, { "ce_loss": 0.18352952599525452, "epoch": 2.151434289526351, "step": 6450 }, { "distill_loss": 0.1817750781774521, "epoch": 2.151434289526351, "step": 6450 }, { "epoch": 2.151434289526351, "ref_ce_loss": 0.11178678274154663, "step": 6450 }, { "epoch": 2.1547698465643763, "loss": 0.5881, "step": 6460 }, { "epoch": 2.1547698465643763, "grad_norm": 4.06648063659668, "step": 6460 }, { "epoch": 2.1547698465643763, "learning_rate": 0.0002571489795046177, "step": 6460 }, { "epoch": 2.1547698465643763, "loss": 0.8367530703544617, "step": 6460 }, { "ce_loss": 0.19520042836666107, "epoch": 2.1547698465643763, "step": 6460 }, { "distill_loss": 0.16486920416355133, "epoch": 2.1547698465643763, "step": 6460 }, { "epoch": 2.1547698465643763, "ref_ce_loss": 0.09433402866125107, "step": 6460 }, { "epoch": 2.1547698465643763, "loss": 0.36075928807258606, "step": 6460 }, { "ce_loss": 0.11127705872058868, "epoch": 2.1547698465643763, "step": 6460 }, { "distill_loss": 0.12301945686340332, "epoch": 2.1547698465643763, "step": 6460 }, { "epoch": 2.1547698465643763, "ref_ce_loss": 0.12597696483135223, "step": 6460 }, { "epoch": 2.1547698465643763, "loss": 0.5136455297470093, "step": 6460 }, { "ce_loss": 0.19618333876132965, "epoch": 2.1547698465643763, "step": 6460 }, { "distill_loss": 0.12617187201976776, "epoch": 2.1547698465643763, "step": 6460 }, { "epoch": 2.1547698465643763, "ref_ce_loss": 0.1307118833065033, "step": 6460 }, { "epoch": 2.1547698465643763, "loss": 0.6558589935302734, "step": 6460 }, { "ce_loss": 0.13309955596923828, "epoch": 2.1547698465643763, "step": 6460 }, { "distill_loss": 0.12923918664455414, "epoch": 2.1547698465643763, "step": 6460 }, { "epoch": 2.1547698465643763, "ref_ce_loss": 0.10269004851579666, "step": 6460 }, { "epoch": 2.1581054036024017, "loss": 0.5978, "step": 6470 }, { "epoch": 2.1581054036024017, "grad_norm": 2.748814344406128, "step": 6470 }, { "epoch": 2.1581054036024017, "learning_rate": 0.0002570071268214447, "step": 6470 }, { "epoch": 2.1581054036024017, "loss": 0.5000072717666626, "step": 6470 }, { "ce_loss": 0.19368258118629456, "epoch": 2.1581054036024017, "step": 6470 }, { "distill_loss": 0.13927975296974182, "epoch": 2.1581054036024017, "step": 6470 }, { "epoch": 2.1581054036024017, "ref_ce_loss": 0.1153336688876152, "step": 6470 }, { "epoch": 2.1581054036024017, "loss": 0.41083213686943054, "step": 6470 }, { "ce_loss": 0.15378105640411377, "epoch": 2.1581054036024017, "step": 6470 }, { "distill_loss": 0.1380583792924881, "epoch": 2.1581054036024017, "step": 6470 }, { "epoch": 2.1581054036024017, "ref_ce_loss": 0.07984202355146408, "step": 6470 }, { "epoch": 2.1581054036024017, "loss": 0.7085814476013184, "step": 6470 }, { "ce_loss": 0.3445759415626526, "epoch": 2.1581054036024017, "step": 6470 }, { "distill_loss": 0.20446883141994476, "epoch": 2.1581054036024017, "step": 6470 }, { "epoch": 2.1581054036024017, "ref_ce_loss": 0.15822933614253998, "step": 6470 }, { "epoch": 2.1581054036024017, "loss": 0.5256810188293457, "step": 6470 }, { "ce_loss": 0.19576559960842133, "epoch": 2.1581054036024017, "step": 6470 }, { "distill_loss": 0.12762930989265442, "epoch": 2.1581054036024017, "step": 6470 }, { "epoch": 2.1581054036024017, "ref_ce_loss": 0.13251972198486328, "step": 6470 }, { "epoch": 2.161440960640427, "loss": 0.5677, "step": 6480 }, { "epoch": 2.161440960640427, "grad_norm": 4.109629154205322, "step": 6480 }, { "epoch": 2.161440960640427, "learning_rate": 0.0002568650789994832, "step": 6480 }, { "epoch": 2.161440960640427, "loss": 0.45278507471084595, "step": 6480 }, { "ce_loss": 0.13802358508110046, "epoch": 2.161440960640427, "step": 6480 }, { "distill_loss": 0.15413391590118408, "epoch": 2.161440960640427, "step": 6480 }, { "epoch": 2.161440960640427, "ref_ce_loss": 0.10151858627796173, "step": 6480 }, { "epoch": 2.161440960640427, "loss": 0.789211094379425, "step": 6480 }, { "ce_loss": 0.24831239879131317, "epoch": 2.161440960640427, "step": 6480 }, { "distill_loss": 0.1875370591878891, "epoch": 2.161440960640427, "step": 6480 }, { "epoch": 2.161440960640427, "ref_ce_loss": 0.2326797991991043, "step": 6480 }, { "epoch": 2.161440960640427, "loss": 0.4289882779121399, "step": 6480 }, { "ce_loss": 0.11800291389226913, "epoch": 2.161440960640427, "step": 6480 }, { "distill_loss": 0.1267249584197998, "epoch": 2.161440960640427, "step": 6480 }, { "epoch": 2.161440960640427, "ref_ce_loss": 0.12218296527862549, "step": 6480 }, { "epoch": 2.161440960640427, "loss": 0.3204435408115387, "step": 6480 }, { "ce_loss": 0.10140528529882431, "epoch": 2.161440960640427, "step": 6480 }, { "distill_loss": 0.1469344198703766, "epoch": 2.161440960640427, "step": 6480 }, { "epoch": 2.161440960640427, "ref_ce_loss": 0.07105684280395508, "step": 6480 }, { "epoch": 2.1647765176784524, "loss": 0.5624, "step": 6490 }, { "epoch": 2.1647765176784524, "grad_norm": 2.237226724624634, "step": 6490 }, { "epoch": 2.1647765176784524, "learning_rate": 0.0002567228362977725, "step": 6490 }, { "epoch": 2.1647765176784524, "loss": 0.5324109792709351, "step": 6490 }, { "ce_loss": 0.17774225771427155, "epoch": 2.1647765176784524, "step": 6490 }, { "distill_loss": 0.1355997622013092, "epoch": 2.1647765176784524, "step": 6490 }, { "epoch": 2.1647765176784524, "ref_ce_loss": 0.13850535452365875, "step": 6490 }, { "epoch": 2.1647765176784524, "loss": 0.3751431703567505, "step": 6490 }, { "ce_loss": 0.1006331816315651, "epoch": 2.1647765176784524, "step": 6490 }, { "distill_loss": 0.12125347554683685, "epoch": 2.1647765176784524, "step": 6490 }, { "epoch": 2.1647765176784524, "ref_ce_loss": 0.08901651203632355, "step": 6490 }, { "epoch": 2.1647765176784524, "loss": 0.4847124218940735, "step": 6490 }, { "ce_loss": 0.1677042692899704, "epoch": 2.1647765176784524, "step": 6490 }, { "distill_loss": 0.1740044504404068, "epoch": 2.1647765176784524, "step": 6490 }, { "epoch": 2.1647765176784524, "ref_ce_loss": 0.14263245463371277, "step": 6490 }, { "epoch": 2.1647765176784524, "loss": 0.8070560097694397, "step": 6490 }, { "ce_loss": 0.3739077150821686, "epoch": 2.1647765176784524, "step": 6490 }, { "distill_loss": 0.17787256836891174, "epoch": 2.1647765176784524, "step": 6490 }, { "epoch": 2.1647765176784524, "ref_ce_loss": 0.1976572871208191, "step": 6490 }, { "epoch": 2.1681120747164777, "loss": 0.5638, "step": 6500 }, { "epoch": 2.1681120747164777, "grad_norm": 3.8014607429504395, "step": 6500 }, { "epoch": 2.1681120747164777, "learning_rate": 0.00025658039897570703, "step": 6500 }, { "epoch": 2.1681120747164777, "loss": 0.4397726356983185, "step": 6500 }, { "ce_loss": 0.1840061992406845, "epoch": 2.1681120747164777, "step": 6500 }, { "distill_loss": 0.12719230353832245, "epoch": 2.1681120747164777, "step": 6500 }, { "epoch": 2.1681120747164777, "ref_ce_loss": 0.09979324042797089, "step": 6500 }, { "epoch": 2.1681120747164777, "loss": 0.6797133684158325, "step": 6500 }, { "ce_loss": 0.14498358964920044, "epoch": 2.1681120747164777, "step": 6500 }, { "distill_loss": 0.14739961922168732, "epoch": 2.1681120747164777, "step": 6500 }, { "epoch": 2.1681120747164777, "ref_ce_loss": 0.1322130262851715, "step": 6500 }, { "epoch": 2.1681120747164777, "loss": 0.8477792739868164, "step": 6500 }, { "ce_loss": 0.18177065253257751, "epoch": 2.1681120747164777, "step": 6500 }, { "distill_loss": 0.14841307699680328, "epoch": 2.1681120747164777, "step": 6500 }, { "epoch": 2.1681120747164777, "ref_ce_loss": 0.15319450199604034, "step": 6500 }, { "epoch": 2.1681120747164777, "loss": 0.4149465560913086, "step": 6500 }, { "ce_loss": 0.13384394347667694, "epoch": 2.1681120747164777, "step": 6500 }, { "distill_loss": 0.12086221575737, "epoch": 2.1681120747164777, "step": 6500 }, { "epoch": 2.1681120747164777, "ref_ce_loss": 0.11601506918668747, "step": 6500 }, { "epoch": 2.171447631754503, "loss": 0.5666, "step": 6510 }, { "epoch": 2.171447631754503, "grad_norm": 2.200901508331299, "step": 6510 }, { "epoch": 2.171447631754503, "learning_rate": 0.0002564377672930364, "step": 6510 }, { "epoch": 2.171447631754503, "loss": 0.3441053330898285, "step": 6510 }, { "ce_loss": 0.0955720841884613, "epoch": 2.171447631754503, "step": 6510 }, { "distill_loss": 0.0954127162694931, "epoch": 2.171447631754503, "step": 6510 }, { "epoch": 2.171447631754503, "ref_ce_loss": 0.0820605456829071, "step": 6510 }, { "epoch": 2.171447631754503, "loss": 0.3989987373352051, "step": 6510 }, { "ce_loss": 0.16622278094291687, "epoch": 2.171447631754503, "step": 6510 }, { "distill_loss": 0.14429330825805664, "epoch": 2.171447631754503, "step": 6510 }, { "epoch": 2.171447631754503, "ref_ce_loss": 0.08820192515850067, "step": 6510 }, { "epoch": 2.171447631754503, "loss": 0.594819962978363, "step": 6510 }, { "ce_loss": 0.2731800377368927, "epoch": 2.171447631754503, "step": 6510 }, { "distill_loss": 0.14734132587909698, "epoch": 2.171447631754503, "step": 6510 }, { "epoch": 2.171447631754503, "ref_ce_loss": 0.1282769739627838, "step": 6510 }, { "epoch": 2.171447631754503, "loss": 1.5799360275268555, "step": 6510 }, { "ce_loss": 0.2743644118309021, "epoch": 2.171447631754503, "step": 6510 }, { "distill_loss": 0.15681089460849762, "epoch": 2.171447631754503, "step": 6510 }, { "epoch": 2.171447631754503, "ref_ce_loss": 0.114822618663311, "step": 6510 }, { "epoch": 2.1747831887925284, "loss": 0.6169, "step": 6520 }, { "epoch": 2.1747831887925284, "grad_norm": 3.009552001953125, "step": 6520 }, { "epoch": 2.1747831887925284, "learning_rate": 0.00025629494150986455, "step": 6520 }, { "epoch": 2.1747831887925284, "loss": 0.5791804790496826, "step": 6520 }, { "ce_loss": 0.1254793405532837, "epoch": 2.1747831887925284, "step": 6520 }, { "distill_loss": 0.13040056824684143, "epoch": 2.1747831887925284, "step": 6520 }, { "epoch": 2.1747831887925284, "ref_ce_loss": 0.1082243099808693, "step": 6520 }, { "epoch": 2.1747831887925284, "loss": 0.34121212363243103, "step": 6520 }, { "ce_loss": 0.08749425411224365, "epoch": 2.1747831887925284, "step": 6520 }, { "distill_loss": 0.13595782220363617, "epoch": 2.1747831887925284, "step": 6520 }, { "epoch": 2.1747831887925284, "ref_ce_loss": 0.07215691357851028, "step": 6520 }, { "epoch": 2.1747831887925284, "loss": 0.35587042570114136, "step": 6520 }, { "ce_loss": 0.1334792971611023, "epoch": 2.1747831887925284, "step": 6520 }, { "distill_loss": 0.11082914471626282, "epoch": 2.1747831887925284, "step": 6520 }, { "epoch": 2.1747831887925284, "ref_ce_loss": 0.08055583387613297, "step": 6520 }, { "epoch": 2.1747831887925284, "loss": 0.40503957867622375, "step": 6520 }, { "ce_loss": 0.1451195925474167, "epoch": 2.1747831887925284, "step": 6520 }, { "distill_loss": 0.13516882061958313, "epoch": 2.1747831887925284, "step": 6520 }, { "epoch": 2.1747831887925284, "ref_ce_loss": 0.08957435935735703, "step": 6520 }, { "epoch": 2.1781187458305538, "loss": 0.528, "step": 6530 }, { "epoch": 2.1781187458305538, "grad_norm": 4.192197799682617, "step": 6530 }, { "epoch": 2.1781187458305538, "learning_rate": 0.00025615192188664925, "step": 6530 }, { "epoch": 2.1781187458305538, "loss": 0.45684969425201416, "step": 6530 }, { "ce_loss": 0.17055489122867584, "epoch": 2.1781187458305538, "step": 6530 }, { "distill_loss": 0.1565455198287964, "epoch": 2.1781187458305538, "step": 6530 }, { "epoch": 2.1781187458305538, "ref_ce_loss": 0.05911305919289589, "step": 6530 }, { "epoch": 2.1781187458305538, "loss": 0.32195115089416504, "step": 6530 }, { "ce_loss": 0.06535065174102783, "epoch": 2.1781187458305538, "step": 6530 }, { "distill_loss": 0.11212996393442154, "epoch": 2.1781187458305538, "step": 6530 }, { "epoch": 2.1781187458305538, "ref_ce_loss": 0.09981215000152588, "step": 6530 }, { "epoch": 2.1781187458305538, "loss": 0.5088362693786621, "step": 6530 }, { "ce_loss": 0.20849275588989258, "epoch": 2.1781187458305538, "step": 6530 }, { "distill_loss": 0.16551473736763, "epoch": 2.1781187458305538, "step": 6530 }, { "epoch": 2.1781187458305538, "ref_ce_loss": 0.13474003970623016, "step": 6530 }, { "epoch": 2.1781187458305538, "loss": 0.7871987819671631, "step": 6530 }, { "ce_loss": 0.17473362386226654, "epoch": 2.1781187458305538, "step": 6530 }, { "distill_loss": 0.1323826164007187, "epoch": 2.1781187458305538, "step": 6530 }, { "epoch": 2.1781187458305538, "ref_ce_loss": 0.13147905468940735, "step": 6530 }, { "epoch": 2.181454302868579, "loss": 0.5635, "step": 6540 }, { "epoch": 2.181454302868579, "grad_norm": 2.9585373401641846, "step": 6540 }, { "epoch": 2.181454302868579, "learning_rate": 0.000256008708684202, "step": 6540 }, { "epoch": 2.181454302868579, "loss": 0.5356453657150269, "step": 6540 }, { "ce_loss": 0.187409907579422, "epoch": 2.181454302868579, "step": 6540 }, { "distill_loss": 0.14706161618232727, "epoch": 2.181454302868579, "step": 6540 }, { "epoch": 2.181454302868579, "ref_ce_loss": 0.13905051350593567, "step": 6540 }, { "epoch": 2.181454302868579, "loss": 0.4628433585166931, "step": 6540 }, { "ce_loss": 0.17631632089614868, "epoch": 2.181454302868579, "step": 6540 }, { "distill_loss": 0.08722099661827087, "epoch": 2.181454302868579, "step": 6540 }, { "epoch": 2.181454302868579, "ref_ce_loss": 0.14837141335010529, "step": 6540 }, { "epoch": 2.181454302868579, "loss": 0.4328906834125519, "step": 6540 }, { "ce_loss": 0.1406063288450241, "epoch": 2.181454302868579, "step": 6540 }, { "distill_loss": 0.10556124150753021, "epoch": 2.181454302868579, "step": 6540 }, { "epoch": 2.181454302868579, "ref_ce_loss": 0.08157248049974442, "step": 6540 }, { "epoch": 2.181454302868579, "loss": 0.4424014091491699, "step": 6540 }, { "ce_loss": 0.11750722676515579, "epoch": 2.181454302868579, "step": 6540 }, { "distill_loss": 0.12028899788856506, "epoch": 2.181454302868579, "step": 6540 }, { "epoch": 2.181454302868579, "ref_ce_loss": 0.09090970456600189, "step": 6540 }, { "epoch": 2.1847898599066045, "loss": 0.531, "step": 6550 }, { "epoch": 2.1847898599066045, "grad_norm": 2.208059310913086, "step": 6550 }, { "epoch": 2.1847898599066045, "learning_rate": 0.00025586530216368706, "step": 6550 }, { "epoch": 2.1847898599066045, "loss": 0.49402064085006714, "step": 6550 }, { "ce_loss": 0.18181763589382172, "epoch": 2.1847898599066045, "step": 6550 }, { "distill_loss": 0.10426479578018188, "epoch": 2.1847898599066045, "step": 6550 }, { "epoch": 2.1847898599066045, "ref_ce_loss": 0.1457383632659912, "step": 6550 }, { "epoch": 2.1847898599066045, "loss": 0.6502156257629395, "step": 6550 }, { "ce_loss": 0.1841825395822525, "epoch": 2.1847898599066045, "step": 6550 }, { "distill_loss": 0.1110747829079628, "epoch": 2.1847898599066045, "step": 6550 }, { "epoch": 2.1847898599066045, "ref_ce_loss": 0.09839002788066864, "step": 6550 }, { "epoch": 2.1847898599066045, "loss": 0.5076692700386047, "step": 6550 }, { "ce_loss": 0.1509997844696045, "epoch": 2.1847898599066045, "step": 6550 }, { "distill_loss": 0.1115167960524559, "epoch": 2.1847898599066045, "step": 6550 }, { "epoch": 2.1847898599066045, "ref_ce_loss": 0.14280161261558533, "step": 6550 }, { "epoch": 2.1847898599066045, "loss": 0.4125179052352905, "step": 6550 }, { "ce_loss": 0.14968010783195496, "epoch": 2.1847898599066045, "step": 6550 }, { "distill_loss": 0.08732464909553528, "epoch": 2.1847898599066045, "step": 6550 }, { "epoch": 2.1847898599066045, "ref_ce_loss": 0.11727482080459595, "step": 6550 }, { "epoch": 2.18812541694463, "loss": 0.5558, "step": 6560 }, { "epoch": 2.18812541694463, "grad_norm": 2.251790761947632, "step": 6560 }, { "epoch": 2.18812541694463, "learning_rate": 0.00025572170258662146, "step": 6560 }, { "epoch": 2.18812541694463, "loss": 0.628159761428833, "step": 6560 }, { "ce_loss": 0.22396205365657806, "epoch": 2.18812541694463, "step": 6560 }, { "distill_loss": 0.15010559558868408, "epoch": 2.18812541694463, "step": 6560 }, { "epoch": 2.18812541694463, "ref_ce_loss": 0.1103786751627922, "step": 6560 }, { "epoch": 2.18812541694463, "loss": 0.673648476600647, "step": 6560 }, { "ce_loss": 0.1863107532262802, "epoch": 2.18812541694463, "step": 6560 }, { "distill_loss": 0.15502431988716125, "epoch": 2.18812541694463, "step": 6560 }, { "epoch": 2.18812541694463, "ref_ce_loss": 0.16920864582061768, "step": 6560 }, { "epoch": 2.18812541694463, "loss": 0.6017754077911377, "step": 6560 }, { "ce_loss": 0.19705528020858765, "epoch": 2.18812541694463, "step": 6560 }, { "distill_loss": 0.15663552284240723, "epoch": 2.18812541694463, "step": 6560 }, { "epoch": 2.18812541694463, "ref_ce_loss": 0.1611385941505432, "step": 6560 }, { "epoch": 2.18812541694463, "loss": 0.5017340183258057, "step": 6560 }, { "ce_loss": 0.20633363723754883, "epoch": 2.18812541694463, "step": 6560 }, { "distill_loss": 0.13032835721969604, "epoch": 2.18812541694463, "step": 6560 }, { "epoch": 2.18812541694463, "ref_ce_loss": 0.12185481935739517, "step": 6560 }, { "epoch": 2.191460973982655, "loss": 0.6456, "step": 6570 }, { "epoch": 2.191460973982655, "grad_norm": 5.231355667114258, "step": 6570 }, { "epoch": 2.191460973982655, "learning_rate": 0.00025557791021487417, "step": 6570 }, { "epoch": 2.191460973982655, "loss": 0.6945635080337524, "step": 6570 }, { "ce_loss": 0.13277281820774078, "epoch": 2.191460973982655, "step": 6570 }, { "distill_loss": 0.12004784494638443, "epoch": 2.191460973982655, "step": 6570 }, { "epoch": 2.191460973982655, "ref_ce_loss": 0.13293145596981049, "step": 6570 }, { "epoch": 2.191460973982655, "loss": 0.42795026302337646, "step": 6570 }, { "ce_loss": 0.11150016635656357, "epoch": 2.191460973982655, "step": 6570 }, { "distill_loss": 0.14841175079345703, "epoch": 2.191460973982655, "step": 6570 }, { "epoch": 2.191460973982655, "ref_ce_loss": 0.12076891213655472, "step": 6570 }, { "epoch": 2.191460973982655, "loss": 0.5713083744049072, "step": 6570 }, { "ce_loss": 0.1848202347755432, "epoch": 2.191460973982655, "step": 6570 }, { "distill_loss": 0.17503327131271362, "epoch": 2.191460973982655, "step": 6570 }, { "epoch": 2.191460973982655, "ref_ce_loss": 0.13746482133865356, "step": 6570 }, { "epoch": 2.191460973982655, "loss": 0.4127688407897949, "step": 6570 }, { "ce_loss": 0.09293724596500397, "epoch": 2.191460973982655, "step": 6570 }, { "distill_loss": 0.09941837191581726, "epoch": 2.191460973982655, "step": 6570 }, { "epoch": 2.191460973982655, "ref_ce_loss": 0.08287370204925537, "step": 6570 }, { "epoch": 2.1947965310206805, "loss": 0.575, "step": 6580 }, { "epoch": 2.1947965310206805, "grad_norm": 2.4874107837677, "step": 6580 }, { "epoch": 2.1947965310206805, "learning_rate": 0.0002554339253106657, "step": 6580 }, { "epoch": 2.1947965310206805, "loss": 0.46032392978668213, "step": 6580 }, { "ce_loss": 0.09199360013008118, "epoch": 2.1947965310206805, "step": 6580 }, { "distill_loss": 0.1110043004155159, "epoch": 2.1947965310206805, "step": 6580 }, { "epoch": 2.1947965310206805, "ref_ce_loss": 0.12053350359201431, "step": 6580 }, { "epoch": 2.1947965310206805, "loss": 0.2444971650838852, "step": 6580 }, { "ce_loss": 0.07341790199279785, "epoch": 2.1947965310206805, "step": 6580 }, { "distill_loss": 0.10386674851179123, "epoch": 2.1947965310206805, "step": 6580 }, { "epoch": 2.1947965310206805, "ref_ce_loss": 0.06677841395139694, "step": 6580 }, { "epoch": 2.1947965310206805, "loss": 0.6744773387908936, "step": 6580 }, { "ce_loss": 0.19307246804237366, "epoch": 2.1947965310206805, "step": 6580 }, { "distill_loss": 0.18344935774803162, "epoch": 2.1947965310206805, "step": 6580 }, { "epoch": 2.1947965310206805, "ref_ce_loss": 0.1400938779115677, "step": 6580 }, { "epoch": 2.1947965310206805, "loss": 0.3913784325122833, "step": 6580 }, { "ce_loss": 0.08127550780773163, "epoch": 2.1947965310206805, "step": 6580 }, { "distill_loss": 0.10964338481426239, "epoch": 2.1947965310206805, "step": 6580 }, { "epoch": 2.1947965310206805, "ref_ce_loss": 0.12807399034500122, "step": 6580 }, { "epoch": 2.198132088058706, "loss": 0.5882, "step": 6590 }, { "epoch": 2.198132088058706, "grad_norm": 2.5270981788635254, "step": 6590 }, { "epoch": 2.198132088058706, "learning_rate": 0.00025528974813656785, "step": 6590 }, { "epoch": 2.198132088058706, "loss": 0.4461905062198639, "step": 6590 }, { "ce_loss": 0.18007469177246094, "epoch": 2.198132088058706, "step": 6590 }, { "distill_loss": 0.0942193865776062, "epoch": 2.198132088058706, "step": 6590 }, { "epoch": 2.198132088058706, "ref_ce_loss": 0.17168787121772766, "step": 6590 }, { "epoch": 2.198132088058706, "loss": 0.45658642053604126, "step": 6590 }, { "ce_loss": 0.16922041773796082, "epoch": 2.198132088058706, "step": 6590 }, { "distill_loss": 0.1462121605873108, "epoch": 2.198132088058706, "step": 6590 }, { "epoch": 2.198132088058706, "ref_ce_loss": 0.14055679738521576, "step": 6590 }, { "epoch": 2.198132088058706, "loss": 0.6269693374633789, "step": 6590 }, { "ce_loss": 0.14432154595851898, "epoch": 2.198132088058706, "step": 6590 }, { "distill_loss": 0.11544007807970047, "epoch": 2.198132088058706, "step": 6590 }, { "epoch": 2.198132088058706, "ref_ce_loss": 0.09846153855323792, "step": 6590 }, { "epoch": 2.198132088058706, "loss": 0.41132834553718567, "step": 6590 }, { "ce_loss": 0.13266809284687042, "epoch": 2.198132088058706, "step": 6590 }, { "distill_loss": 0.11859145760536194, "epoch": 2.198132088058706, "step": 6590 }, { "epoch": 2.198132088058706, "ref_ce_loss": 0.1222081333398819, "step": 6590 }, { "epoch": 2.201467645096731, "loss": 0.5259, "step": 6600 }, { "epoch": 2.201467645096731, "grad_norm": 2.9365334510803223, "step": 6600 }, { "epoch": 2.201467645096731, "learning_rate": 0.00025514537895550274, "step": 6600 }, { "epoch": 2.201467645096731, "loss": 0.26942503452301025, "step": 6600 }, { "ce_loss": 0.0721149668097496, "epoch": 2.201467645096731, "step": 6600 }, { "distill_loss": 0.09688413888216019, "epoch": 2.201467645096731, "step": 6600 }, { "epoch": 2.201467645096731, "ref_ce_loss": 0.1003360003232956, "step": 6600 }, { "epoch": 2.201467645096731, "loss": 0.39754799008369446, "step": 6600 }, { "ce_loss": 0.1665194183588028, "epoch": 2.201467645096731, "step": 6600 }, { "distill_loss": 0.1093418300151825, "epoch": 2.201467645096731, "step": 6600 }, { "epoch": 2.201467645096731, "ref_ce_loss": 0.09641645103693008, "step": 6600 }, { "epoch": 2.201467645096731, "loss": 1.559045433998108, "step": 6600 }, { "ce_loss": 0.22894124686717987, "epoch": 2.201467645096731, "step": 6600 }, { "distill_loss": 0.16813145577907562, "epoch": 2.201467645096731, "step": 6600 }, { "epoch": 2.201467645096731, "ref_ce_loss": 0.11116345226764679, "step": 6600 }, { "epoch": 2.201467645096731, "loss": 0.2761334478855133, "step": 6600 }, { "ce_loss": 0.06958366185426712, "epoch": 2.201467645096731, "step": 6600 }, { "distill_loss": 0.10546497255563736, "epoch": 2.201467645096731, "step": 6600 }, { "epoch": 2.201467645096731, "ref_ce_loss": 0.1002037301659584, "step": 6600 }, { "epoch": 2.2048032021347566, "loss": 0.5541, "step": 6610 }, { "epoch": 2.2048032021347566, "grad_norm": 2.531602621078491, "step": 6610 }, { "epoch": 2.2048032021347566, "learning_rate": 0.0002550008180307429, "step": 6610 }, { "epoch": 2.2048032021347566, "loss": 0.7316204309463501, "step": 6610 }, { "ce_loss": 0.18090717494487762, "epoch": 2.2048032021347566, "step": 6610 }, { "distill_loss": 0.10258594155311584, "epoch": 2.2048032021347566, "step": 6610 }, { "epoch": 2.2048032021347566, "ref_ce_loss": 0.12114837020635605, "step": 6610 }, { "epoch": 2.2048032021347566, "loss": 0.5733076930046082, "step": 6610 }, { "ce_loss": 0.26956093311309814, "epoch": 2.2048032021347566, "step": 6610 }, { "distill_loss": 0.10860621929168701, "epoch": 2.2048032021347566, "step": 6610 }, { "epoch": 2.2048032021347566, "ref_ce_loss": 0.13310791552066803, "step": 6610 }, { "epoch": 2.2048032021347566, "loss": 0.40223658084869385, "step": 6610 }, { "ce_loss": 0.16505920886993408, "epoch": 2.2048032021347566, "step": 6610 }, { "distill_loss": 0.10595328360795975, "epoch": 2.2048032021347566, "step": 6610 }, { "epoch": 2.2048032021347566, "ref_ce_loss": 0.09320106357336044, "step": 6610 }, { "epoch": 2.2048032021347566, "loss": 0.36049261689186096, "step": 6610 }, { "ce_loss": 0.11396670341491699, "epoch": 2.2048032021347566, "step": 6610 }, { "distill_loss": 0.10318335890769958, "epoch": 2.2048032021347566, "step": 6610 }, { "epoch": 2.2048032021347566, "ref_ce_loss": 0.09993364661931992, "step": 6610 }, { "epoch": 2.208138759172782, "loss": 0.512, "step": 6620 }, { "epoch": 2.208138759172782, "grad_norm": 4.26342248916626, "step": 6620 }, { "epoch": 2.208138759172782, "learning_rate": 0.0002548560656259104, "step": 6620 }, { "epoch": 2.208138759172782, "loss": 0.47284626960754395, "step": 6620 }, { "ce_loss": 0.2442830353975296, "epoch": 2.208138759172782, "step": 6620 }, { "distill_loss": 0.11992715299129486, "epoch": 2.208138759172782, "step": 6620 }, { "epoch": 2.208138759172782, "ref_ce_loss": 0.07802657037973404, "step": 6620 }, { "epoch": 2.208138759172782, "loss": 0.7858883142471313, "step": 6620 }, { "ce_loss": 0.15104246139526367, "epoch": 2.208138759172782, "step": 6620 }, { "distill_loss": 0.09752034395933151, "epoch": 2.208138759172782, "step": 6620 }, { "epoch": 2.208138759172782, "ref_ce_loss": 0.1495540291070938, "step": 6620 }, { "epoch": 2.208138759172782, "loss": 0.6424992680549622, "step": 6620 }, { "ce_loss": 0.21502114832401276, "epoch": 2.208138759172782, "step": 6620 }, { "distill_loss": 0.12338387966156006, "epoch": 2.208138759172782, "step": 6620 }, { "epoch": 2.208138759172782, "ref_ce_loss": 0.14934112131595612, "step": 6620 }, { "epoch": 2.208138759172782, "loss": 0.5512647032737732, "step": 6620 }, { "ce_loss": 0.2337639182806015, "epoch": 2.208138759172782, "step": 6620 }, { "distill_loss": 0.11226874589920044, "epoch": 2.208138759172782, "step": 6620 }, { "epoch": 2.208138759172782, "ref_ce_loss": 0.1710253655910492, "step": 6620 }, { "epoch": 2.2114743162108073, "loss": 0.5531, "step": 6630 }, { "epoch": 2.2114743162108073, "grad_norm": 1.902422308921814, "step": 6630 }, { "epoch": 2.2114743162108073, "learning_rate": 0.0002547111220049765, "step": 6630 }, { "epoch": 2.2114743162108073, "loss": 0.4596637785434723, "step": 6630 }, { "ce_loss": 0.18465951085090637, "epoch": 2.2114743162108073, "step": 6630 }, { "distill_loss": 0.12532518804073334, "epoch": 2.2114743162108073, "step": 6630 }, { "epoch": 2.2114743162108073, "ref_ce_loss": 0.09735433012247086, "step": 6630 }, { "epoch": 2.2114743162108073, "loss": 0.6123343110084534, "step": 6630 }, { "ce_loss": 0.1756560355424881, "epoch": 2.2114743162108073, "step": 6630 }, { "distill_loss": 0.15115751326084137, "epoch": 2.2114743162108073, "step": 6630 }, { "epoch": 2.2114743162108073, "ref_ce_loss": 0.10715075582265854, "step": 6630 }, { "epoch": 2.2114743162108073, "loss": 0.44199562072753906, "step": 6630 }, { "ce_loss": 0.16095171868801117, "epoch": 2.2114743162108073, "step": 6630 }, { "distill_loss": 0.07144710421562195, "epoch": 2.2114743162108073, "step": 6630 }, { "epoch": 2.2114743162108073, "ref_ce_loss": 0.1259879767894745, "step": 6630 }, { "epoch": 2.2114743162108073, "loss": 0.3934045433998108, "step": 6630 }, { "ce_loss": 0.1257804036140442, "epoch": 2.2114743162108073, "step": 6630 }, { "distill_loss": 0.10377589613199234, "epoch": 2.2114743162108073, "step": 6630 }, { "epoch": 2.2114743162108073, "ref_ce_loss": 0.09237315505743027, "step": 6630 }, { "epoch": 2.2148098732488326, "loss": 0.5225, "step": 6640 }, { "epoch": 2.2148098732488326, "grad_norm": 2.804863214492798, "step": 6640 }, { "epoch": 2.2148098732488326, "learning_rate": 0.00025456598743226134, "step": 6640 }, { "epoch": 2.2148098732488326, "loss": 0.5671036243438721, "step": 6640 }, { "ce_loss": 0.22472530603408813, "epoch": 2.2148098732488326, "step": 6640 }, { "distill_loss": 0.146651491522789, "epoch": 2.2148098732488326, "step": 6640 }, { "epoch": 2.2148098732488326, "ref_ce_loss": 0.12498027831315994, "step": 6640 }, { "epoch": 2.2148098732488326, "loss": 0.51534503698349, "step": 6640 }, { "ce_loss": 0.21721240878105164, "epoch": 2.2148098732488326, "step": 6640 }, { "distill_loss": 0.14261776208877563, "epoch": 2.2148098732488326, "step": 6640 }, { "epoch": 2.2148098732488326, "ref_ce_loss": 0.1554638296365738, "step": 6640 }, { "epoch": 2.2148098732488326, "loss": 0.625286877155304, "step": 6640 }, { "ce_loss": 0.17786265909671783, "epoch": 2.2148098732488326, "step": 6640 }, { "distill_loss": 0.16213417053222656, "epoch": 2.2148098732488326, "step": 6640 }, { "epoch": 2.2148098732488326, "ref_ce_loss": 0.1285240203142166, "step": 6640 }, { "epoch": 2.2148098732488326, "loss": 0.7410913109779358, "step": 6640 }, { "ce_loss": 0.14669311046600342, "epoch": 2.2148098732488326, "step": 6640 }, { "distill_loss": 0.12467852979898453, "epoch": 2.2148098732488326, "step": 6640 }, { "epoch": 2.2148098732488326, "ref_ce_loss": 0.12662538886070251, "step": 6640 }, { "epoch": 2.218145430286858, "loss": 0.5819, "step": 6650 }, { "epoch": 2.218145430286858, "grad_norm": 2.284592390060425, "step": 6650 }, { "epoch": 2.218145430286858, "learning_rate": 0.0002544206621724329, "step": 6650 }, { "epoch": 2.218145430286858, "loss": 0.7698065042495728, "step": 6650 }, { "ce_loss": 0.36121097207069397, "epoch": 2.218145430286858, "step": 6650 }, { "distill_loss": 0.19637200236320496, "epoch": 2.218145430286858, "step": 6650 }, { "epoch": 2.218145430286858, "ref_ce_loss": 0.15686072409152985, "step": 6650 }, { "epoch": 2.218145430286858, "loss": 0.2823468744754791, "step": 6650 }, { "ce_loss": 0.08524331450462341, "epoch": 2.218145430286858, "step": 6650 }, { "distill_loss": 0.1248115599155426, "epoch": 2.218145430286858, "step": 6650 }, { "epoch": 2.218145430286858, "ref_ce_loss": 0.0721224993467331, "step": 6650 }, { "epoch": 2.218145430286858, "loss": 0.8946218490600586, "step": 6650 }, { "ce_loss": 0.17718909680843353, "epoch": 2.218145430286858, "step": 6650 }, { "distill_loss": 0.14615222811698914, "epoch": 2.218145430286858, "step": 6650 }, { "epoch": 2.218145430286858, "ref_ce_loss": 0.12515729665756226, "step": 6650 }, { "epoch": 2.218145430286858, "loss": 0.40610453486442566, "step": 6650 }, { "ce_loss": 0.1060795858502388, "epoch": 2.218145430286858, "step": 6650 }, { "distill_loss": 0.17362916469573975, "epoch": 2.218145430286858, "step": 6650 }, { "epoch": 2.218145430286858, "ref_ce_loss": 0.08574973046779633, "step": 6650 }, { "epoch": 2.2214809873248833, "loss": 0.5958, "step": 6660 }, { "epoch": 2.2214809873248833, "grad_norm": 2.7552988529205322, "step": 6660 }, { "epoch": 2.2214809873248833, "learning_rate": 0.0002542751464905073, "step": 6660 }, { "epoch": 2.2214809873248833, "loss": 0.6930282115936279, "step": 6660 }, { "ce_loss": 0.27699872851371765, "epoch": 2.2214809873248833, "step": 6660 }, { "distill_loss": 0.18802866339683533, "epoch": 2.2214809873248833, "step": 6660 }, { "epoch": 2.2214809873248833, "ref_ce_loss": 0.13296718895435333, "step": 6660 }, { "epoch": 2.2214809873248833, "loss": 0.5645378828048706, "step": 6660 }, { "ce_loss": 0.15500415861606598, "epoch": 2.2214809873248833, "step": 6660 }, { "distill_loss": 0.1263539344072342, "epoch": 2.2214809873248833, "step": 6660 }, { "epoch": 2.2214809873248833, "ref_ce_loss": 0.13144756853580475, "step": 6660 }, { "epoch": 2.2214809873248833, "loss": 0.4319797456264496, "step": 6660 }, { "ce_loss": 0.15598873794078827, "epoch": 2.2214809873248833, "step": 6660 }, { "distill_loss": 0.1293891817331314, "epoch": 2.2214809873248833, "step": 6660 }, { "epoch": 2.2214809873248833, "ref_ce_loss": 0.14640480279922485, "step": 6660 }, { "epoch": 2.2214809873248833, "loss": 0.3450653553009033, "step": 6660 }, { "ce_loss": 0.086029551923275, "epoch": 2.2214809873248833, "step": 6660 }, { "distill_loss": 0.1262974590063095, "epoch": 2.2214809873248833, "step": 6660 }, { "epoch": 2.2214809873248833, "ref_ce_loss": 0.11724057048559189, "step": 6660 }, { "epoch": 2.2248165443629087, "loss": 0.578, "step": 6670 }, { "epoch": 2.2248165443629087, "grad_norm": 2.0899698734283447, "step": 6670 }, { "epoch": 2.2248165443629087, "learning_rate": 0.0002541294406518477, "step": 6670 }, { "epoch": 2.2248165443629087, "loss": 0.519721508026123, "step": 6670 }, { "ce_loss": 0.20661495625972748, "epoch": 2.2248165443629087, "step": 6670 }, { "distill_loss": 0.11647433042526245, "epoch": 2.2248165443629087, "step": 6670 }, { "epoch": 2.2248165443629087, "ref_ce_loss": 0.14371584355831146, "step": 6670 }, { "epoch": 2.2248165443629087, "loss": 0.41748136281967163, "step": 6670 }, { "ce_loss": 0.0835602805018425, "epoch": 2.2248165443629087, "step": 6670 }, { "distill_loss": 0.11514532566070557, "epoch": 2.2248165443629087, "step": 6670 }, { "epoch": 2.2248165443629087, "ref_ce_loss": 0.08703186362981796, "step": 6670 }, { "epoch": 2.2248165443629087, "loss": 0.7007561922073364, "step": 6670 }, { "ce_loss": 0.17401117086410522, "epoch": 2.2248165443629087, "step": 6670 }, { "distill_loss": 0.16014891862869263, "epoch": 2.2248165443629087, "step": 6670 }, { "epoch": 2.2248165443629087, "ref_ce_loss": 0.11379396170377731, "step": 6670 }, { "epoch": 2.2248165443629087, "loss": 0.8025463819503784, "step": 6670 }, { "ce_loss": 0.23345544934272766, "epoch": 2.2248165443629087, "step": 6670 }, { "distill_loss": 0.15384605526924133, "epoch": 2.2248165443629087, "step": 6670 }, { "epoch": 2.2248165443629087, "ref_ce_loss": 0.1933191865682602, "step": 6670 }, { "epoch": 2.228152101400934, "loss": 0.5479, "step": 6680 }, { "epoch": 2.228152101400934, "grad_norm": 2.624807357788086, "step": 6680 }, { "epoch": 2.228152101400934, "learning_rate": 0.0002539835449221641, "step": 6680 }, { "epoch": 2.228152101400934, "loss": 0.8735204339027405, "step": 6680 }, { "ce_loss": 0.17503662407398224, "epoch": 2.228152101400934, "step": 6680 }, { "distill_loss": 0.12262516468763351, "epoch": 2.228152101400934, "step": 6680 }, { "epoch": 2.228152101400934, "ref_ce_loss": 0.15919841825962067, "step": 6680 }, { "epoch": 2.228152101400934, "loss": 0.5863019227981567, "step": 6680 }, { "ce_loss": 0.12611548602581024, "epoch": 2.228152101400934, "step": 6680 }, { "distill_loss": 0.10386236757040024, "epoch": 2.228152101400934, "step": 6680 }, { "epoch": 2.228152101400934, "ref_ce_loss": 0.1623413860797882, "step": 6680 }, { "epoch": 2.228152101400934, "loss": 0.8576542139053345, "step": 6680 }, { "ce_loss": 0.15461662411689758, "epoch": 2.228152101400934, "step": 6680 }, { "distill_loss": 0.13622869551181793, "epoch": 2.228152101400934, "step": 6680 }, { "epoch": 2.228152101400934, "ref_ce_loss": 0.09053165465593338, "step": 6680 }, { "epoch": 2.228152101400934, "loss": 0.3839050531387329, "step": 6680 }, { "ce_loss": 0.17515422403812408, "epoch": 2.228152101400934, "step": 6680 }, { "distill_loss": 0.12041886150836945, "epoch": 2.228152101400934, "step": 6680 }, { "epoch": 2.228152101400934, "ref_ce_loss": 0.0845176950097084, "step": 6680 }, { "epoch": 2.2314876584389594, "loss": 0.5373, "step": 6690 }, { "epoch": 2.2314876584389594, "grad_norm": 3.100541591644287, "step": 6690 }, { "epoch": 2.2314876584389594, "learning_rate": 0.0002538374595675126, "step": 6690 }, { "epoch": 2.2314876584389594, "loss": 0.34606313705444336, "step": 6690 }, { "ce_loss": 0.09983520209789276, "epoch": 2.2314876584389594, "step": 6690 }, { "distill_loss": 0.09398765861988068, "epoch": 2.2314876584389594, "step": 6690 }, { "epoch": 2.2314876584389594, "ref_ce_loss": 0.10421671718358994, "step": 6690 }, { "epoch": 2.2314876584389594, "loss": 0.46764615178108215, "step": 6690 }, { "ce_loss": 0.10423019528388977, "epoch": 2.2314876584389594, "step": 6690 }, { "distill_loss": 0.11818268150091171, "epoch": 2.2314876584389594, "step": 6690 }, { "epoch": 2.2314876584389594, "ref_ce_loss": 0.12623147666454315, "step": 6690 }, { "epoch": 2.2314876584389594, "loss": 0.44064533710479736, "step": 6690 }, { "ce_loss": 0.13126851618289948, "epoch": 2.2314876584389594, "step": 6690 }, { "distill_loss": 0.10611870884895325, "epoch": 2.2314876584389594, "step": 6690 }, { "epoch": 2.2314876584389594, "ref_ce_loss": 0.13785810768604279, "step": 6690 }, { "epoch": 2.2314876584389594, "loss": 0.6716941595077515, "step": 6690 }, { "ce_loss": 0.1662738025188446, "epoch": 2.2314876584389594, "step": 6690 }, { "distill_loss": 0.08780381828546524, "epoch": 2.2314876584389594, "step": 6690 }, { "epoch": 2.2314876584389594, "ref_ce_loss": 0.12413392215967178, "step": 6690 }, { "epoch": 2.2348232154769847, "loss": 0.5717, "step": 6700 }, { "epoch": 2.2348232154769847, "grad_norm": 2.7917656898498535, "step": 6700 }, { "epoch": 2.2348232154769847, "learning_rate": 0.00025369118485429545, "step": 6700 }, { "epoch": 2.2348232154769847, "loss": 0.6208775639533997, "step": 6700 }, { "ce_loss": 0.3293250501155853, "epoch": 2.2348232154769847, "step": 6700 }, { "distill_loss": 0.1531747728586197, "epoch": 2.2348232154769847, "step": 6700 }, { "epoch": 2.2348232154769847, "ref_ce_loss": 0.13826331496238708, "step": 6700 }, { "epoch": 2.2348232154769847, "loss": 0.8903650045394897, "step": 6700 }, { "ce_loss": 0.2016555815935135, "epoch": 2.2348232154769847, "step": 6700 }, { "distill_loss": 0.10579682141542435, "epoch": 2.2348232154769847, "step": 6700 }, { "epoch": 2.2348232154769847, "ref_ce_loss": 0.14238031208515167, "step": 6700 }, { "epoch": 2.2348232154769847, "loss": 0.45686089992523193, "step": 6700 }, { "ce_loss": 0.197709321975708, "epoch": 2.2348232154769847, "step": 6700 }, { "distill_loss": 0.11980358511209488, "epoch": 2.2348232154769847, "step": 6700 }, { "epoch": 2.2348232154769847, "ref_ce_loss": 0.13925841450691223, "step": 6700 }, { "epoch": 2.2348232154769847, "loss": 0.4481344521045685, "step": 6700 }, { "ce_loss": 0.2102845311164856, "epoch": 2.2348232154769847, "step": 6700 }, { "distill_loss": 0.11122799664735794, "epoch": 2.2348232154769847, "step": 6700 }, { "epoch": 2.2348232154769847, "ref_ce_loss": 0.1259424388408661, "step": 6700 }, { "epoch": 2.23815877251501, "loss": 0.518, "step": 6710 }, { "epoch": 2.23815877251501, "grad_norm": 2.218740463256836, "step": 6710 }, { "epoch": 2.23815877251501, "learning_rate": 0.00025354472104926, "step": 6710 }, { "epoch": 2.23815877251501, "loss": 0.3201616108417511, "step": 6710 }, { "ce_loss": 0.0903606191277504, "epoch": 2.23815877251501, "step": 6710 }, { "distill_loss": 0.098169706761837, "epoch": 2.23815877251501, "step": 6710 }, { "epoch": 2.23815877251501, "ref_ce_loss": 0.09702183306217194, "step": 6710 }, { "epoch": 2.23815877251501, "loss": 0.43331289291381836, "step": 6710 }, { "ce_loss": 0.14315032958984375, "epoch": 2.23815877251501, "step": 6710 }, { "distill_loss": 0.11144326627254486, "epoch": 2.23815877251501, "step": 6710 }, { "epoch": 2.23815877251501, "ref_ce_loss": 0.1140560731291771, "step": 6710 }, { "epoch": 2.23815877251501, "loss": 0.6270884871482849, "step": 6710 }, { "ce_loss": 0.16704252362251282, "epoch": 2.23815877251501, "step": 6710 }, { "distill_loss": 0.1067693904042244, "epoch": 2.23815877251501, "step": 6710 }, { "epoch": 2.23815877251501, "ref_ce_loss": 0.101380854845047, "step": 6710 }, { "epoch": 2.23815877251501, "loss": 0.5825164318084717, "step": 6710 }, { "ce_loss": 0.1660555601119995, "epoch": 2.23815877251501, "step": 6710 }, { "distill_loss": 0.11673898249864578, "epoch": 2.23815877251501, "step": 6710 }, { "epoch": 2.23815877251501, "ref_ce_loss": 0.10022629052400589, "step": 6710 }, { "epoch": 2.2414943295530354, "loss": 0.5457, "step": 6720 }, { "epoch": 2.2414943295530354, "grad_norm": 3.044771909713745, "step": 6720 }, { "epoch": 2.2414943295530354, "learning_rate": 0.00025339806841949837, "step": 6720 }, { "epoch": 2.2414943295530354, "loss": 1.189454197883606, "step": 6720 }, { "ce_loss": 0.1252426952123642, "epoch": 2.2414943295530354, "step": 6720 }, { "distill_loss": 0.0986853614449501, "epoch": 2.2414943295530354, "step": 6720 }, { "epoch": 2.2414943295530354, "ref_ce_loss": 0.09285452961921692, "step": 6720 }, { "epoch": 2.2414943295530354, "loss": 0.7065540552139282, "step": 6720 }, { "ce_loss": 0.16824042797088623, "epoch": 2.2414943295530354, "step": 6720 }, { "distill_loss": 0.12243731319904327, "epoch": 2.2414943295530354, "step": 6720 }, { "epoch": 2.2414943295530354, "ref_ce_loss": 0.11678522825241089, "step": 6720 }, { "epoch": 2.2414943295530354, "loss": 0.6083236932754517, "step": 6720 }, { "ce_loss": 0.1929941177368164, "epoch": 2.2414943295530354, "step": 6720 }, { "distill_loss": 0.09316430985927582, "epoch": 2.2414943295530354, "step": 6720 }, { "epoch": 2.2414943295530354, "ref_ce_loss": 0.1361846625804901, "step": 6720 }, { "epoch": 2.2414943295530354, "loss": 0.4545796811580658, "step": 6720 }, { "ce_loss": 0.18878419697284698, "epoch": 2.2414943295530354, "step": 6720 }, { "distill_loss": 0.14338716864585876, "epoch": 2.2414943295530354, "step": 6720 }, { "epoch": 2.2414943295530354, "ref_ce_loss": 0.09671805799007416, "step": 6720 }, { "epoch": 2.2448298865910608, "loss": 0.6404, "step": 6730 }, { "epoch": 2.2448298865910608, "grad_norm": 3.4351155757904053, "step": 6730 }, { "epoch": 2.2448298865910608, "learning_rate": 0.0002532512272324472, "step": 6730 }, { "epoch": 2.2448298865910608, "loss": 0.4270901381969452, "step": 6730 }, { "ce_loss": 0.08665609359741211, "epoch": 2.2448298865910608, "step": 6730 }, { "distill_loss": 0.15767119824886322, "epoch": 2.2448298865910608, "step": 6730 }, { "epoch": 2.2448298865910608, "ref_ce_loss": 0.12011439353227615, "step": 6730 }, { "epoch": 2.2448298865910608, "loss": 0.48001086711883545, "step": 6730 }, { "ce_loss": 0.13299185037612915, "epoch": 2.2448298865910608, "step": 6730 }, { "distill_loss": 0.14891919493675232, "epoch": 2.2448298865910608, "step": 6730 }, { "epoch": 2.2448298865910608, "ref_ce_loss": 0.108582504093647, "step": 6730 }, { "epoch": 2.2448298865910608, "loss": 0.6431511640548706, "step": 6730 }, { "ce_loss": 0.14817194640636444, "epoch": 2.2448298865910608, "step": 6730 }, { "distill_loss": 0.13265648484230042, "epoch": 2.2448298865910608, "step": 6730 }, { "epoch": 2.2448298865910608, "ref_ce_loss": 0.09085676819086075, "step": 6730 }, { "epoch": 2.2448298865910608, "loss": 0.6905701756477356, "step": 6730 }, { "ce_loss": 0.23921699821949005, "epoch": 2.2448298865910608, "step": 6730 }, { "distill_loss": 0.20255146920681, "epoch": 2.2448298865910608, "step": 6730 }, { "epoch": 2.2448298865910608, "ref_ce_loss": 0.19140289723873138, "step": 6730 }, { "epoch": 2.248165443629086, "loss": 0.6584, "step": 6740 }, { "epoch": 2.248165443629086, "grad_norm": 3.7747929096221924, "step": 6740 }, { "epoch": 2.248165443629086, "learning_rate": 0.0002531041977558868, "step": 6740 }, { "epoch": 2.248165443629086, "loss": 0.4672980308532715, "step": 6740 }, { "ce_loss": 0.15710696578025818, "epoch": 2.248165443629086, "step": 6740 }, { "distill_loss": 0.1362805962562561, "epoch": 2.248165443629086, "step": 6740 }, { "epoch": 2.248165443629086, "ref_ce_loss": 0.12380146235227585, "step": 6740 }, { "epoch": 2.248165443629086, "loss": 0.8580319285392761, "step": 6740 }, { "ce_loss": 0.15320658683776855, "epoch": 2.248165443629086, "step": 6740 }, { "distill_loss": 0.12126266956329346, "epoch": 2.248165443629086, "step": 6740 }, { "epoch": 2.248165443629086, "ref_ce_loss": 0.09768229722976685, "step": 6740 }, { "epoch": 2.248165443629086, "loss": 0.247745543718338, "step": 6740 }, { "ce_loss": 0.07998524606227875, "epoch": 2.248165443629086, "step": 6740 }, { "distill_loss": 0.08720527589321136, "epoch": 2.248165443629086, "step": 6740 }, { "epoch": 2.248165443629086, "ref_ce_loss": 0.049813829362392426, "step": 6740 }, { "epoch": 2.248165443629086, "loss": 0.464148610830307, "step": 6740 }, { "ce_loss": 0.16646577417850494, "epoch": 2.248165443629086, "step": 6740 }, { "distill_loss": 0.14713206887245178, "epoch": 2.248165443629086, "step": 6740 }, { "epoch": 2.248165443629086, "ref_ce_loss": 0.10102847218513489, "step": 6740 }, { "epoch": 2.2515010006671115, "loss": 0.551, "step": 6750 }, { "epoch": 2.2515010006671115, "grad_norm": 2.241534948348999, "step": 6750 }, { "epoch": 2.2515010006671115, "learning_rate": 0.00025295698025794094, "step": 6750 }, { "epoch": 2.2515010006671115, "loss": 0.5242320895195007, "step": 6750 }, { "ce_loss": 0.17825943231582642, "epoch": 2.2515010006671115, "step": 6750 }, { "distill_loss": 0.14979057013988495, "epoch": 2.2515010006671115, "step": 6750 }, { "epoch": 2.2515010006671115, "ref_ce_loss": 0.15575054287910461, "step": 6750 }, { "epoch": 2.2515010006671115, "loss": 0.4172029495239258, "step": 6750 }, { "ce_loss": 0.16601456701755524, "epoch": 2.2515010006671115, "step": 6750 }, { "distill_loss": 0.14670082926750183, "epoch": 2.2515010006671115, "step": 6750 }, { "epoch": 2.2515010006671115, "ref_ce_loss": 0.1038890928030014, "step": 6750 }, { "epoch": 2.2515010006671115, "loss": 0.3701702356338501, "step": 6750 }, { "ce_loss": 0.1041710376739502, "epoch": 2.2515010006671115, "step": 6750 }, { "distill_loss": 0.10256893187761307, "epoch": 2.2515010006671115, "step": 6750 }, { "epoch": 2.2515010006671115, "ref_ce_loss": 0.12215316295623779, "step": 6750 }, { "epoch": 2.2515010006671115, "loss": 0.2804224193096161, "step": 6750 }, { "ce_loss": 0.06712121516466141, "epoch": 2.2515010006671115, "step": 6750 }, { "distill_loss": 0.08440475165843964, "epoch": 2.2515010006671115, "step": 6750 }, { "epoch": 2.2515010006671115, "ref_ce_loss": 0.0904068648815155, "step": 6750 }, { "epoch": 2.254836557705137, "loss": 0.537, "step": 6760 }, { "epoch": 2.254836557705137, "grad_norm": 3.29146146774292, "step": 6760 }, { "epoch": 2.254836557705137, "learning_rate": 0.0002528095750070764, "step": 6760 }, { "epoch": 2.254836557705137, "loss": 0.30096712708473206, "step": 6760 }, { "ce_loss": 0.06180081143975258, "epoch": 2.254836557705137, "step": 6760 }, { "distill_loss": 0.09566116333007812, "epoch": 2.254836557705137, "step": 6760 }, { "epoch": 2.254836557705137, "ref_ce_loss": 0.09152912348508835, "step": 6760 }, { "epoch": 2.254836557705137, "loss": 0.6212273836135864, "step": 6760 }, { "ce_loss": 0.3123440444469452, "epoch": 2.254836557705137, "step": 6760 }, { "distill_loss": 0.12399928271770477, "epoch": 2.254836557705137, "step": 6760 }, { "epoch": 2.254836557705137, "ref_ce_loss": 0.18479116261005402, "step": 6760 }, { "epoch": 2.254836557705137, "loss": 0.45415574312210083, "step": 6760 }, { "ce_loss": 0.18255992233753204, "epoch": 2.254836557705137, "step": 6760 }, { "distill_loss": 0.12544983625411987, "epoch": 2.254836557705137, "step": 6760 }, { "epoch": 2.254836557705137, "ref_ce_loss": 0.08303073048591614, "step": 6760 }, { "epoch": 2.254836557705137, "loss": 0.599034309387207, "step": 6760 }, { "ce_loss": 0.2385578453540802, "epoch": 2.254836557705137, "step": 6760 }, { "distill_loss": 0.13109007477760315, "epoch": 2.254836557705137, "step": 6760 }, { "epoch": 2.254836557705137, "ref_ce_loss": 0.12280596792697906, "step": 6760 }, { "epoch": 2.258172114743162, "loss": 0.5318, "step": 6770 }, { "epoch": 2.258172114743162, "grad_norm": 3.4085757732391357, "step": 6770 }, { "epoch": 2.258172114743162, "learning_rate": 0.00025266198227210203, "step": 6770 }, { "epoch": 2.258172114743162, "loss": 0.31608596444129944, "step": 6770 }, { "ce_loss": 0.07763687521219254, "epoch": 2.258172114743162, "step": 6770 }, { "distill_loss": 0.10791601240634918, "epoch": 2.258172114743162, "step": 6770 }, { "epoch": 2.258172114743162, "ref_ce_loss": 0.07429070770740509, "step": 6770 }, { "epoch": 2.258172114743162, "loss": 0.417531818151474, "step": 6770 }, { "ce_loss": 0.14198024570941925, "epoch": 2.258172114743162, "step": 6770 }, { "distill_loss": 0.13423949480056763, "epoch": 2.258172114743162, "step": 6770 }, { "epoch": 2.258172114743162, "ref_ce_loss": 0.14104554057121277, "step": 6770 }, { "epoch": 2.258172114743162, "loss": 0.4688641428947449, "step": 6770 }, { "ce_loss": 0.21024879813194275, "epoch": 2.258172114743162, "step": 6770 }, { "distill_loss": 0.1353413462638855, "epoch": 2.258172114743162, "step": 6770 }, { "epoch": 2.258172114743162, "ref_ce_loss": 0.12314224988222122, "step": 6770 }, { "epoch": 2.258172114743162, "loss": 0.4443889856338501, "step": 6770 }, { "ce_loss": 0.106757752597332, "epoch": 2.258172114743162, "step": 6770 }, { "distill_loss": 0.10572335869073868, "epoch": 2.258172114743162, "step": 6770 }, { "epoch": 2.258172114743162, "ref_ce_loss": 0.110373854637146, "step": 6770 }, { "epoch": 2.2615076717811875, "loss": 0.5922, "step": 6780 }, { "epoch": 2.2615076717811875, "grad_norm": 3.920358180999756, "step": 6780 }, { "epoch": 2.2615076717811875, "learning_rate": 0.0002525142023221689, "step": 6780 }, { "epoch": 2.2615076717811875, "loss": 0.2829582393169403, "step": 6780 }, { "ce_loss": 0.11151076853275299, "epoch": 2.2615076717811875, "step": 6780 }, { "distill_loss": 0.10000155121088028, "epoch": 2.2615076717811875, "step": 6780 }, { "epoch": 2.2615076717811875, "ref_ce_loss": 0.07125628739595413, "step": 6780 }, { "epoch": 2.2615076717811875, "loss": 0.5526301264762878, "step": 6780 }, { "ce_loss": 0.17572642862796783, "epoch": 2.2615076717811875, "step": 6780 }, { "distill_loss": 0.15626123547554016, "epoch": 2.2615076717811875, "step": 6780 }, { "epoch": 2.2615076717811875, "ref_ce_loss": 0.1614377349615097, "step": 6780 }, { "epoch": 2.2615076717811875, "loss": 0.5078950524330139, "step": 6780 }, { "ce_loss": 0.16883182525634766, "epoch": 2.2615076717811875, "step": 6780 }, { "distill_loss": 0.11712483316659927, "epoch": 2.2615076717811875, "step": 6780 }, { "epoch": 2.2615076717811875, "ref_ce_loss": 0.1466582715511322, "step": 6780 }, { "epoch": 2.2615076717811875, "loss": 0.7580641508102417, "step": 6780 }, { "ce_loss": 0.10384313017129898, "epoch": 2.2615076717811875, "step": 6780 }, { "distill_loss": 0.10925843566656113, "epoch": 2.2615076717811875, "step": 6780 }, { "epoch": 2.2615076717811875, "ref_ce_loss": 0.08862993121147156, "step": 6780 }, { "epoch": 2.264843228819213, "loss": 0.5518, "step": 6790 }, { "epoch": 2.264843228819213, "grad_norm": 1.862863302230835, "step": 6790 }, { "epoch": 2.264843228819213, "learning_rate": 0.0002523662354267693, "step": 6790 }, { "epoch": 2.264843228819213, "loss": 0.2049292027950287, "step": 6790 }, { "ce_loss": 0.054800186306238174, "epoch": 2.264843228819213, "step": 6790 }, { "distill_loss": 0.05896997079253197, "epoch": 2.264843228819213, "step": 6790 }, { "epoch": 2.264843228819213, "ref_ce_loss": 0.055623188614845276, "step": 6790 }, { "epoch": 2.264843228819213, "loss": 0.4574205279350281, "step": 6790 }, { "ce_loss": 0.16178199648857117, "epoch": 2.264843228819213, "step": 6790 }, { "distill_loss": 0.0981992781162262, "epoch": 2.264843228819213, "step": 6790 }, { "epoch": 2.264843228819213, "ref_ce_loss": 0.10306326299905777, "step": 6790 }, { "epoch": 2.264843228819213, "loss": 0.3297522962093353, "step": 6790 }, { "ce_loss": 0.10595489293336868, "epoch": 2.264843228819213, "step": 6790 }, { "distill_loss": 0.1075303927063942, "epoch": 2.264843228819213, "step": 6790 }, { "epoch": 2.264843228819213, "ref_ce_loss": 0.0912047028541565, "step": 6790 }, { "epoch": 2.264843228819213, "loss": 0.932381272315979, "step": 6790 }, { "ce_loss": 0.19181010127067566, "epoch": 2.264843228819213, "step": 6790 }, { "distill_loss": 0.13706378638744354, "epoch": 2.264843228819213, "step": 6790 }, { "epoch": 2.264843228819213, "ref_ce_loss": 0.18505807220935822, "step": 6790 }, { "epoch": 2.268178785857238, "loss": 0.5435, "step": 6800 }, { "epoch": 2.268178785857238, "grad_norm": 3.101024627685547, "step": 6800 }, { "epoch": 2.268178785857238, "learning_rate": 0.0002522180818557364, "step": 6800 }, { "epoch": 2.268178785857238, "loss": 0.7316094040870667, "step": 6800 }, { "ce_loss": 0.12107143551111221, "epoch": 2.268178785857238, "step": 6800 }, { "distill_loss": 0.09981100261211395, "epoch": 2.268178785857238, "step": 6800 }, { "epoch": 2.268178785857238, "ref_ce_loss": 0.11426037549972534, "step": 6800 }, { "epoch": 2.268178785857238, "loss": 0.4724943935871124, "step": 6800 }, { "ce_loss": 0.19530630111694336, "epoch": 2.268178785857238, "step": 6800 }, { "distill_loss": 0.1164764016866684, "epoch": 2.268178785857238, "step": 6800 }, { "epoch": 2.268178785857238, "ref_ce_loss": 0.12989644706249237, "step": 6800 }, { "epoch": 2.268178785857238, "loss": 0.288299024105072, "step": 6800 }, { "ce_loss": 0.08834747225046158, "epoch": 2.268178785857238, "step": 6800 }, { "distill_loss": 0.07737602293491364, "epoch": 2.268178785857238, "step": 6800 }, { "epoch": 2.268178785857238, "ref_ce_loss": 0.12200861424207687, "step": 6800 }, { "epoch": 2.268178785857238, "loss": 0.3896799087524414, "step": 6800 }, { "ce_loss": 0.14076721668243408, "epoch": 2.268178785857238, "step": 6800 }, { "distill_loss": 0.09837865829467773, "epoch": 2.268178785857238, "step": 6800 }, { "epoch": 2.268178785857238, "ref_ce_loss": 0.10813792049884796, "step": 6800 }, { "epoch": 2.2715143428952635, "loss": 0.5463, "step": 6810 }, { "epoch": 2.2715143428952635, "grad_norm": 3.9756486415863037, "step": 6810 }, { "epoch": 2.2715143428952635, "learning_rate": 0.00025206974187924397, "step": 6810 }, { "epoch": 2.2715143428952635, "loss": 0.7348902821540833, "step": 6810 }, { "ce_loss": 0.1007981151342392, "epoch": 2.2715143428952635, "step": 6810 }, { "distill_loss": 0.07493539154529572, "epoch": 2.2715143428952635, "step": 6810 }, { "epoch": 2.2715143428952635, "ref_ce_loss": 0.1073109582066536, "step": 6810 }, { "epoch": 2.2715143428952635, "loss": 0.7470747232437134, "step": 6810 }, { "ce_loss": 0.2006990760564804, "epoch": 2.2715143428952635, "step": 6810 }, { "distill_loss": 0.0991901382803917, "epoch": 2.2715143428952635, "step": 6810 }, { "epoch": 2.2715143428952635, "ref_ce_loss": 0.11322102695703506, "step": 6810 }, { "epoch": 2.2715143428952635, "loss": 0.49651744961738586, "step": 6810 }, { "ce_loss": 0.19121769070625305, "epoch": 2.2715143428952635, "step": 6810 }, { "distill_loss": 0.08532430976629257, "epoch": 2.2715143428952635, "step": 6810 }, { "epoch": 2.2715143428952635, "ref_ce_loss": 0.1548120677471161, "step": 6810 }, { "epoch": 2.2715143428952635, "loss": 0.44100818037986755, "step": 6810 }, { "ce_loss": 0.20114682614803314, "epoch": 2.2715143428952635, "step": 6810 }, { "distill_loss": 0.13773350417613983, "epoch": 2.2715143428952635, "step": 6810 }, { "epoch": 2.2715143428952635, "ref_ce_loss": 0.0654730275273323, "step": 6810 }, { "epoch": 2.274849899933289, "loss": 0.5656, "step": 6820 }, { "epoch": 2.274849899933289, "grad_norm": 2.955838441848755, "step": 6820 }, { "epoch": 2.274849899933289, "learning_rate": 0.0002519212157678056, "step": 6820 }, { "epoch": 2.274849899933289, "loss": 0.5254935622215271, "step": 6820 }, { "ce_loss": 0.24336126446723938, "epoch": 2.274849899933289, "step": 6820 }, { "distill_loss": 0.13984636962413788, "epoch": 2.274849899933289, "step": 6820 }, { "epoch": 2.274849899933289, "ref_ce_loss": 0.10717921704053879, "step": 6820 }, { "epoch": 2.274849899933289, "loss": 0.861416220664978, "step": 6820 }, { "ce_loss": 0.17388497292995453, "epoch": 2.274849899933289, "step": 6820 }, { "distill_loss": 0.12055382877588272, "epoch": 2.274849899933289, "step": 6820 }, { "epoch": 2.274849899933289, "ref_ce_loss": 0.1794699728488922, "step": 6820 }, { "epoch": 2.274849899933289, "loss": 0.44711706042289734, "step": 6820 }, { "ce_loss": 0.21135549247264862, "epoch": 2.274849899933289, "step": 6820 }, { "distill_loss": 0.1068238839507103, "epoch": 2.274849899933289, "step": 6820 }, { "epoch": 2.274849899933289, "ref_ce_loss": 0.12878869473934174, "step": 6820 }, { "epoch": 2.274849899933289, "loss": 0.9810287952423096, "step": 6820 }, { "ce_loss": 0.24978908896446228, "epoch": 2.274849899933289, "step": 6820 }, { "distill_loss": 0.11645025759935379, "epoch": 2.274849899933289, "step": 6820 }, { "epoch": 2.274849899933289, "ref_ce_loss": 0.1504693478345871, "step": 6820 }, { "epoch": 2.2781854569713142, "loss": 0.4938, "step": 6830 }, { "epoch": 2.2781854569713142, "grad_norm": 1.7476575374603271, "step": 6830 }, { "epoch": 2.2781854569713142, "learning_rate": 0.00025177250379227427, "step": 6830 }, { "epoch": 2.2781854569713142, "loss": 0.39564117789268494, "step": 6830 }, { "ce_loss": 0.16511234641075134, "epoch": 2.2781854569713142, "step": 6830 }, { "distill_loss": 0.1051425114274025, "epoch": 2.2781854569713142, "step": 6830 }, { "epoch": 2.2781854569713142, "ref_ce_loss": 0.10434064269065857, "step": 6830 }, { "epoch": 2.2781854569713142, "loss": 0.4065183103084564, "step": 6830 }, { "ce_loss": 0.1354883462190628, "epoch": 2.2781854569713142, "step": 6830 }, { "distill_loss": 0.08841560781002045, "epoch": 2.2781854569713142, "step": 6830 }, { "epoch": 2.2781854569713142, "ref_ce_loss": 0.10876308381557465, "step": 6830 }, { "epoch": 2.2781854569713142, "loss": 0.553986668586731, "step": 6830 }, { "ce_loss": 0.16752132773399353, "epoch": 2.2781854569713142, "step": 6830 }, { "distill_loss": 0.09990599751472473, "epoch": 2.2781854569713142, "step": 6830 }, { "epoch": 2.2781854569713142, "ref_ce_loss": 0.190854012966156, "step": 6830 }, { "epoch": 2.2781854569713142, "loss": 0.6248155236244202, "step": 6830 }, { "ce_loss": 0.1778598129749298, "epoch": 2.2781854569713142, "step": 6830 }, { "distill_loss": 0.12430022656917572, "epoch": 2.2781854569713142, "step": 6830 }, { "epoch": 2.2781854569713142, "ref_ce_loss": 0.15459713339805603, "step": 6830 }, { "epoch": 2.2815210140093396, "loss": 0.5051, "step": 6840 }, { "epoch": 2.2815210140093396, "grad_norm": 2.298872232437134, "step": 6840 }, { "epoch": 2.2815210140093396, "learning_rate": 0.00025162360622384204, "step": 6840 }, { "epoch": 2.2815210140093396, "loss": 0.9022257328033447, "step": 6840 }, { "ce_loss": 0.13605789840221405, "epoch": 2.2815210140093396, "step": 6840 }, { "distill_loss": 0.09870048612356186, "epoch": 2.2815210140093396, "step": 6840 }, { "epoch": 2.2815210140093396, "ref_ce_loss": 0.1417873352766037, "step": 6840 }, { "epoch": 2.2815210140093396, "loss": 0.48904597759246826, "step": 6840 }, { "ce_loss": 0.24442356824874878, "epoch": 2.2815210140093396, "step": 6840 }, { "distill_loss": 0.10273656249046326, "epoch": 2.2815210140093396, "step": 6840 }, { "epoch": 2.2815210140093396, "ref_ce_loss": 0.1416039913892746, "step": 6840 }, { "epoch": 2.2815210140093396, "loss": 0.4396345317363739, "step": 6840 }, { "ce_loss": 0.1511615812778473, "epoch": 2.2815210140093396, "step": 6840 }, { "distill_loss": 0.08964379131793976, "epoch": 2.2815210140093396, "step": 6840 }, { "epoch": 2.2815210140093396, "ref_ce_loss": 0.11310078203678131, "step": 6840 }, { "epoch": 2.2815210140093396, "loss": 0.717617392539978, "step": 6840 }, { "ce_loss": 0.22391705214977264, "epoch": 2.2815210140093396, "step": 6840 }, { "distill_loss": 0.12639391422271729, "epoch": 2.2815210140093396, "step": 6840 }, { "epoch": 2.2815210140093396, "ref_ce_loss": 0.15380483865737915, "step": 6840 }, { "epoch": 2.284856571047365, "loss": 0.5406, "step": 6850 }, { "epoch": 2.284856571047365, "grad_norm": 4.39909553527832, "step": 6850 }, { "epoch": 2.284856571047365, "learning_rate": 0.0002514745233340393, "step": 6850 }, { "epoch": 2.284856571047365, "loss": 0.33096733689308167, "step": 6850 }, { "ce_loss": 0.06250643730163574, "epoch": 2.284856571047365, "step": 6850 }, { "distill_loss": 0.09241592884063721, "epoch": 2.284856571047365, "step": 6850 }, { "epoch": 2.284856571047365, "ref_ce_loss": 0.08213594555854797, "step": 6850 }, { "epoch": 2.284856571047365, "loss": 0.4952237010002136, "step": 6850 }, { "ce_loss": 0.19607624411582947, "epoch": 2.284856571047365, "step": 6850 }, { "distill_loss": 0.11687838286161423, "epoch": 2.284856571047365, "step": 6850 }, { "epoch": 2.284856571047365, "ref_ce_loss": 0.13495883345603943, "step": 6850 }, { "epoch": 2.284856571047365, "loss": 0.28248468041419983, "step": 6850 }, { "ce_loss": 0.07966279238462448, "epoch": 2.284856571047365, "step": 6850 }, { "distill_loss": 0.08891388028860092, "epoch": 2.284856571047365, "step": 6850 }, { "epoch": 2.284856571047365, "ref_ce_loss": 0.11379349231719971, "step": 6850 }, { "epoch": 2.284856571047365, "loss": 0.6294571161270142, "step": 6850 }, { "ce_loss": 0.23358556628227234, "epoch": 2.284856571047365, "step": 6850 }, { "distill_loss": 0.12204214185476303, "epoch": 2.284856571047365, "step": 6850 }, { "epoch": 2.284856571047365, "ref_ce_loss": 0.11346219480037689, "step": 6850 }, { "epoch": 2.2881921280853903, "loss": 0.5312, "step": 6860 }, { "epoch": 2.2881921280853903, "grad_norm": 2.713907241821289, "step": 6860 }, { "epoch": 2.2881921280853903, "learning_rate": 0.0002513252553947344, "step": 6860 }, { "epoch": 2.2881921280853903, "loss": 1.2297799587249756, "step": 6860 }, { "ce_loss": 0.2428199201822281, "epoch": 2.2881921280853903, "step": 6860 }, { "distill_loss": 0.10671888291835785, "epoch": 2.2881921280853903, "step": 6860 }, { "epoch": 2.2881921280853903, "ref_ce_loss": 0.0930713415145874, "step": 6860 }, { "epoch": 2.2881921280853903, "loss": 0.4076330065727234, "step": 6860 }, { "ce_loss": 0.1973707377910614, "epoch": 2.2881921280853903, "step": 6860 }, { "distill_loss": 0.09870386868715286, "epoch": 2.2881921280853903, "step": 6860 }, { "epoch": 2.2881921280853903, "ref_ce_loss": 0.11129038035869598, "step": 6860 }, { "epoch": 2.2881921280853903, "loss": 0.49984055757522583, "step": 6860 }, { "ce_loss": 0.25557148456573486, "epoch": 2.2881921280853903, "step": 6860 }, { "distill_loss": 0.11275038868188858, "epoch": 2.2881921280853903, "step": 6860 }, { "epoch": 2.2881921280853903, "ref_ce_loss": 0.1314382702112198, "step": 6860 }, { "epoch": 2.2881921280853903, "loss": 0.3817070722579956, "step": 6860 }, { "ce_loss": 0.10354137420654297, "epoch": 2.2881921280853903, "step": 6860 }, { "distill_loss": 0.09449654817581177, "epoch": 2.2881921280853903, "step": 6860 }, { "epoch": 2.2881921280853903, "ref_ce_loss": 0.1289808601140976, "step": 6860 }, { "epoch": 2.2915276851234156, "loss": 0.613, "step": 6870 }, { "epoch": 2.2915276851234156, "grad_norm": 4.802642345428467, "step": 6870 }, { "epoch": 2.2915276851234156, "learning_rate": 0.00025117580267813324, "step": 6870 }, { "epoch": 2.2915276851234156, "loss": 0.40103641152381897, "step": 6870 }, { "ce_loss": 0.12699085474014282, "epoch": 2.2915276851234156, "step": 6870 }, { "distill_loss": 0.10826943814754486, "epoch": 2.2915276851234156, "step": 6870 }, { "epoch": 2.2915276851234156, "ref_ce_loss": 0.11575514823198318, "step": 6870 }, { "epoch": 2.2915276851234156, "loss": 0.5134308934211731, "step": 6870 }, { "ce_loss": 0.17653977870941162, "epoch": 2.2915276851234156, "step": 6870 }, { "distill_loss": 0.10092929750680923, "epoch": 2.2915276851234156, "step": 6870 }, { "epoch": 2.2915276851234156, "ref_ce_loss": 0.12954755127429962, "step": 6870 }, { "epoch": 2.2915276851234156, "loss": 0.49596887826919556, "step": 6870 }, { "ce_loss": 0.12782566249370575, "epoch": 2.2915276851234156, "step": 6870 }, { "distill_loss": 0.0934700220823288, "epoch": 2.2915276851234156, "step": 6870 }, { "epoch": 2.2915276851234156, "ref_ce_loss": 0.11930210143327713, "step": 6870 }, { "epoch": 2.2915276851234156, "loss": 0.4235532283782959, "step": 6870 }, { "ce_loss": 0.1441873013973236, "epoch": 2.2915276851234156, "step": 6870 }, { "distill_loss": 0.13626454770565033, "epoch": 2.2915276851234156, "step": 6870 }, { "epoch": 2.2915276851234156, "ref_ce_loss": 0.09318730980157852, "step": 6870 }, { "epoch": 2.294863242161441, "loss": 0.5244, "step": 6880 }, { "epoch": 2.294863242161441, "grad_norm": 1.8829163312911987, "step": 6880 }, { "epoch": 2.294863242161441, "learning_rate": 0.00025102616545677855, "step": 6880 }, { "epoch": 2.294863242161441, "loss": 0.550043523311615, "step": 6880 }, { "ce_loss": 0.2702171802520752, "epoch": 2.294863242161441, "step": 6880 }, { "distill_loss": 0.11289297789335251, "epoch": 2.294863242161441, "step": 6880 }, { "epoch": 2.294863242161441, "ref_ce_loss": 0.1368609368801117, "step": 6880 }, { "epoch": 2.294863242161441, "loss": 0.532550573348999, "step": 6880 }, { "ce_loss": 0.1481907218694687, "epoch": 2.294863242161441, "step": 6880 }, { "distill_loss": 0.1028796136379242, "epoch": 2.294863242161441, "step": 6880 }, { "epoch": 2.294863242161441, "ref_ce_loss": 0.12922383844852448, "step": 6880 }, { "epoch": 2.294863242161441, "loss": 0.41644102334976196, "step": 6880 }, { "ce_loss": 0.17733660340309143, "epoch": 2.294863242161441, "step": 6880 }, { "distill_loss": 0.09103229641914368, "epoch": 2.294863242161441, "step": 6880 }, { "epoch": 2.294863242161441, "ref_ce_loss": 0.14790113270282745, "step": 6880 }, { "epoch": 2.294863242161441, "loss": 0.3858429789543152, "step": 6880 }, { "ce_loss": 0.11549221724271774, "epoch": 2.294863242161441, "step": 6880 }, { "distill_loss": 0.10008694231510162, "epoch": 2.294863242161441, "step": 6880 }, { "epoch": 2.294863242161441, "ref_ce_loss": 0.10730014741420746, "step": 6880 }, { "epoch": 2.2981987991994663, "loss": 0.5339, "step": 6890 }, { "epoch": 2.2981987991994663, "grad_norm": 3.124227523803711, "step": 6890 }, { "epoch": 2.2981987991994663, "learning_rate": 0.0002508763440035497, "step": 6890 }, { "epoch": 2.2981987991994663, "loss": 0.4120410978794098, "step": 6890 }, { "ce_loss": 0.17701852321624756, "epoch": 2.2981987991994663, "step": 6890 }, { "distill_loss": 0.11152602732181549, "epoch": 2.2981987991994663, "step": 6890 }, { "epoch": 2.2981987991994663, "ref_ce_loss": 0.0911896601319313, "step": 6890 }, { "epoch": 2.2981987991994663, "loss": 0.4871509373188019, "step": 6890 }, { "ce_loss": 0.20323820412158966, "epoch": 2.2981987991994663, "step": 6890 }, { "distill_loss": 0.11872399598360062, "epoch": 2.2981987991994663, "step": 6890 }, { "epoch": 2.2981987991994663, "ref_ce_loss": 0.16509409248828888, "step": 6890 }, { "epoch": 2.2981987991994663, "loss": 0.8271893262863159, "step": 6890 }, { "ce_loss": 0.20623229444026947, "epoch": 2.2981987991994663, "step": 6890 }, { "distill_loss": 0.17105066776275635, "epoch": 2.2981987991994663, "step": 6890 }, { "epoch": 2.2981987991994663, "ref_ce_loss": 0.11689022928476334, "step": 6890 }, { "epoch": 2.2981987991994663, "loss": 0.5279237627983093, "step": 6890 }, { "ce_loss": 0.2500387728214264, "epoch": 2.2981987991994663, "step": 6890 }, { "distill_loss": 0.1337040662765503, "epoch": 2.2981987991994663, "step": 6890 }, { "epoch": 2.2981987991994663, "ref_ce_loss": 0.10777636617422104, "step": 6890 }, { "epoch": 2.3015343562374917, "loss": 0.5284, "step": 6900 }, { "epoch": 2.3015343562374917, "grad_norm": 1.9892547130584717, "step": 6900 }, { "epoch": 2.3015343562374917, "learning_rate": 0.0002507263385916618, "step": 6900 }, { "epoch": 2.3015343562374917, "loss": 0.33847102522850037, "step": 6900 }, { "ce_loss": 0.07629029452800751, "epoch": 2.3015343562374917, "step": 6900 }, { "distill_loss": 0.09815974533557892, "epoch": 2.3015343562374917, "step": 6900 }, { "epoch": 2.3015343562374917, "ref_ce_loss": 0.11757113039493561, "step": 6900 }, { "epoch": 2.3015343562374917, "loss": 0.8411189317703247, "step": 6900 }, { "ce_loss": 0.17542950809001923, "epoch": 2.3015343562374917, "step": 6900 }, { "distill_loss": 0.12596750259399414, "epoch": 2.3015343562374917, "step": 6900 }, { "epoch": 2.3015343562374917, "ref_ce_loss": 0.0883040726184845, "step": 6900 }, { "epoch": 2.3015343562374917, "loss": 0.6303671598434448, "step": 6900 }, { "ce_loss": 0.07354290038347244, "epoch": 2.3015343562374917, "step": 6900 }, { "distill_loss": 0.08964565396308899, "epoch": 2.3015343562374917, "step": 6900 }, { "epoch": 2.3015343562374917, "ref_ce_loss": 0.11973276734352112, "step": 6900 }, { "epoch": 2.3015343562374917, "loss": 0.9342833757400513, "step": 6900 }, { "ce_loss": 0.3235514163970947, "epoch": 2.3015343562374917, "step": 6900 }, { "distill_loss": 0.10573019087314606, "epoch": 2.3015343562374917, "step": 6900 }, { "epoch": 2.3015343562374917, "ref_ce_loss": 0.21635988354682922, "step": 6900 }, { "epoch": 2.304869913275517, "loss": 0.547, "step": 6910 }, { "epoch": 2.304869913275517, "grad_norm": 3.8128700256347656, "step": 6910 }, { "epoch": 2.304869913275517, "learning_rate": 0.00025057614949466564, "step": 6910 }, { "epoch": 2.304869913275517, "loss": 0.38203656673431396, "step": 6910 }, { "ce_loss": 0.15076223015785217, "epoch": 2.304869913275517, "step": 6910 }, { "distill_loss": 0.11055965721607208, "epoch": 2.304869913275517, "step": 6910 }, { "epoch": 2.304869913275517, "ref_ce_loss": 0.11993446946144104, "step": 6910 }, { "epoch": 2.304869913275517, "loss": 0.3910497725009918, "step": 6910 }, { "ce_loss": 0.1450098156929016, "epoch": 2.304869913275517, "step": 6910 }, { "distill_loss": 0.08259563148021698, "epoch": 2.304869913275517, "step": 6910 }, { "epoch": 2.304869913275517, "ref_ce_loss": 0.11308462172746658, "step": 6910 }, { "epoch": 2.304869913275517, "loss": 0.4037795066833496, "step": 6910 }, { "ce_loss": 0.14874710142612457, "epoch": 2.304869913275517, "step": 6910 }, { "distill_loss": 0.094327911734581, "epoch": 2.304869913275517, "step": 6910 }, { "epoch": 2.304869913275517, "ref_ce_loss": 0.06298214197158813, "step": 6910 }, { "epoch": 2.304869913275517, "loss": 0.3290916681289673, "step": 6910 }, { "ce_loss": 0.12153016775846481, "epoch": 2.304869913275517, "step": 6910 }, { "distill_loss": 0.09355287253856659, "epoch": 2.304869913275517, "step": 6910 }, { "epoch": 2.304869913275517, "ref_ce_loss": 0.0676640197634697, "step": 6910 }, { "epoch": 2.3082054703135424, "loss": 0.5028, "step": 6920 }, { "epoch": 2.3082054703135424, "grad_norm": 2.110680341720581, "step": 6920 }, { "epoch": 2.3082054703135424, "learning_rate": 0.0002504257769864468, "step": 6920 }, { "epoch": 2.3082054703135424, "loss": 0.35967057943344116, "step": 6920 }, { "ce_loss": 0.10454410314559937, "epoch": 2.3082054703135424, "step": 6920 }, { "distill_loss": 0.0831480473279953, "epoch": 2.3082054703135424, "step": 6920 }, { "epoch": 2.3082054703135424, "ref_ce_loss": 0.1044725775718689, "step": 6920 }, { "epoch": 2.3082054703135424, "loss": 0.4940726161003113, "step": 6920 }, { "ce_loss": 0.1524488776922226, "epoch": 2.3082054703135424, "step": 6920 }, { "distill_loss": 0.10413416475057602, "epoch": 2.3082054703135424, "step": 6920 }, { "epoch": 2.3082054703135424, "ref_ce_loss": 0.14821214973926544, "step": 6920 }, { "epoch": 2.3082054703135424, "loss": 0.6797365546226501, "step": 6920 }, { "ce_loss": 0.2348724603652954, "epoch": 2.3082054703135424, "step": 6920 }, { "distill_loss": 0.17091615498065948, "epoch": 2.3082054703135424, "step": 6920 }, { "epoch": 2.3082054703135424, "ref_ce_loss": 0.1520233303308487, "step": 6920 }, { "epoch": 2.3082054703135424, "loss": 0.6118256449699402, "step": 6920 }, { "ce_loss": 0.2456093281507492, "epoch": 2.3082054703135424, "step": 6920 }, { "distill_loss": 0.1130257397890091, "epoch": 2.3082054703135424, "step": 6920 }, { "epoch": 2.3082054703135424, "ref_ce_loss": 0.1817878931760788, "step": 6920 }, { "epoch": 2.3115410273515677, "loss": 0.5564, "step": 6930 }, { "epoch": 2.3115410273515677, "grad_norm": 5.056583881378174, "step": 6930 }, { "epoch": 2.3115410273515677, "learning_rate": 0.0002502752213412255, "step": 6930 }, { "epoch": 2.3115410273515677, "loss": 0.5287309288978577, "step": 6930 }, { "ce_loss": 0.22509720921516418, "epoch": 2.3115410273515677, "step": 6930 }, { "distill_loss": 0.12229382991790771, "epoch": 2.3115410273515677, "step": 6930 }, { "epoch": 2.3115410273515677, "ref_ce_loss": 0.14213591814041138, "step": 6930 }, { "epoch": 2.3115410273515677, "loss": 0.3925534188747406, "step": 6930 }, { "ce_loss": 0.1103292927145958, "epoch": 2.3115410273515677, "step": 6930 }, { "distill_loss": 0.10397907346487045, "epoch": 2.3115410273515677, "step": 6930 }, { "epoch": 2.3115410273515677, "ref_ce_loss": 0.09071511775255203, "step": 6930 }, { "epoch": 2.3115410273515677, "loss": 0.7502164840698242, "step": 6930 }, { "ce_loss": 0.25732266902923584, "epoch": 2.3115410273515677, "step": 6930 }, { "distill_loss": 0.13244201242923737, "epoch": 2.3115410273515677, "step": 6930 }, { "epoch": 2.3115410273515677, "ref_ce_loss": 0.16759437322616577, "step": 6930 }, { "epoch": 2.3115410273515677, "loss": 0.42582589387893677, "step": 6930 }, { "ce_loss": 0.13769946992397308, "epoch": 2.3115410273515677, "step": 6930 }, { "distill_loss": 0.08645257353782654, "epoch": 2.3115410273515677, "step": 6930 }, { "epoch": 2.3115410273515677, "ref_ce_loss": 0.1043008342385292, "step": 6930 }, { "epoch": 2.314876584389593, "loss": 0.533, "step": 6940 }, { "epoch": 2.314876584389593, "grad_norm": 5.912909507751465, "step": 6940 }, { "epoch": 2.314876584389593, "learning_rate": 0.00025012448283355586, "step": 6940 }, { "epoch": 2.314876584389593, "loss": 0.4608493745326996, "step": 6940 }, { "ce_loss": 0.13830618560314178, "epoch": 2.314876584389593, "step": 6940 }, { "distill_loss": 0.1606249064207077, "epoch": 2.314876584389593, "step": 6940 }, { "epoch": 2.314876584389593, "ref_ce_loss": 0.13436095416545868, "step": 6940 }, { "epoch": 2.314876584389593, "loss": 0.3904031217098236, "step": 6940 }, { "ce_loss": 0.14121873676776886, "epoch": 2.314876584389593, "step": 6940 }, { "distill_loss": 0.18405036628246307, "epoch": 2.314876584389593, "step": 6940 }, { "epoch": 2.314876584389593, "ref_ce_loss": 0.06506030261516571, "step": 6940 }, { "epoch": 2.314876584389593, "loss": 0.8715546131134033, "step": 6940 }, { "ce_loss": 0.2018638551235199, "epoch": 2.314876584389593, "step": 6940 }, { "distill_loss": 0.20629999041557312, "epoch": 2.314876584389593, "step": 6940 }, { "epoch": 2.314876584389593, "ref_ce_loss": 0.11235152184963226, "step": 6940 }, { "epoch": 2.314876584389593, "loss": 0.5256648063659668, "step": 6940 }, { "ce_loss": 0.22348742187023163, "epoch": 2.314876584389593, "step": 6940 }, { "distill_loss": 0.20976266264915466, "epoch": 2.314876584389593, "step": 6940 }, { "epoch": 2.314876584389593, "ref_ce_loss": 0.09235940873622894, "step": 6940 }, { "epoch": 2.3182121414276184, "loss": 0.5821, "step": 6950 }, { "epoch": 2.3182121414276184, "grad_norm": 2.4662444591522217, "step": 6950 }, { "epoch": 2.3182121414276184, "learning_rate": 0.00024997356173832536, "step": 6950 }, { "epoch": 2.3182121414276184, "loss": 0.61894690990448, "step": 6950 }, { "ce_loss": 0.24030759930610657, "epoch": 2.3182121414276184, "step": 6950 }, { "distill_loss": 0.1709204614162445, "epoch": 2.3182121414276184, "step": 6950 }, { "epoch": 2.3182121414276184, "ref_ce_loss": 0.14262616634368896, "step": 6950 }, { "epoch": 2.3182121414276184, "loss": 0.7224009037017822, "step": 6950 }, { "ce_loss": 0.1713646501302719, "epoch": 2.3182121414276184, "step": 6950 }, { "distill_loss": 0.21908612549304962, "epoch": 2.3182121414276184, "step": 6950 }, { "epoch": 2.3182121414276184, "ref_ce_loss": 0.12334079295396805, "step": 6950 }, { "epoch": 2.3182121414276184, "loss": 0.6632697582244873, "step": 6950 }, { "ce_loss": 0.1673959642648697, "epoch": 2.3182121414276184, "step": 6950 }, { "distill_loss": 0.19471748173236847, "epoch": 2.3182121414276184, "step": 6950 }, { "epoch": 2.3182121414276184, "ref_ce_loss": 0.12122780829668045, "step": 6950 }, { "epoch": 2.3182121414276184, "loss": 0.8324764966964722, "step": 6950 }, { "ce_loss": 0.24417831003665924, "epoch": 2.3182121414276184, "step": 6950 }, { "distill_loss": 0.18310905992984772, "epoch": 2.3182121414276184, "step": 6950 }, { "epoch": 2.3182121414276184, "ref_ce_loss": 0.17036187648773193, "step": 6950 }, { "epoch": 2.321547698465644, "loss": 0.5725, "step": 6960 }, { "epoch": 2.321547698465644, "grad_norm": 4.33769416809082, "step": 6960 }, { "epoch": 2.321547698465644, "learning_rate": 0.00024982245833075466, "step": 6960 }, { "epoch": 2.321547698465644, "loss": 0.6916789412498474, "step": 6960 }, { "ce_loss": 0.23741775751113892, "epoch": 2.321547698465644, "step": 6960 }, { "distill_loss": 0.17516985535621643, "epoch": 2.321547698465644, "step": 6960 }, { "epoch": 2.321547698465644, "ref_ce_loss": 0.1561703383922577, "step": 6960 }, { "epoch": 2.321547698465644, "loss": 0.4892004132270813, "step": 6960 }, { "ce_loss": 0.10765889286994934, "epoch": 2.321547698465644, "step": 6960 }, { "distill_loss": 0.1549256592988968, "epoch": 2.321547698465644, "step": 6960 }, { "epoch": 2.321547698465644, "ref_ce_loss": 0.07869797199964523, "step": 6960 }, { "epoch": 2.321547698465644, "loss": 0.5315861701965332, "step": 6960 }, { "ce_loss": 0.23685623705387115, "epoch": 2.321547698465644, "step": 6960 }, { "distill_loss": 0.16759350895881653, "epoch": 2.321547698465644, "step": 6960 }, { "epoch": 2.321547698465644, "ref_ce_loss": 0.12687110900878906, "step": 6960 }, { "epoch": 2.321547698465644, "loss": 0.9347847700119019, "step": 6960 }, { "ce_loss": 0.32063230872154236, "epoch": 2.321547698465644, "step": 6960 }, { "distill_loss": 0.20522774755954742, "epoch": 2.321547698465644, "step": 6960 }, { "epoch": 2.321547698465644, "ref_ce_loss": 0.14673230051994324, "step": 6960 }, { "epoch": 2.324883255503669, "loss": 0.5492, "step": 6970 }, { "epoch": 2.324883255503669, "grad_norm": 2.2744076251983643, "step": 6970 }, { "epoch": 2.324883255503669, "learning_rate": 0.0002496711728863967, "step": 6970 }, { "epoch": 2.324883255503669, "loss": 0.7938024997711182, "step": 6970 }, { "ce_loss": 0.3438526391983032, "epoch": 2.324883255503669, "step": 6970 }, { "distill_loss": 0.18260395526885986, "epoch": 2.324883255503669, "step": 6970 }, { "epoch": 2.324883255503669, "ref_ce_loss": 0.13226082921028137, "step": 6970 }, { "epoch": 2.324883255503669, "loss": 0.48433953523635864, "step": 6970 }, { "ce_loss": 0.06481018662452698, "epoch": 2.324883255503669, "step": 6970 }, { "distill_loss": 0.1196662038564682, "epoch": 2.324883255503669, "step": 6970 }, { "epoch": 2.324883255503669, "ref_ce_loss": 0.09014244377613068, "step": 6970 }, { "epoch": 2.324883255503669, "loss": 0.7640020847320557, "step": 6970 }, { "ce_loss": 0.1803368777036667, "epoch": 2.324883255503669, "step": 6970 }, { "distill_loss": 0.1577882617712021, "epoch": 2.324883255503669, "step": 6970 }, { "epoch": 2.324883255503669, "ref_ce_loss": 0.11870657652616501, "step": 6970 }, { "epoch": 2.324883255503669, "loss": 0.482220321893692, "step": 6970 }, { "ce_loss": 0.16063043475151062, "epoch": 2.324883255503669, "step": 6970 }, { "distill_loss": 0.11836622655391693, "epoch": 2.324883255503669, "step": 6970 }, { "epoch": 2.324883255503669, "ref_ce_loss": 0.1077476367354393, "step": 6970 }, { "epoch": 2.3282188125416945, "loss": 0.5672, "step": 6980 }, { "epoch": 2.3282188125416945, "grad_norm": 1.7576720714569092, "step": 6980 }, { "epoch": 2.3282188125416945, "learning_rate": 0.00024951970568113643, "step": 6980 }, { "epoch": 2.3282188125416945, "loss": 0.4736265540122986, "step": 6980 }, { "ce_loss": 0.19416333734989166, "epoch": 2.3282188125416945, "step": 6980 }, { "distill_loss": 0.08972302824258804, "epoch": 2.3282188125416945, "step": 6980 }, { "epoch": 2.3282188125416945, "ref_ce_loss": 0.1895930916070938, "step": 6980 }, { "epoch": 2.3282188125416945, "loss": 0.84056156873703, "step": 6980 }, { "ce_loss": 0.1776997596025467, "epoch": 2.3282188125416945, "step": 6980 }, { "distill_loss": 0.10737963765859604, "epoch": 2.3282188125416945, "step": 6980 }, { "epoch": 2.3282188125416945, "ref_ce_loss": 0.20549514889717102, "step": 6980 }, { "epoch": 2.3282188125416945, "loss": 0.40008676052093506, "step": 6980 }, { "ce_loss": 0.17353692650794983, "epoch": 2.3282188125416945, "step": 6980 }, { "distill_loss": 0.11199227720499039, "epoch": 2.3282188125416945, "step": 6980 }, { "epoch": 2.3282188125416945, "ref_ce_loss": 0.06525639444589615, "step": 6980 }, { "epoch": 2.3282188125416945, "loss": 0.5162920355796814, "step": 6980 }, { "ce_loss": 0.1759594827890396, "epoch": 2.3282188125416945, "step": 6980 }, { "distill_loss": 0.08586210012435913, "epoch": 2.3282188125416945, "step": 6980 }, { "epoch": 2.3282188125416945, "ref_ce_loss": 0.1958712935447693, "step": 6980 }, { "epoch": 2.33155436957972, "loss": 0.5367, "step": 6990 }, { "epoch": 2.33155436957972, "grad_norm": 2.3340952396392822, "step": 6990 }, { "epoch": 2.33155436957972, "learning_rate": 0.00024936805699119033, "step": 6990 }, { "epoch": 2.33155436957972, "loss": 0.5874610543251038, "step": 6990 }, { "ce_loss": 0.24022972583770752, "epoch": 2.33155436957972, "step": 6990 }, { "distill_loss": 0.12990692257881165, "epoch": 2.33155436957972, "step": 6990 }, { "epoch": 2.33155436957972, "ref_ce_loss": 0.13573411107063293, "step": 6990 }, { "epoch": 2.33155436957972, "loss": 0.683862030506134, "step": 6990 }, { "ce_loss": 0.19738270342350006, "epoch": 2.33155436957972, "step": 6990 }, { "distill_loss": 0.12065930664539337, "epoch": 2.33155436957972, "step": 6990 }, { "epoch": 2.33155436957972, "ref_ce_loss": 0.13016332685947418, "step": 6990 }, { "epoch": 2.33155436957972, "loss": 0.5245134830474854, "step": 6990 }, { "ce_loss": 0.1756822019815445, "epoch": 2.33155436957972, "step": 6990 }, { "distill_loss": 0.09732089936733246, "epoch": 2.33155436957972, "step": 6990 }, { "epoch": 2.33155436957972, "ref_ce_loss": 0.11447296291589737, "step": 6990 }, { "epoch": 2.33155436957972, "loss": 0.43875250220298767, "step": 6990 }, { "ce_loss": 0.23195013403892517, "epoch": 2.33155436957972, "step": 6990 }, { "distill_loss": 0.10867423564195633, "epoch": 2.33155436957972, "step": 6990 }, { "epoch": 2.33155436957972, "ref_ce_loss": 0.09810131043195724, "step": 6990 }, { "epoch": 2.334889926617745, "loss": 0.5477, "step": 7000 }, { "epoch": 2.334889926617745, "grad_norm": 5.084808826446533, "step": 7000 }, { "epoch": 2.334889926617745, "learning_rate": 0.0002492162270931058, "step": 7000 }, { "epoch": 2.334889926617745, "loss": 0.6767491102218628, "step": 7000 }, { "ce_loss": 0.1351834088563919, "epoch": 2.334889926617745, "step": 7000 }, { "distill_loss": 0.11761026084423065, "epoch": 2.334889926617745, "step": 7000 }, { "epoch": 2.334889926617745, "ref_ce_loss": 0.08548547327518463, "step": 7000 }, { "epoch": 2.334889926617745, "loss": 0.35615816712379456, "step": 7000 }, { "ce_loss": 0.06495611369609833, "epoch": 2.334889926617745, "step": 7000 }, { "distill_loss": 0.10093344748020172, "epoch": 2.334889926617745, "step": 7000 }, { "epoch": 2.334889926617745, "ref_ce_loss": 0.12442238628864288, "step": 7000 }, { "epoch": 2.334889926617745, "loss": 1.0504748821258545, "step": 7000 }, { "ce_loss": 0.5850064158439636, "epoch": 2.334889926617745, "step": 7000 }, { "distill_loss": 0.11385861039161682, "epoch": 2.334889926617745, "step": 7000 }, { "epoch": 2.334889926617745, "ref_ce_loss": 0.28474873304367065, "step": 7000 }, { "epoch": 2.334889926617745, "loss": 0.4212246537208557, "step": 7000 }, { "ce_loss": 0.16298897564411163, "epoch": 2.334889926617745, "step": 7000 }, { "distill_loss": 0.11466117203235626, "epoch": 2.334889926617745, "step": 7000 }, { "epoch": 2.334889926617745, "ref_ce_loss": 0.11506110429763794, "step": 7000 }, { "epoch": 2.3382254836557705, "loss": 0.4893, "step": 7010 }, { "epoch": 2.3382254836557705, "grad_norm": 3.7350194454193115, "step": 7010 }, { "epoch": 2.3382254836557705, "learning_rate": 0.0002490642162637606, "step": 7010 }, { "epoch": 2.3382254836557705, "loss": 0.49059048295021057, "step": 7010 }, { "ce_loss": 0.18139110505580902, "epoch": 2.3382254836557705, "step": 7010 }, { "distill_loss": 0.10281254351139069, "epoch": 2.3382254836557705, "step": 7010 }, { "epoch": 2.3382254836557705, "ref_ce_loss": 0.12374398857355118, "step": 7010 }, { "epoch": 2.3382254836557705, "loss": 0.5319090485572815, "step": 7010 }, { "ce_loss": 0.19318652153015137, "epoch": 2.3382254836557705, "step": 7010 }, { "distill_loss": 0.10822658240795135, "epoch": 2.3382254836557705, "step": 7010 }, { "epoch": 2.3382254836557705, "ref_ce_loss": 0.10500287264585495, "step": 7010 }, { "epoch": 2.3382254836557705, "loss": 0.5998363494873047, "step": 7010 }, { "ce_loss": 0.15518829226493835, "epoch": 2.3382254836557705, "step": 7010 }, { "distill_loss": 0.08122570812702179, "epoch": 2.3382254836557705, "step": 7010 }, { "epoch": 2.3382254836557705, "ref_ce_loss": 0.08441562205553055, "step": 7010 }, { "epoch": 2.3382254836557705, "loss": 0.4391767084598541, "step": 7010 }, { "ce_loss": 0.1472361832857132, "epoch": 2.3382254836557705, "step": 7010 }, { "distill_loss": 0.09130523353815079, "epoch": 2.3382254836557705, "step": 7010 }, { "epoch": 2.3382254836557705, "ref_ce_loss": 0.12980183959007263, "step": 7010 }, { "epoch": 2.341561040693796, "loss": 0.55, "step": 7020 }, { "epoch": 2.341561040693796, "grad_norm": 2.3400723934173584, "step": 7020 }, { "epoch": 2.341561040693796, "learning_rate": 0.00024891202478036266, "step": 7020 }, { "epoch": 2.341561040693796, "loss": 0.5382486581802368, "step": 7020 }, { "ce_loss": 0.20204707980155945, "epoch": 2.341561040693796, "step": 7020 }, { "distill_loss": 0.11083859950304031, "epoch": 2.341561040693796, "step": 7020 }, { "epoch": 2.341561040693796, "ref_ce_loss": 0.10628888756036758, "step": 7020 }, { "epoch": 2.341561040693796, "loss": 1.0087568759918213, "step": 7020 }, { "ce_loss": 0.20493319630622864, "epoch": 2.341561040693796, "step": 7020 }, { "distill_loss": 0.11655318737030029, "epoch": 2.341561040693796, "step": 7020 }, { "epoch": 2.341561040693796, "ref_ce_loss": 0.13127465546131134, "step": 7020 }, { "epoch": 2.341561040693796, "loss": 0.3462040424346924, "step": 7020 }, { "ce_loss": 0.11205989122390747, "epoch": 2.341561040693796, "step": 7020 }, { "distill_loss": 0.09971515089273453, "epoch": 2.341561040693796, "step": 7020 }, { "epoch": 2.341561040693796, "ref_ce_loss": 0.095881387591362, "step": 7020 }, { "epoch": 2.341561040693796, "loss": 0.5632637739181519, "step": 7020 }, { "ce_loss": 0.09938246756792068, "epoch": 2.341561040693796, "step": 7020 }, { "distill_loss": 0.11057905852794647, "epoch": 2.341561040693796, "step": 7020 }, { "epoch": 2.341561040693796, "ref_ce_loss": 0.08367249369621277, "step": 7020 }, { "epoch": 2.3448965977318212, "loss": 0.5794, "step": 7030 }, { "epoch": 2.3448965977318212, "grad_norm": 2.2300267219543457, "step": 7030 }, { "epoch": 2.3448965977318212, "learning_rate": 0.0002487596529204491, "step": 7030 }, { "epoch": 2.3448965977318212, "loss": 0.34646064043045044, "step": 7030 }, { "ce_loss": 0.10655689984560013, "epoch": 2.3448965977318212, "step": 7030 }, { "distill_loss": 0.07144635915756226, "epoch": 2.3448965977318212, "step": 7030 }, { "epoch": 2.3448965977318212, "ref_ce_loss": 0.11395888030529022, "step": 7030 }, { "epoch": 2.3448965977318212, "loss": 0.9182996153831482, "step": 7030 }, { "ce_loss": 0.12896713614463806, "epoch": 2.3448965977318212, "step": 7030 }, { "distill_loss": 0.12328529357910156, "epoch": 2.3448965977318212, "step": 7030 }, { "epoch": 2.3448965977318212, "ref_ce_loss": 0.1499665230512619, "step": 7030 }, { "epoch": 2.3448965977318212, "loss": 0.796744167804718, "step": 7030 }, { "ce_loss": 0.11686861515045166, "epoch": 2.3448965977318212, "step": 7030 }, { "distill_loss": 0.11429211497306824, "epoch": 2.3448965977318212, "step": 7030 }, { "epoch": 2.3448965977318212, "ref_ce_loss": 0.12650713324546814, "step": 7030 }, { "epoch": 2.3448965977318212, "loss": 0.5051373243331909, "step": 7030 }, { "ce_loss": 0.19313965737819672, "epoch": 2.3448965977318212, "step": 7030 }, { "distill_loss": 0.08210955560207367, "epoch": 2.3448965977318212, "step": 7030 }, { "epoch": 2.3448965977318212, "ref_ce_loss": 0.12427300959825516, "step": 7030 }, { "epoch": 2.3482321547698466, "loss": 0.5902, "step": 7040 }, { "epoch": 2.3482321547698466, "grad_norm": 2.497122287750244, "step": 7040 }, { "epoch": 2.3482321547698466, "learning_rate": 0.0002486071009618861, "step": 7040 }, { "epoch": 2.3482321547698466, "loss": 0.436869740486145, "step": 7040 }, { "ce_loss": 0.11832589656114578, "epoch": 2.3482321547698466, "step": 7040 }, { "distill_loss": 0.11758086085319519, "epoch": 2.3482321547698466, "step": 7040 }, { "epoch": 2.3482321547698466, "ref_ce_loss": 0.13616949319839478, "step": 7040 }, { "epoch": 2.3482321547698466, "loss": 0.6059623956680298, "step": 7040 }, { "ce_loss": 0.24056315422058105, "epoch": 2.3482321547698466, "step": 7040 }, { "distill_loss": 0.1205277070403099, "epoch": 2.3482321547698466, "step": 7040 }, { "epoch": 2.3482321547698466, "ref_ce_loss": 0.1293945163488388, "step": 7040 }, { "epoch": 2.3482321547698466, "loss": 1.240486979484558, "step": 7040 }, { "ce_loss": 0.18717877566814423, "epoch": 2.3482321547698466, "step": 7040 }, { "distill_loss": 0.13339264690876007, "epoch": 2.3482321547698466, "step": 7040 }, { "epoch": 2.3482321547698466, "ref_ce_loss": 0.10200013965368271, "step": 7040 }, { "epoch": 2.3482321547698466, "loss": 0.49349045753479004, "step": 7040 }, { "ce_loss": 0.23079393804073334, "epoch": 2.3482321547698466, "step": 7040 }, { "distill_loss": 0.14471150934696198, "epoch": 2.3482321547698466, "step": 7040 }, { "epoch": 2.3482321547698466, "ref_ce_loss": 0.11784505099058151, "step": 7040 }, { "epoch": 2.351567711807872, "loss": 0.4947, "step": 7050 }, { "epoch": 2.351567711807872, "grad_norm": 2.4697394371032715, "step": 7050 }, { "epoch": 2.351567711807872, "learning_rate": 0.0002484543691828683, "step": 7050 }, { "epoch": 2.351567711807872, "loss": 0.8933064341545105, "step": 7050 }, { "ce_loss": 0.1610158532857895, "epoch": 2.351567711807872, "step": 7050 }, { "distill_loss": 0.11047893017530441, "epoch": 2.351567711807872, "step": 7050 }, { "epoch": 2.351567711807872, "ref_ce_loss": 0.15877987444400787, "step": 7050 }, { "epoch": 2.351567711807872, "loss": 0.3495582044124603, "step": 7050 }, { "ce_loss": 0.13080823421478271, "epoch": 2.351567711807872, "step": 7050 }, { "distill_loss": 0.10535383224487305, "epoch": 2.351567711807872, "step": 7050 }, { "epoch": 2.351567711807872, "ref_ce_loss": 0.11328636109828949, "step": 7050 }, { "epoch": 2.351567711807872, "loss": 0.6516172885894775, "step": 7050 }, { "ce_loss": 0.19208601117134094, "epoch": 2.351567711807872, "step": 7050 }, { "distill_loss": 0.11426350474357605, "epoch": 2.351567711807872, "step": 7050 }, { "epoch": 2.351567711807872, "ref_ce_loss": 0.11293771117925644, "step": 7050 }, { "epoch": 2.351567711807872, "loss": 0.7531253695487976, "step": 7050 }, { "ce_loss": 0.20007860660552979, "epoch": 2.351567711807872, "step": 7050 }, { "distill_loss": 0.11392208933830261, "epoch": 2.351567711807872, "step": 7050 }, { "epoch": 2.351567711807872, "ref_ce_loss": 0.09171842783689499, "step": 7050 }, { "epoch": 2.3549032688458973, "loss": 0.5329, "step": 7060 }, { "epoch": 2.3549032688458973, "grad_norm": 3.29000186920166, "step": 7060 }, { "epoch": 2.3549032688458973, "learning_rate": 0.0002483014578619181, "step": 7060 }, { "epoch": 2.3549032688458973, "loss": 0.5926487445831299, "step": 7060 }, { "ce_loss": 0.2161916345357895, "epoch": 2.3549032688458973, "step": 7060 }, { "distill_loss": 0.11993826925754547, "epoch": 2.3549032688458973, "step": 7060 }, { "epoch": 2.3549032688458973, "ref_ce_loss": 0.09713123738765717, "step": 7060 }, { "epoch": 2.3549032688458973, "loss": 0.5170358419418335, "step": 7060 }, { "ce_loss": 0.1522398144006729, "epoch": 2.3549032688458973, "step": 7060 }, { "distill_loss": 0.10683247447013855, "epoch": 2.3549032688458973, "step": 7060 }, { "epoch": 2.3549032688458973, "ref_ce_loss": 0.10169428586959839, "step": 7060 }, { "epoch": 2.3549032688458973, "loss": 0.33274805545806885, "step": 7060 }, { "ce_loss": 0.12017644196748734, "epoch": 2.3549032688458973, "step": 7060 }, { "distill_loss": 0.10068129003047943, "epoch": 2.3549032688458973, "step": 7060 }, { "epoch": 2.3549032688458973, "ref_ce_loss": 0.11175446957349777, "step": 7060 }, { "epoch": 2.3549032688458973, "loss": 0.5639784932136536, "step": 7060 }, { "ce_loss": 0.19975608587265015, "epoch": 2.3549032688458973, "step": 7060 }, { "distill_loss": 0.13790909945964813, "epoch": 2.3549032688458973, "step": 7060 }, { "epoch": 2.3549032688458973, "ref_ce_loss": 0.17795276641845703, "step": 7060 }, { "epoch": 2.3582388258839226, "loss": 0.5653, "step": 7070 }, { "epoch": 2.3582388258839226, "grad_norm": 11.505758285522461, "step": 7070 }, { "epoch": 2.3582388258839226, "learning_rate": 0.00024814836727788563, "step": 7070 }, { "epoch": 2.3582388258839226, "loss": 1.0071794986724854, "step": 7070 }, { "ce_loss": 0.23054859042167664, "epoch": 2.3582388258839226, "step": 7070 }, { "distill_loss": 0.217034250497818, "epoch": 2.3582388258839226, "step": 7070 }, { "epoch": 2.3582388258839226, "ref_ce_loss": 0.13613423705101013, "step": 7070 }, { "epoch": 2.3582388258839226, "loss": 0.5255445241928101, "step": 7070 }, { "ce_loss": 0.16354890167713165, "epoch": 2.3582388258839226, "step": 7070 }, { "distill_loss": 0.1455252468585968, "epoch": 2.3582388258839226, "step": 7070 }, { "epoch": 2.3582388258839226, "ref_ce_loss": 0.13496388494968414, "step": 7070 }, { "epoch": 2.3582388258839226, "loss": 0.45980530977249146, "step": 7070 }, { "ce_loss": 0.16896405816078186, "epoch": 2.3582388258839226, "step": 7070 }, { "distill_loss": 0.13719318807125092, "epoch": 2.3582388258839226, "step": 7070 }, { "epoch": 2.3582388258839226, "ref_ce_loss": 0.09889309853315353, "step": 7070 }, { "epoch": 2.3582388258839226, "loss": 0.6259099841117859, "step": 7070 }, { "ce_loss": 0.2755209505558014, "epoch": 2.3582388258839226, "step": 7070 }, { "distill_loss": 0.16722777485847473, "epoch": 2.3582388258839226, "step": 7070 }, { "epoch": 2.3582388258839226, "ref_ce_loss": 0.1830323040485382, "step": 7070 }, { "epoch": 2.361574382921948, "loss": 0.6363, "step": 7080 }, { "epoch": 2.361574382921948, "grad_norm": 2.382572889328003, "step": 7080 }, { "epoch": 2.361574382921948, "learning_rate": 0.0002479950977099476, "step": 7080 }, { "epoch": 2.361574382921948, "loss": 0.547733724117279, "step": 7080 }, { "ce_loss": 0.15202975273132324, "epoch": 2.361574382921948, "step": 7080 }, { "distill_loss": 0.12852467596530914, "epoch": 2.361574382921948, "step": 7080 }, { "epoch": 2.361574382921948, "ref_ce_loss": 0.10482831299304962, "step": 7080 }, { "epoch": 2.361574382921948, "loss": 1.1188392639160156, "step": 7080 }, { "ce_loss": 0.15732690691947937, "epoch": 2.361574382921948, "step": 7080 }, { "distill_loss": 0.13412557542324066, "epoch": 2.361574382921948, "step": 7080 }, { "epoch": 2.361574382921948, "ref_ce_loss": 0.12009337544441223, "step": 7080 }, { "epoch": 2.361574382921948, "loss": 0.551268994808197, "step": 7080 }, { "ce_loss": 0.12435823678970337, "epoch": 2.361574382921948, "step": 7080 }, { "distill_loss": 0.10511200875043869, "epoch": 2.361574382921948, "step": 7080 }, { "epoch": 2.361574382921948, "ref_ce_loss": 0.09536106884479523, "step": 7080 }, { "epoch": 2.361574382921948, "loss": 0.5037704706192017, "step": 7080 }, { "ce_loss": 0.20579804480075836, "epoch": 2.361574382921948, "step": 7080 }, { "distill_loss": 0.15192145109176636, "epoch": 2.361574382921948, "step": 7080 }, { "epoch": 2.361574382921948, "ref_ce_loss": 0.10141074657440186, "step": 7080 }, { "epoch": 2.3649099399599733, "loss": 0.5554, "step": 7090 }, { "epoch": 2.3649099399599733, "grad_norm": 2.5210964679718018, "step": 7090 }, { "epoch": 2.3649099399599733, "learning_rate": 0.0002478416494376072, "step": 7090 }, { "epoch": 2.3649099399599733, "loss": 0.5080283284187317, "step": 7090 }, { "ce_loss": 0.2142353057861328, "epoch": 2.3649099399599733, "step": 7090 }, { "distill_loss": 0.12848493456840515, "epoch": 2.3649099399599733, "step": 7090 }, { "epoch": 2.3649099399599733, "ref_ce_loss": 0.13930000364780426, "step": 7090 }, { "epoch": 2.3649099399599733, "loss": 0.4127797484397888, "step": 7090 }, { "ce_loss": 0.18228977918624878, "epoch": 2.3649099399599733, "step": 7090 }, { "distill_loss": 0.12166300415992737, "epoch": 2.3649099399599733, "step": 7090 }, { "epoch": 2.3649099399599733, "ref_ce_loss": 0.10858716070652008, "step": 7090 }, { "epoch": 2.3649099399599733, "loss": 1.189202070236206, "step": 7090 }, { "ce_loss": 0.25980982184410095, "epoch": 2.3649099399599733, "step": 7090 }, { "distill_loss": 0.17335101962089539, "epoch": 2.3649099399599733, "step": 7090 }, { "epoch": 2.3649099399599733, "ref_ce_loss": 0.17701305449008942, "step": 7090 }, { "epoch": 2.3649099399599733, "loss": 0.4164826273918152, "step": 7090 }, { "ce_loss": 0.15672534704208374, "epoch": 2.3649099399599733, "step": 7090 }, { "distill_loss": 0.11506892740726471, "epoch": 2.3649099399599733, "step": 7090 }, { "epoch": 2.3649099399599733, "ref_ce_loss": 0.09737000614404678, "step": 7090 }, { "epoch": 2.3682454969979987, "loss": 0.5592, "step": 7100 }, { "epoch": 2.3682454969979987, "grad_norm": 3.242518186569214, "step": 7100 }, { "epoch": 2.3682454969979987, "learning_rate": 0.00024768802274069364, "step": 7100 }, { "epoch": 2.3682454969979987, "loss": 0.8438761234283447, "step": 7100 }, { "ce_loss": 0.11892778426408768, "epoch": 2.3682454969979987, "step": 7100 }, { "distill_loss": 0.14269644021987915, "epoch": 2.3682454969979987, "step": 7100 }, { "epoch": 2.3682454969979987, "ref_ce_loss": 0.09960228204727173, "step": 7100 }, { "epoch": 2.3682454969979987, "loss": 0.5341728925704956, "step": 7100 }, { "ce_loss": 0.13597016036510468, "epoch": 2.3682454969979987, "step": 7100 }, { "distill_loss": 0.11470731347799301, "epoch": 2.3682454969979987, "step": 7100 }, { "epoch": 2.3682454969979987, "ref_ce_loss": 0.16889235377311707, "step": 7100 }, { "epoch": 2.3682454969979987, "loss": 0.4424186646938324, "step": 7100 }, { "ce_loss": 0.10149525851011276, "epoch": 2.3682454969979987, "step": 7100 }, { "distill_loss": 0.11648110300302505, "epoch": 2.3682454969979987, "step": 7100 }, { "epoch": 2.3682454969979987, "ref_ce_loss": 0.09637293964624405, "step": 7100 }, { "epoch": 2.3682454969979987, "loss": 1.2790297269821167, "step": 7100 }, { "ce_loss": 0.29890012741088867, "epoch": 2.3682454969979987, "step": 7100 }, { "distill_loss": 0.15040647983551025, "epoch": 2.3682454969979987, "step": 7100 }, { "epoch": 2.3682454969979987, "ref_ce_loss": 0.11567104607820511, "step": 7100 }, { "epoch": 2.371581054036024, "loss": 0.5256, "step": 7110 }, { "epoch": 2.371581054036024, "grad_norm": 2.799344062805176, "step": 7110 }, { "epoch": 2.371581054036024, "learning_rate": 0.0002475342178993614, "step": 7110 }, { "epoch": 2.371581054036024, "loss": 0.5514856576919556, "step": 7110 }, { "ce_loss": 0.2613866627216339, "epoch": 2.371581054036024, "step": 7110 }, { "distill_loss": 0.11943652480840683, "epoch": 2.371581054036024, "step": 7110 }, { "epoch": 2.371581054036024, "ref_ce_loss": 0.10984734445810318, "step": 7110 }, { "epoch": 2.371581054036024, "loss": 0.6448547840118408, "step": 7110 }, { "ce_loss": 0.19715182483196259, "epoch": 2.371581054036024, "step": 7110 }, { "distill_loss": 0.11577840894460678, "epoch": 2.371581054036024, "step": 7110 }, { "epoch": 2.371581054036024, "ref_ce_loss": 0.09529617428779602, "step": 7110 }, { "epoch": 2.371581054036024, "loss": 0.7144370079040527, "step": 7110 }, { "ce_loss": 0.1511741429567337, "epoch": 2.371581054036024, "step": 7110 }, { "distill_loss": 0.11388468742370605, "epoch": 2.371581054036024, "step": 7110 }, { "epoch": 2.371581054036024, "ref_ce_loss": 0.11055812239646912, "step": 7110 }, { "epoch": 2.371581054036024, "loss": 0.3908351957798004, "step": 7110 }, { "ce_loss": 0.1625000536441803, "epoch": 2.371581054036024, "step": 7110 }, { "distill_loss": 0.10220208019018173, "epoch": 2.371581054036024, "step": 7110 }, { "epoch": 2.371581054036024, "ref_ce_loss": 0.08996450901031494, "step": 7110 }, { "epoch": 2.3749166110740494, "loss": 0.5628, "step": 7120 }, { "epoch": 2.3749166110740494, "grad_norm": 3.208664655685425, "step": 7120 }, { "epoch": 2.3749166110740494, "learning_rate": 0.00024738023519408985, "step": 7120 }, { "epoch": 2.3749166110740494, "loss": 0.3598056137561798, "step": 7120 }, { "ce_loss": 0.0844772681593895, "epoch": 2.3749166110740494, "step": 7120 }, { "distill_loss": 0.1388201117515564, "epoch": 2.3749166110740494, "step": 7120 }, { "epoch": 2.3749166110740494, "ref_ce_loss": 0.13615448772907257, "step": 7120 }, { "epoch": 2.3749166110740494, "loss": 0.4420519471168518, "step": 7120 }, { "ce_loss": 0.13381193578243256, "epoch": 2.3749166110740494, "step": 7120 }, { "distill_loss": 0.11907447874546051, "epoch": 2.3749166110740494, "step": 7120 }, { "epoch": 2.3749166110740494, "ref_ce_loss": 0.11265262961387634, "step": 7120 }, { "epoch": 2.3749166110740494, "loss": 0.31493082642555237, "step": 7120 }, { "ce_loss": 0.10242314636707306, "epoch": 2.3749166110740494, "step": 7120 }, { "distill_loss": 0.10008740425109863, "epoch": 2.3749166110740494, "step": 7120 }, { "epoch": 2.3749166110740494, "ref_ce_loss": 0.11232121288776398, "step": 7120 }, { "epoch": 2.3749166110740494, "loss": 0.5138975977897644, "step": 7120 }, { "ce_loss": 0.16858409345149994, "epoch": 2.3749166110740494, "step": 7120 }, { "distill_loss": 0.1288112848997116, "epoch": 2.3749166110740494, "step": 7120 }, { "epoch": 2.3749166110740494, "ref_ce_loss": 0.12751555442810059, "step": 7120 }, { "epoch": 2.3782521681120747, "loss": 0.5237, "step": 7130 }, { "epoch": 2.3782521681120747, "grad_norm": 2.2312326431274414, "step": 7130 }, { "epoch": 2.3782521681120747, "learning_rate": 0.00024722607490568264, "step": 7130 }, { "epoch": 2.3782521681120747, "loss": 0.34697839617729187, "step": 7130 }, { "ce_loss": 0.09735045582056046, "epoch": 2.3782521681120747, "step": 7130 }, { "distill_loss": 0.11878188699483871, "epoch": 2.3782521681120747, "step": 7130 }, { "epoch": 2.3782521681120747, "ref_ce_loss": 0.09604649245738983, "step": 7130 }, { "epoch": 2.3782521681120747, "loss": 0.5361130833625793, "step": 7130 }, { "ce_loss": 0.22890308499336243, "epoch": 2.3782521681120747, "step": 7130 }, { "distill_loss": 0.12308313697576523, "epoch": 2.3782521681120747, "step": 7130 }, { "epoch": 2.3782521681120747, "ref_ce_loss": 0.1419752985239029, "step": 7130 }, { "epoch": 2.3782521681120747, "loss": 0.47083860635757446, "step": 7130 }, { "ce_loss": 0.22402635216712952, "epoch": 2.3782521681120747, "step": 7130 }, { "distill_loss": 0.12542489171028137, "epoch": 2.3782521681120747, "step": 7130 }, { "epoch": 2.3782521681120747, "ref_ce_loss": 0.12129999697208405, "step": 7130 }, { "epoch": 2.3782521681120747, "loss": 0.4552367031574249, "step": 7130 }, { "ce_loss": 0.09671945869922638, "epoch": 2.3782521681120747, "step": 7130 }, { "distill_loss": 0.12107788771390915, "epoch": 2.3782521681120747, "step": 7130 }, { "epoch": 2.3782521681120747, "ref_ce_loss": 0.05718206241726875, "step": 7130 }, { "epoch": 2.3815877251501, "loss": 0.5365, "step": 7140 }, { "epoch": 2.3815877251501, "grad_norm": 2.714935541152954, "step": 7140 }, { "epoch": 2.3815877251501, "learning_rate": 0.00024707173731526735, "step": 7140 }, { "epoch": 2.3815877251501, "loss": 0.5343358516693115, "step": 7140 }, { "ce_loss": 0.16792216897010803, "epoch": 2.3815877251501, "step": 7140 }, { "distill_loss": 0.1140684261918068, "epoch": 2.3815877251501, "step": 7140 }, { "epoch": 2.3815877251501, "ref_ce_loss": 0.11957409977912903, "step": 7140 }, { "epoch": 2.3815877251501, "loss": 0.5848384499549866, "step": 7140 }, { "ce_loss": 0.12534525990486145, "epoch": 2.3815877251501, "step": 7140 }, { "distill_loss": 0.1129332184791565, "epoch": 2.3815877251501, "step": 7140 }, { "epoch": 2.3815877251501, "ref_ce_loss": 0.1090116947889328, "step": 7140 }, { "epoch": 2.3815877251501, "loss": 0.5402489900588989, "step": 7140 }, { "ce_loss": 0.19635345041751862, "epoch": 2.3815877251501, "step": 7140 }, { "distill_loss": 0.12364673614501953, "epoch": 2.3815877251501, "step": 7140 }, { "epoch": 2.3815877251501, "ref_ce_loss": 0.10902047902345657, "step": 7140 }, { "epoch": 2.3815877251501, "loss": 0.46899038553237915, "step": 7140 }, { "ce_loss": 0.13804945349693298, "epoch": 2.3815877251501, "step": 7140 }, { "distill_loss": 0.1111493930220604, "epoch": 2.3815877251501, "step": 7140 }, { "epoch": 2.3815877251501, "ref_ce_loss": 0.14478346705436707, "step": 7140 }, { "epoch": 2.3849232821881254, "loss": 0.5629, "step": 7150 }, { "epoch": 2.3849232821881254, "grad_norm": 6.925165176391602, "step": 7150 }, { "epoch": 2.3849232821881254, "learning_rate": 0.0002469172227042948, "step": 7150 }, { "epoch": 2.3849232821881254, "loss": 0.48434075713157654, "step": 7150 }, { "ce_loss": 0.13119007647037506, "epoch": 2.3849232821881254, "step": 7150 }, { "distill_loss": 0.15414267778396606, "epoch": 2.3849232821881254, "step": 7150 }, { "epoch": 2.3849232821881254, "ref_ce_loss": 0.14164388179779053, "step": 7150 }, { "epoch": 2.3849232821881254, "loss": 0.5125703811645508, "step": 7150 }, { "ce_loss": 0.08586469292640686, "epoch": 2.3849232821881254, "step": 7150 }, { "distill_loss": 0.12737464904785156, "epoch": 2.3849232821881254, "step": 7150 }, { "epoch": 2.3849232821881254, "ref_ce_loss": 0.16618117690086365, "step": 7150 }, { "epoch": 2.3849232821881254, "loss": 0.6230001449584961, "step": 7150 }, { "ce_loss": 0.19951888918876648, "epoch": 2.3849232821881254, "step": 7150 }, { "distill_loss": 0.18361909687519073, "epoch": 2.3849232821881254, "step": 7150 }, { "epoch": 2.3849232821881254, "ref_ce_loss": 0.10332217812538147, "step": 7150 }, { "epoch": 2.3849232821881254, "loss": 1.4790210723876953, "step": 7150 }, { "ce_loss": 0.33115363121032715, "epoch": 2.3849232821881254, "step": 7150 }, { "distill_loss": 0.19230781495571136, "epoch": 2.3849232821881254, "step": 7150 }, { "epoch": 2.3849232821881254, "ref_ce_loss": 0.16792990267276764, "step": 7150 }, { "epoch": 2.388258839226151, "loss": 0.616, "step": 7160 }, { "epoch": 2.388258839226151, "grad_norm": 4.0847601890563965, "step": 7160 }, { "epoch": 2.388258839226151, "learning_rate": 0.0002467625313545389, "step": 7160 }, { "epoch": 2.388258839226151, "loss": 0.611827552318573, "step": 7160 }, { "ce_loss": 0.2245769500732422, "epoch": 2.388258839226151, "step": 7160 }, { "distill_loss": 0.16543962061405182, "epoch": 2.388258839226151, "step": 7160 }, { "epoch": 2.388258839226151, "ref_ce_loss": 0.15356603264808655, "step": 7160 }, { "epoch": 2.388258839226151, "loss": 0.45121482014656067, "step": 7160 }, { "ce_loss": 0.15090312063694, "epoch": 2.388258839226151, "step": 7160 }, { "distill_loss": 0.17984391748905182, "epoch": 2.388258839226151, "step": 7160 }, { "epoch": 2.388258839226151, "ref_ce_loss": 0.09603415429592133, "step": 7160 }, { "epoch": 2.388258839226151, "loss": 0.5079750418663025, "step": 7160 }, { "ce_loss": 0.11780863255262375, "epoch": 2.388258839226151, "step": 7160 }, { "distill_loss": 0.10958898812532425, "epoch": 2.388258839226151, "step": 7160 }, { "epoch": 2.388258839226151, "ref_ce_loss": 0.10029244422912598, "step": 7160 }, { "epoch": 2.388258839226151, "loss": 0.5058857202529907, "step": 7160 }, { "ce_loss": 0.1748339980840683, "epoch": 2.388258839226151, "step": 7160 }, { "distill_loss": 0.11158843338489532, "epoch": 2.388258839226151, "step": 7160 }, { "epoch": 2.388258839226151, "ref_ce_loss": 0.13546571135520935, "step": 7160 }, { "epoch": 2.391594396264176, "loss": 0.5656, "step": 7170 }, { "epoch": 2.391594396264176, "grad_norm": 7.381030559539795, "step": 7170 }, { "epoch": 2.391594396264176, "learning_rate": 0.00024660766354809546, "step": 7170 }, { "epoch": 2.391594396264176, "loss": 0.48350441455841064, "step": 7170 }, { "ce_loss": 0.1267509013414383, "epoch": 2.391594396264176, "step": 7170 }, { "distill_loss": 0.1259116232395172, "epoch": 2.391594396264176, "step": 7170 }, { "epoch": 2.391594396264176, "ref_ce_loss": 0.10241179913282394, "step": 7170 }, { "epoch": 2.391594396264176, "loss": 0.6579767465591431, "step": 7170 }, { "ce_loss": 0.17158272862434387, "epoch": 2.391594396264176, "step": 7170 }, { "distill_loss": 0.14517442882061005, "epoch": 2.391594396264176, "step": 7170 }, { "epoch": 2.391594396264176, "ref_ce_loss": 0.11714108288288116, "step": 7170 }, { "epoch": 2.391594396264176, "loss": 0.47211408615112305, "step": 7170 }, { "ce_loss": 0.21960081160068512, "epoch": 2.391594396264176, "step": 7170 }, { "distill_loss": 0.13582301139831543, "epoch": 2.391594396264176, "step": 7170 }, { "epoch": 2.391594396264176, "ref_ce_loss": 0.11660350859165192, "step": 7170 }, { "epoch": 2.391594396264176, "loss": 0.3569020926952362, "step": 7170 }, { "ce_loss": 0.14226363599300385, "epoch": 2.391594396264176, "step": 7170 }, { "distill_loss": 0.12439102679491043, "epoch": 2.391594396264176, "step": 7170 }, { "epoch": 2.391594396264176, "ref_ce_loss": 0.09003002196550369, "step": 7170 }, { "epoch": 2.3949299533022015, "loss": 0.5258, "step": 7180 }, { "epoch": 2.3949299533022015, "grad_norm": 2.013179063796997, "step": 7180 }, { "epoch": 2.3949299533022015, "learning_rate": 0.00024645261956738224, "step": 7180 }, { "epoch": 2.3949299533022015, "loss": 0.6626637578010559, "step": 7180 }, { "ce_loss": 0.1727692186832428, "epoch": 2.3949299533022015, "step": 7180 }, { "distill_loss": 0.10929200053215027, "epoch": 2.3949299533022015, "step": 7180 }, { "epoch": 2.3949299533022015, "ref_ce_loss": 0.11079227924346924, "step": 7180 }, { "epoch": 2.3949299533022015, "loss": 0.4409700632095337, "step": 7180 }, { "ce_loss": 0.14989428222179413, "epoch": 2.3949299533022015, "step": 7180 }, { "distill_loss": 0.10095194727182388, "epoch": 2.3949299533022015, "step": 7180 }, { "epoch": 2.3949299533022015, "ref_ce_loss": 0.08986108005046844, "step": 7180 }, { "epoch": 2.3949299533022015, "loss": 0.44294336438179016, "step": 7180 }, { "ce_loss": 0.17101377248764038, "epoch": 2.3949299533022015, "step": 7180 }, { "distill_loss": 0.12090830504894257, "epoch": 2.3949299533022015, "step": 7180 }, { "epoch": 2.3949299533022015, "ref_ce_loss": 0.11498505622148514, "step": 7180 }, { "epoch": 2.3949299533022015, "loss": 0.7781896591186523, "step": 7180 }, { "ce_loss": 0.12104091793298721, "epoch": 2.3949299533022015, "step": 7180 }, { "distill_loss": 0.09193304926156998, "epoch": 2.3949299533022015, "step": 7180 }, { "epoch": 2.3949299533022015, "ref_ce_loss": 0.11385586112737656, "step": 7180 }, { "epoch": 2.398265510340227, "loss": 0.6057, "step": 7190 }, { "epoch": 2.398265510340227, "grad_norm": 4.252617835998535, "step": 7190 }, { "epoch": 2.398265510340227, "learning_rate": 0.00024629739969513845, "step": 7190 }, { "epoch": 2.398265510340227, "loss": 0.5514110326766968, "step": 7190 }, { "ce_loss": 0.09623024612665176, "epoch": 2.398265510340227, "step": 7190 }, { "distill_loss": 0.13418076932430267, "epoch": 2.398265510340227, "step": 7190 }, { "epoch": 2.398265510340227, "ref_ce_loss": 0.09687874466180801, "step": 7190 }, { "epoch": 2.398265510340227, "loss": 0.8390406370162964, "step": 7190 }, { "ce_loss": 0.20159050822257996, "epoch": 2.398265510340227, "step": 7190 }, { "distill_loss": 0.13292783498764038, "epoch": 2.398265510340227, "step": 7190 }, { "epoch": 2.398265510340227, "ref_ce_loss": 0.1463102549314499, "step": 7190 }, { "epoch": 2.398265510340227, "loss": 0.7187709808349609, "step": 7190 }, { "ce_loss": 0.1949305534362793, "epoch": 2.398265510340227, "step": 7190 }, { "distill_loss": 0.12101311981678009, "epoch": 2.398265510340227, "step": 7190 }, { "epoch": 2.398265510340227, "ref_ce_loss": 0.12897959351539612, "step": 7190 }, { "epoch": 2.398265510340227, "loss": 0.7193197011947632, "step": 7190 }, { "ce_loss": 0.16670188307762146, "epoch": 2.398265510340227, "step": 7190 }, { "distill_loss": 0.14500631392002106, "epoch": 2.398265510340227, "step": 7190 }, { "epoch": 2.398265510340227, "ref_ce_loss": 0.12325281649827957, "step": 7190 }, { "epoch": 2.401601067378252, "loss": 0.6018, "step": 7200 }, { "epoch": 2.401601067378252, "grad_norm": 3.247999668121338, "step": 7200 }, { "epoch": 2.401601067378252, "learning_rate": 0.00024614200421442387, "step": 7200 }, { "epoch": 2.401601067378252, "loss": 0.5324526429176331, "step": 7200 }, { "ce_loss": 0.11869847774505615, "epoch": 2.401601067378252, "step": 7200 }, { "distill_loss": 0.11210515350103378, "epoch": 2.401601067378252, "step": 7200 }, { "epoch": 2.401601067378252, "ref_ce_loss": 0.12298411130905151, "step": 7200 }, { "epoch": 2.401601067378252, "loss": 0.5949093103408813, "step": 7200 }, { "ce_loss": 0.07167255133390427, "epoch": 2.401601067378252, "step": 7200 }, { "distill_loss": 0.11562314629554749, "epoch": 2.401601067378252, "step": 7200 }, { "epoch": 2.401601067378252, "ref_ce_loss": 0.1257380098104477, "step": 7200 }, { "epoch": 2.401601067378252, "loss": 0.7308820486068726, "step": 7200 }, { "ce_loss": 0.21154363453388214, "epoch": 2.401601067378252, "step": 7200 }, { "distill_loss": 0.11605453491210938, "epoch": 2.401601067378252, "step": 7200 }, { "epoch": 2.401601067378252, "ref_ce_loss": 0.14783884584903717, "step": 7200 }, { "epoch": 2.401601067378252, "loss": 0.7168588638305664, "step": 7200 }, { "ce_loss": 0.2147577702999115, "epoch": 2.401601067378252, "step": 7200 }, { "distill_loss": 0.1553606241941452, "epoch": 2.401601067378252, "step": 7200 }, { "epoch": 2.401601067378252, "ref_ce_loss": 0.17037084698677063, "step": 7200 }, { "epoch": 2.4049366244162775, "loss": 0.5372, "step": 7210 }, { "epoch": 2.4049366244162775, "grad_norm": 2.400362014770508, "step": 7210 }, { "epoch": 2.4049366244162775, "learning_rate": 0.0002459864334086185, "step": 7210 }, { "epoch": 2.4049366244162775, "loss": 0.6147167086601257, "step": 7210 }, { "ce_loss": 0.21412178874015808, "epoch": 2.4049366244162775, "step": 7210 }, { "distill_loss": 0.13666468858718872, "epoch": 2.4049366244162775, "step": 7210 }, { "epoch": 2.4049366244162775, "ref_ce_loss": 0.20343522727489471, "step": 7210 }, { "epoch": 2.4049366244162775, "loss": 0.4731769263744354, "step": 7210 }, { "ce_loss": 0.11004534363746643, "epoch": 2.4049366244162775, "step": 7210 }, { "distill_loss": 0.11246877908706665, "epoch": 2.4049366244162775, "step": 7210 }, { "epoch": 2.4049366244162775, "ref_ce_loss": 0.18797901272773743, "step": 7210 }, { "epoch": 2.4049366244162775, "loss": 0.4884134531021118, "step": 7210 }, { "ce_loss": 0.1199505552649498, "epoch": 2.4049366244162775, "step": 7210 }, { "distill_loss": 0.1352088749408722, "epoch": 2.4049366244162775, "step": 7210 }, { "epoch": 2.4049366244162775, "ref_ce_loss": 0.08366453647613525, "step": 7210 }, { "epoch": 2.4049366244162775, "loss": 0.3768484890460968, "step": 7210 }, { "ce_loss": 0.13738639652729034, "epoch": 2.4049366244162775, "step": 7210 }, { "distill_loss": 0.1332009732723236, "epoch": 2.4049366244162775, "step": 7210 }, { "epoch": 2.4049366244162775, "ref_ce_loss": 0.07930707186460495, "step": 7210 }, { "epoch": 2.408272181454303, "loss": 0.5629, "step": 7220 }, { "epoch": 2.408272181454303, "grad_norm": 2.687861204147339, "step": 7220 }, { "epoch": 2.408272181454303, "learning_rate": 0.0002458306875614221, "step": 7220 }, { "epoch": 2.408272181454303, "loss": 0.5342181921005249, "step": 7220 }, { "ce_loss": 0.19010506570339203, "epoch": 2.408272181454303, "step": 7220 }, { "distill_loss": 0.143671452999115, "epoch": 2.408272181454303, "step": 7220 }, { "epoch": 2.408272181454303, "ref_ce_loss": 0.20026570558547974, "step": 7220 }, { "epoch": 2.408272181454303, "loss": 0.4170488715171814, "step": 7220 }, { "ce_loss": 0.1819777935743332, "epoch": 2.408272181454303, "step": 7220 }, { "distill_loss": 0.1289082020521164, "epoch": 2.408272181454303, "step": 7220 }, { "epoch": 2.408272181454303, "ref_ce_loss": 0.10599298775196075, "step": 7220 }, { "epoch": 2.408272181454303, "loss": 0.9120379686355591, "step": 7220 }, { "ce_loss": 0.18983766436576843, "epoch": 2.408272181454303, "step": 7220 }, { "distill_loss": 0.17872394621372223, "epoch": 2.408272181454303, "step": 7220 }, { "epoch": 2.408272181454303, "ref_ce_loss": 0.11700882762670517, "step": 7220 }, { "epoch": 2.408272181454303, "loss": 0.4734853506088257, "step": 7220 }, { "ce_loss": 0.13527773320674896, "epoch": 2.408272181454303, "step": 7220 }, { "distill_loss": 0.14905977249145508, "epoch": 2.408272181454303, "step": 7220 }, { "epoch": 2.408272181454303, "ref_ce_loss": 0.11568107455968857, "step": 7220 }, { "epoch": 2.4116077384923282, "loss": 0.5457, "step": 7230 }, { "epoch": 2.4116077384923282, "grad_norm": 2.3142776489257812, "step": 7230 }, { "epoch": 2.4116077384923282, "learning_rate": 0.0002456747669568538, "step": 7230 }, { "epoch": 2.4116077384923282, "loss": 0.7524964213371277, "step": 7230 }, { "ce_loss": 0.30550333857536316, "epoch": 2.4116077384923282, "step": 7230 }, { "distill_loss": 0.15317846834659576, "epoch": 2.4116077384923282, "step": 7230 }, { "epoch": 2.4116077384923282, "ref_ce_loss": 0.1951562613248825, "step": 7230 }, { "epoch": 2.4116077384923282, "loss": 0.5358716249465942, "step": 7230 }, { "ce_loss": 0.14135171473026276, "epoch": 2.4116077384923282, "step": 7230 }, { "distill_loss": 0.13840961456298828, "epoch": 2.4116077384923282, "step": 7230 }, { "epoch": 2.4116077384923282, "ref_ce_loss": 0.14255258440971375, "step": 7230 }, { "epoch": 2.4116077384923282, "loss": 0.994782567024231, "step": 7230 }, { "ce_loss": 0.2172781229019165, "epoch": 2.4116077384923282, "step": 7230 }, { "distill_loss": 0.15493077039718628, "epoch": 2.4116077384923282, "step": 7230 }, { "epoch": 2.4116077384923282, "ref_ce_loss": 0.124776691198349, "step": 7230 }, { "epoch": 2.4116077384923282, "loss": 0.3869795799255371, "step": 7230 }, { "ce_loss": 0.1280602514743805, "epoch": 2.4116077384923282, "step": 7230 }, { "distill_loss": 0.13424427807331085, "epoch": 2.4116077384923282, "step": 7230 }, { "epoch": 2.4116077384923282, "ref_ce_loss": 0.09938256442546844, "step": 7230 }, { "epoch": 2.4149432955303536, "loss": 0.6535, "step": 7240 }, { "epoch": 2.4149432955303536, "grad_norm": 3.7801764011383057, "step": 7240 }, { "epoch": 2.4149432955303536, "learning_rate": 0.00024551867187925114, "step": 7240 }, { "epoch": 2.4149432955303536, "loss": 1.0940275192260742, "step": 7240 }, { "ce_loss": 0.30816763639450073, "epoch": 2.4149432955303536, "step": 7240 }, { "distill_loss": 0.16756215691566467, "epoch": 2.4149432955303536, "step": 7240 }, { "epoch": 2.4149432955303536, "ref_ce_loss": 0.1721998006105423, "step": 7240 }, { "epoch": 2.4149432955303536, "loss": 0.5963624119758606, "step": 7240 }, { "ce_loss": 0.18392635881900787, "epoch": 2.4149432955303536, "step": 7240 }, { "distill_loss": 0.14534255862236023, "epoch": 2.4149432955303536, "step": 7240 }, { "epoch": 2.4149432955303536, "ref_ce_loss": 0.13652925193309784, "step": 7240 }, { "epoch": 2.4149432955303536, "loss": 0.8176337480545044, "step": 7240 }, { "ce_loss": 0.14677022397518158, "epoch": 2.4149432955303536, "step": 7240 }, { "distill_loss": 0.15364260971546173, "epoch": 2.4149432955303536, "step": 7240 }, { "epoch": 2.4149432955303536, "ref_ce_loss": 0.14605900645256042, "step": 7240 }, { "epoch": 2.4149432955303536, "loss": 0.5201737880706787, "step": 7240 }, { "ce_loss": 0.14001545310020447, "epoch": 2.4149432955303536, "step": 7240 }, { "distill_loss": 0.21256887912750244, "epoch": 2.4149432955303536, "step": 7240 }, { "epoch": 2.4149432955303536, "ref_ce_loss": 0.11998400092124939, "step": 7240 }, { "epoch": 2.418278852568379, "loss": 0.6528, "step": 7250 }, { "epoch": 2.418278852568379, "grad_norm": 3.017970085144043, "step": 7250 }, { "epoch": 2.418278852568379, "learning_rate": 0.00024536240261327003, "step": 7250 }, { "epoch": 2.418278852568379, "loss": 0.7078180313110352, "step": 7250 }, { "ce_loss": 0.2540886402130127, "epoch": 2.418278852568379, "step": 7250 }, { "distill_loss": 0.12883436679840088, "epoch": 2.418278852568379, "step": 7250 }, { "epoch": 2.418278852568379, "ref_ce_loss": 0.1449848860502243, "step": 7250 }, { "epoch": 2.418278852568379, "loss": 0.5828919410705566, "step": 7250 }, { "ce_loss": 0.16208089888095856, "epoch": 2.418278852568379, "step": 7250 }, { "distill_loss": 0.1052861288189888, "epoch": 2.418278852568379, "step": 7250 }, { "epoch": 2.418278852568379, "ref_ce_loss": 0.10545025765895844, "step": 7250 }, { "epoch": 2.418278852568379, "loss": 0.5507915019989014, "step": 7250 }, { "ce_loss": 0.1334887444972992, "epoch": 2.418278852568379, "step": 7250 }, { "distill_loss": 0.1257612407207489, "epoch": 2.418278852568379, "step": 7250 }, { "epoch": 2.418278852568379, "ref_ce_loss": 0.1570345163345337, "step": 7250 }, { "epoch": 2.418278852568379, "loss": 0.6326401233673096, "step": 7250 }, { "ce_loss": 0.2364356815814972, "epoch": 2.418278852568379, "step": 7250 }, { "distill_loss": 0.13967809081077576, "epoch": 2.418278852568379, "step": 7250 }, { "epoch": 2.418278852568379, "ref_ce_loss": 0.12372446805238724, "step": 7250 }, { "epoch": 2.4216144096064043, "loss": 0.6028, "step": 7260 }, { "epoch": 2.4216144096064043, "grad_norm": 4.875604629516602, "step": 7260 }, { "epoch": 2.4216144096064043, "learning_rate": 0.0002452059594438839, "step": 7260 }, { "epoch": 2.4216144096064043, "loss": 0.4415472745895386, "step": 7260 }, { "ce_loss": 0.18437181413173676, "epoch": 2.4216144096064043, "step": 7260 }, { "distill_loss": 0.13511112332344055, "epoch": 2.4216144096064043, "step": 7260 }, { "epoch": 2.4216144096064043, "ref_ce_loss": 0.08003831654787064, "step": 7260 }, { "epoch": 2.4216144096064043, "loss": 0.37648123502731323, "step": 7260 }, { "ce_loss": 0.09765855222940445, "epoch": 2.4216144096064043, "step": 7260 }, { "distill_loss": 0.10940150916576385, "epoch": 2.4216144096064043, "step": 7260 }, { "epoch": 2.4216144096064043, "ref_ce_loss": 0.08033854514360428, "step": 7260 }, { "epoch": 2.4216144096064043, "loss": 0.6172460913658142, "step": 7260 }, { "ce_loss": 0.16564592719078064, "epoch": 2.4216144096064043, "step": 7260 }, { "distill_loss": 0.1328679919242859, "epoch": 2.4216144096064043, "step": 7260 }, { "epoch": 2.4216144096064043, "ref_ce_loss": 0.1384454071521759, "step": 7260 }, { "epoch": 2.4216144096064043, "loss": 0.43410724401474, "step": 7260 }, { "ce_loss": 0.14599348604679108, "epoch": 2.4216144096064043, "step": 7260 }, { "distill_loss": 0.1227763220667839, "epoch": 2.4216144096064043, "step": 7260 }, { "epoch": 2.4216144096064043, "ref_ce_loss": 0.12593898177146912, "step": 7260 }, { "epoch": 2.4249499666444296, "loss": 0.5562, "step": 7270 }, { "epoch": 2.4249499666444296, "grad_norm": 4.715060710906982, "step": 7270 }, { "epoch": 2.4249499666444296, "learning_rate": 0.00024504934265638347, "step": 7270 }, { "epoch": 2.4249499666444296, "loss": 0.49528905749320984, "step": 7270 }, { "ce_loss": 0.211774080991745, "epoch": 2.4249499666444296, "step": 7270 }, { "distill_loss": 0.1500503122806549, "epoch": 2.4249499666444296, "step": 7270 }, { "epoch": 2.4249499666444296, "ref_ce_loss": 0.13327829539775848, "step": 7270 }, { "epoch": 2.4249499666444296, "loss": 0.7034370303153992, "step": 7270 }, { "ce_loss": 0.21872128546237946, "epoch": 2.4249499666444296, "step": 7270 }, { "distill_loss": 0.13050577044487, "epoch": 2.4249499666444296, "step": 7270 }, { "epoch": 2.4249499666444296, "ref_ce_loss": 0.1425921618938446, "step": 7270 }, { "epoch": 2.4249499666444296, "loss": 0.4694666266441345, "step": 7270 }, { "ce_loss": 0.13093678653240204, "epoch": 2.4249499666444296, "step": 7270 }, { "distill_loss": 0.11353382468223572, "epoch": 2.4249499666444296, "step": 7270 }, { "epoch": 2.4249499666444296, "ref_ce_loss": 0.11853688210248947, "step": 7270 }, { "epoch": 2.4249499666444296, "loss": 0.6298027038574219, "step": 7270 }, { "ce_loss": 0.15209899842739105, "epoch": 2.4249499666444296, "step": 7270 }, { "distill_loss": 0.11965539306402206, "epoch": 2.4249499666444296, "step": 7270 }, { "epoch": 2.4249499666444296, "ref_ce_loss": 0.10618112981319427, "step": 7270 }, { "epoch": 2.428285523682455, "loss": 0.5977, "step": 7280 }, { "epoch": 2.428285523682455, "grad_norm": 2.8958613872528076, "step": 7280 }, { "epoch": 2.428285523682455, "learning_rate": 0.000244892552536376, "step": 7280 }, { "epoch": 2.428285523682455, "loss": 0.879882276058197, "step": 7280 }, { "ce_loss": 0.21199427545070648, "epoch": 2.428285523682455, "step": 7280 }, { "distill_loss": 0.12943725287914276, "epoch": 2.428285523682455, "step": 7280 }, { "epoch": 2.428285523682455, "ref_ce_loss": 0.12772376835346222, "step": 7280 }, { "epoch": 2.428285523682455, "loss": 0.5259045362472534, "step": 7280 }, { "ce_loss": 0.13685457408428192, "epoch": 2.428285523682455, "step": 7280 }, { "distill_loss": 0.11776725947856903, "epoch": 2.428285523682455, "step": 7280 }, { "epoch": 2.428285523682455, "ref_ce_loss": 0.11824677139520645, "step": 7280 }, { "epoch": 2.428285523682455, "loss": 1.1360373497009277, "step": 7280 }, { "ce_loss": 0.20049446821212769, "epoch": 2.428285523682455, "step": 7280 }, { "distill_loss": 0.1447339951992035, "epoch": 2.428285523682455, "step": 7280 }, { "epoch": 2.428285523682455, "ref_ce_loss": 0.15620869398117065, "step": 7280 }, { "epoch": 2.428285523682455, "loss": 0.2793632745742798, "step": 7280 }, { "ce_loss": 0.09599223732948303, "epoch": 2.428285523682455, "step": 7280 }, { "distill_loss": 0.09701315313577652, "epoch": 2.428285523682455, "step": 7280 }, { "epoch": 2.428285523682455, "ref_ce_loss": 0.0862441137433052, "step": 7280 }, { "epoch": 2.4316210807204803, "loss": 0.5863, "step": 7290 }, { "epoch": 2.4316210807204803, "grad_norm": 2.2033097743988037, "step": 7290 }, { "epoch": 2.4316210807204803, "learning_rate": 0.0002447355893697847, "step": 7290 }, { "epoch": 2.4316210807204803, "loss": 0.41150063276290894, "step": 7290 }, { "ce_loss": 0.10379496961832047, "epoch": 2.4316210807204803, "step": 7290 }, { "distill_loss": 0.08479201048612595, "epoch": 2.4316210807204803, "step": 7290 }, { "epoch": 2.4316210807204803, "ref_ce_loss": 0.15390710532665253, "step": 7290 }, { "epoch": 2.4316210807204803, "loss": 0.44651439785957336, "step": 7290 }, { "ce_loss": 0.1582019180059433, "epoch": 2.4316210807204803, "step": 7290 }, { "distill_loss": 0.08264210820198059, "epoch": 2.4316210807204803, "step": 7290 }, { "epoch": 2.4316210807204803, "ref_ce_loss": 0.09824780374765396, "step": 7290 }, { "epoch": 2.4316210807204803, "loss": 0.736008882522583, "step": 7290 }, { "ce_loss": 0.1359715312719345, "epoch": 2.4316210807204803, "step": 7290 }, { "distill_loss": 0.11413221061229706, "epoch": 2.4316210807204803, "step": 7290 }, { "epoch": 2.4316210807204803, "ref_ce_loss": 0.14620079100131989, "step": 7290 }, { "epoch": 2.4316210807204803, "loss": 0.6262930631637573, "step": 7290 }, { "ce_loss": 0.2595474123954773, "epoch": 2.4316210807204803, "step": 7290 }, { "distill_loss": 0.12630505859851837, "epoch": 2.4316210807204803, "step": 7290 }, { "epoch": 2.4316210807204803, "ref_ce_loss": 0.16754832863807678, "step": 7290 }, { "epoch": 2.4349566377585057, "loss": 0.5062, "step": 7300 }, { "epoch": 2.4349566377585057, "grad_norm": 4.805685997009277, "step": 7300 }, { "epoch": 2.4349566377585057, "learning_rate": 0.00024457845344284855, "step": 7300 }, { "epoch": 2.4349566377585057, "loss": 0.8836522698402405, "step": 7300 }, { "ce_loss": 0.24709133803844452, "epoch": 2.4349566377585057, "step": 7300 }, { "distill_loss": 0.15019738674163818, "epoch": 2.4349566377585057, "step": 7300 }, { "epoch": 2.4349566377585057, "ref_ce_loss": 0.15257512032985687, "step": 7300 }, { "epoch": 2.4349566377585057, "loss": 0.40954825282096863, "step": 7300 }, { "ce_loss": 0.12927237153053284, "epoch": 2.4349566377585057, "step": 7300 }, { "distill_loss": 0.10678776353597641, "epoch": 2.4349566377585057, "step": 7300 }, { "epoch": 2.4349566377585057, "ref_ce_loss": 0.14493626356124878, "step": 7300 }, { "epoch": 2.4349566377585057, "loss": 1.000192642211914, "step": 7300 }, { "ce_loss": 0.23533885180950165, "epoch": 2.4349566377585057, "step": 7300 }, { "distill_loss": 0.16133449971675873, "epoch": 2.4349566377585057, "step": 7300 }, { "epoch": 2.4349566377585057, "ref_ce_loss": 0.1425640732049942, "step": 7300 }, { "epoch": 2.4349566377585057, "loss": 0.519072949886322, "step": 7300 }, { "ce_loss": 0.07202491909265518, "epoch": 2.4349566377585057, "step": 7300 }, { "distill_loss": 0.1196460872888565, "epoch": 2.4349566377585057, "step": 7300 }, { "epoch": 2.4349566377585057, "ref_ce_loss": 0.09123296290636063, "step": 7300 }, { "epoch": 2.438292194796531, "loss": 0.5642, "step": 7310 }, { "epoch": 2.438292194796531, "grad_norm": 2.3900463581085205, "step": 7310 }, { "epoch": 2.438292194796531, "learning_rate": 0.0002444211450421214, "step": 7310 }, { "epoch": 2.438292194796531, "loss": 0.5913569927215576, "step": 7310 }, { "ce_loss": 0.236644446849823, "epoch": 2.438292194796531, "step": 7310 }, { "distill_loss": 0.14004617929458618, "epoch": 2.438292194796531, "step": 7310 }, { "epoch": 2.438292194796531, "ref_ce_loss": 0.11488953977823257, "step": 7310 }, { "epoch": 2.438292194796531, "loss": 0.7740991115570068, "step": 7310 }, { "ce_loss": 0.2927137315273285, "epoch": 2.438292194796531, "step": 7310 }, { "distill_loss": 0.13568630814552307, "epoch": 2.438292194796531, "step": 7310 }, { "epoch": 2.438292194796531, "ref_ce_loss": 0.17373815178871155, "step": 7310 }, { "epoch": 2.438292194796531, "loss": 0.47589385509490967, "step": 7310 }, { "ce_loss": 0.09717685729265213, "epoch": 2.438292194796531, "step": 7310 }, { "distill_loss": 0.13696280121803284, "epoch": 2.438292194796531, "step": 7310 }, { "epoch": 2.438292194796531, "ref_ce_loss": 0.09850306063890457, "step": 7310 }, { "epoch": 2.438292194796531, "loss": 0.6440833806991577, "step": 7310 }, { "ce_loss": 0.1265231966972351, "epoch": 2.438292194796531, "step": 7310 }, { "distill_loss": 0.13772481679916382, "epoch": 2.438292194796531, "step": 7310 }, { "epoch": 2.438292194796531, "ref_ce_loss": 0.08583074063062668, "step": 7310 }, { "epoch": 2.4416277518345564, "loss": 0.5549, "step": 7320 }, { "epoch": 2.4416277518345564, "grad_norm": 3.4240992069244385, "step": 7320 }, { "epoch": 2.4416277518345564, "learning_rate": 0.00024426366445447185, "step": 7320 }, { "epoch": 2.4416277518345564, "loss": 0.738955557346344, "step": 7320 }, { "ce_loss": 0.23381483554840088, "epoch": 2.4416277518345564, "step": 7320 }, { "distill_loss": 0.17107410728931427, "epoch": 2.4416277518345564, "step": 7320 }, { "epoch": 2.4416277518345564, "ref_ce_loss": 0.15074104070663452, "step": 7320 }, { "epoch": 2.4416277518345564, "loss": 0.6326438188552856, "step": 7320 }, { "ce_loss": 0.225467249751091, "epoch": 2.4416277518345564, "step": 7320 }, { "distill_loss": 0.15438255667686462, "epoch": 2.4416277518345564, "step": 7320 }, { "epoch": 2.4416277518345564, "ref_ce_loss": 0.11623335629701614, "step": 7320 }, { "epoch": 2.4416277518345564, "loss": 0.40885937213897705, "step": 7320 }, { "ce_loss": 0.12698811292648315, "epoch": 2.4416277518345564, "step": 7320 }, { "distill_loss": 0.130739226937294, "epoch": 2.4416277518345564, "step": 7320 }, { "epoch": 2.4416277518345564, "ref_ce_loss": 0.08082147687673569, "step": 7320 }, { "epoch": 2.4416277518345564, "loss": 0.30342355370521545, "step": 7320 }, { "ce_loss": 0.10416149348020554, "epoch": 2.4416277518345564, "step": 7320 }, { "distill_loss": 0.10678577423095703, "epoch": 2.4416277518345564, "step": 7320 }, { "epoch": 2.4416277518345564, "ref_ce_loss": 0.09228299558162689, "step": 7320 }, { "epoch": 2.4449633088725817, "loss": 0.5681, "step": 7330 }, { "epoch": 2.4449633088725817, "grad_norm": 1.6141737699508667, "step": 7330 }, { "epoch": 2.4449633088725817, "learning_rate": 0.00024410601196708236, "step": 7330 }, { "epoch": 2.4449633088725817, "loss": 0.8059208989143372, "step": 7330 }, { "ce_loss": 0.22274239361286163, "epoch": 2.4449633088725817, "step": 7330 }, { "distill_loss": 0.14510369300842285, "epoch": 2.4449633088725817, "step": 7330 }, { "epoch": 2.4449633088725817, "ref_ce_loss": 0.21629805862903595, "step": 7330 }, { "epoch": 2.4449633088725817, "loss": 0.5609763860702515, "step": 7330 }, { "ce_loss": 0.12866266071796417, "epoch": 2.4449633088725817, "step": 7330 }, { "distill_loss": 0.09237472712993622, "epoch": 2.4449633088725817, "step": 7330 }, { "epoch": 2.4449633088725817, "ref_ce_loss": 0.07987705618143082, "step": 7330 }, { "epoch": 2.4449633088725817, "loss": 0.5934087634086609, "step": 7330 }, { "ce_loss": 0.23136092722415924, "epoch": 2.4449633088725817, "step": 7330 }, { "distill_loss": 0.12402576208114624, "epoch": 2.4449633088725817, "step": 7330 }, { "epoch": 2.4449633088725817, "ref_ce_loss": 0.1395598202943802, "step": 7330 }, { "epoch": 2.4449633088725817, "loss": 0.40078234672546387, "step": 7330 }, { "ce_loss": 0.1275785267353058, "epoch": 2.4449633088725817, "step": 7330 }, { "distill_loss": 0.12972263991832733, "epoch": 2.4449633088725817, "step": 7330 }, { "epoch": 2.4449633088725817, "ref_ce_loss": 0.11326507478952408, "step": 7330 }, { "epoch": 2.448298865910607, "loss": 0.5538, "step": 7340 }, { "epoch": 2.448298865910607, "grad_norm": 3.5968334674835205, "step": 7340 }, { "epoch": 2.448298865910607, "learning_rate": 0.0002439481878674488, "step": 7340 }, { "epoch": 2.448298865910607, "loss": 0.6532106995582581, "step": 7340 }, { "ce_loss": 0.21292003989219666, "epoch": 2.448298865910607, "step": 7340 }, { "distill_loss": 0.1319594830274582, "epoch": 2.448298865910607, "step": 7340 }, { "epoch": 2.448298865910607, "ref_ce_loss": 0.10543256253004074, "step": 7340 }, { "epoch": 2.448298865910607, "loss": 0.4424779713153839, "step": 7340 }, { "ce_loss": 0.14241373538970947, "epoch": 2.448298865910607, "step": 7340 }, { "distill_loss": 0.1377052217721939, "epoch": 2.448298865910607, "step": 7340 }, { "epoch": 2.448298865910607, "ref_ce_loss": 0.09160786122083664, "step": 7340 }, { "epoch": 2.448298865910607, "loss": 0.4995424449443817, "step": 7340 }, { "ce_loss": 0.11426243185997009, "epoch": 2.448298865910607, "step": 7340 }, { "distill_loss": 0.11159699410200119, "epoch": 2.448298865910607, "step": 7340 }, { "epoch": 2.448298865910607, "ref_ce_loss": 0.14085379242897034, "step": 7340 }, { "epoch": 2.448298865910607, "loss": 0.7496430277824402, "step": 7340 }, { "ce_loss": 0.15991029143333435, "epoch": 2.448298865910607, "step": 7340 }, { "distill_loss": 0.12658250331878662, "epoch": 2.448298865910607, "step": 7340 }, { "epoch": 2.448298865910607, "ref_ce_loss": 0.11923353374004364, "step": 7340 }, { "epoch": 2.4516344229486324, "loss": 0.5659, "step": 7350 }, { "epoch": 2.4516344229486324, "grad_norm": 2.7785444259643555, "step": 7350 }, { "epoch": 2.4516344229486324, "learning_rate": 0.00024379019244338007, "step": 7350 }, { "epoch": 2.4516344229486324, "loss": 0.6230776906013489, "step": 7350 }, { "ce_loss": 0.20530030131340027, "epoch": 2.4516344229486324, "step": 7350 }, { "distill_loss": 0.1313774734735489, "epoch": 2.4516344229486324, "step": 7350 }, { "epoch": 2.4516344229486324, "ref_ce_loss": 0.15010467171669006, "step": 7350 }, { "epoch": 2.4516344229486324, "loss": 0.5327169299125671, "step": 7350 }, { "ce_loss": 0.13175390660762787, "epoch": 2.4516344229486324, "step": 7350 }, { "distill_loss": 0.11843869090080261, "epoch": 2.4516344229486324, "step": 7350 }, { "epoch": 2.4516344229486324, "ref_ce_loss": 0.11874283850193024, "step": 7350 }, { "epoch": 2.4516344229486324, "loss": 0.5524466037750244, "step": 7350 }, { "ce_loss": 0.17544156312942505, "epoch": 2.4516344229486324, "step": 7350 }, { "distill_loss": 0.10477293282747269, "epoch": 2.4516344229486324, "step": 7350 }, { "epoch": 2.4516344229486324, "ref_ce_loss": 0.126258984208107, "step": 7350 }, { "epoch": 2.4516344229486324, "loss": 0.23935043811798096, "step": 7350 }, { "ce_loss": 0.0685420110821724, "epoch": 2.4516344229486324, "step": 7350 }, { "distill_loss": 0.10156445950269699, "epoch": 2.4516344229486324, "step": 7350 }, { "epoch": 2.4516344229486324, "ref_ce_loss": 0.06903655081987381, "step": 7350 }, { "epoch": 2.454969979986658, "loss": 0.588, "step": 7360 }, { "epoch": 2.454969979986658, "grad_norm": 4.6122355461120605, "step": 7360 }, { "epoch": 2.454969979986658, "learning_rate": 0.00024363202598299755, "step": 7360 }, { "epoch": 2.454969979986658, "loss": 0.460933655500412, "step": 7360 }, { "ce_loss": 0.1626943051815033, "epoch": 2.454969979986658, "step": 7360 }, { "distill_loss": 0.1440202295780182, "epoch": 2.454969979986658, "step": 7360 }, { "epoch": 2.454969979986658, "ref_ce_loss": 0.1275286227464676, "step": 7360 }, { "epoch": 2.454969979986658, "loss": 0.45748433470726013, "step": 7360 }, { "ce_loss": 0.14533036947250366, "epoch": 2.454969979986658, "step": 7360 }, { "distill_loss": 0.1288251280784607, "epoch": 2.454969979986658, "step": 7360 }, { "epoch": 2.454969979986658, "ref_ce_loss": 0.13756705820560455, "step": 7360 }, { "epoch": 2.454969979986658, "loss": 0.7040202021598816, "step": 7360 }, { "ce_loss": 0.20019203424453735, "epoch": 2.454969979986658, "step": 7360 }, { "distill_loss": 0.1584950089454651, "epoch": 2.454969979986658, "step": 7360 }, { "epoch": 2.454969979986658, "ref_ce_loss": 0.11588919907808304, "step": 7360 }, { "epoch": 2.454969979986658, "loss": 0.7524511218070984, "step": 7360 }, { "ce_loss": 0.1585523635149002, "epoch": 2.454969979986658, "step": 7360 }, { "distill_loss": 0.17898641526699066, "epoch": 2.454969979986658, "step": 7360 }, { "epoch": 2.454969979986658, "ref_ce_loss": 0.09428147971630096, "step": 7360 }, { "epoch": 2.458305537024683, "loss": 0.5598, "step": 7370 }, { "epoch": 2.458305537024683, "grad_norm": 8.720335960388184, "step": 7370 }, { "epoch": 2.458305537024683, "learning_rate": 0.00024347368877473448, "step": 7370 }, { "epoch": 2.458305537024683, "loss": 0.4737747311592102, "step": 7370 }, { "ce_loss": 0.12464018166065216, "epoch": 2.458305537024683, "step": 7370 }, { "distill_loss": 0.24916736781597137, "epoch": 2.458305537024683, "step": 7370 }, { "epoch": 2.458305537024683, "ref_ce_loss": 0.07362619787454605, "step": 7370 }, { "epoch": 2.458305537024683, "loss": 0.5169686079025269, "step": 7370 }, { "ce_loss": 0.13986755907535553, "epoch": 2.458305537024683, "step": 7370 }, { "distill_loss": 0.18237434327602386, "epoch": 2.458305537024683, "step": 7370 }, { "epoch": 2.458305537024683, "ref_ce_loss": 0.08128245919942856, "step": 7370 }, { "epoch": 2.458305537024683, "loss": 0.71217280626297, "step": 7370 }, { "ce_loss": 0.06464303284883499, "epoch": 2.458305537024683, "step": 7370 }, { "distill_loss": 0.10259261727333069, "epoch": 2.458305537024683, "step": 7370 }, { "epoch": 2.458305537024683, "ref_ce_loss": 0.08970564603805542, "step": 7370 }, { "epoch": 2.458305537024683, "loss": 1.0477197170257568, "step": 7370 }, { "ce_loss": 0.1423245221376419, "epoch": 2.458305537024683, "step": 7370 }, { "distill_loss": 0.17700180411338806, "epoch": 2.458305537024683, "step": 7370 }, { "epoch": 2.458305537024683, "ref_ce_loss": 0.1467050313949585, "step": 7370 }, { "epoch": 2.4616410940627085, "loss": 0.9109, "step": 7380 }, { "epoch": 2.4616410940627085, "grad_norm": 8.063809394836426, "step": 7380 }, { "epoch": 2.4616410940627085, "learning_rate": 0.00024331518110733545, "step": 7380 }, { "epoch": 2.4616410940627085, "loss": 0.5266667008399963, "step": 7380 }, { "ce_loss": 0.0833396315574646, "epoch": 2.4616410940627085, "step": 7380 }, { "distill_loss": 0.29295581579208374, "epoch": 2.4616410940627085, "step": 7380 }, { "epoch": 2.4616410940627085, "ref_ce_loss": 0.1029653325676918, "step": 7380 }, { "epoch": 2.4616410940627085, "loss": 1.089504361152649, "step": 7380 }, { "ce_loss": 0.1683725267648697, "epoch": 2.4616410940627085, "step": 7380 }, { "distill_loss": 0.7212741374969482, "epoch": 2.4616410940627085, "step": 7380 }, { "epoch": 2.4616410940627085, "ref_ce_loss": 0.15166783332824707, "step": 7380 }, { "epoch": 2.4616410940627085, "loss": 1.0741162300109863, "step": 7380 }, { "ce_loss": 0.262071818113327, "epoch": 2.4616410940627085, "step": 7380 }, { "distill_loss": 0.4402065873146057, "epoch": 2.4616410940627085, "step": 7380 }, { "epoch": 2.4616410940627085, "ref_ce_loss": 0.06969785690307617, "step": 7380 }, { "epoch": 2.4616410940627085, "loss": 0.5918949246406555, "step": 7380 }, { "ce_loss": 0.07457996159791946, "epoch": 2.4616410940627085, "step": 7380 }, { "distill_loss": 0.38692402839660645, "epoch": 2.4616410940627085, "step": 7380 }, { "epoch": 2.4616410940627085, "ref_ce_loss": 0.08364200592041016, "step": 7380 }, { "epoch": 2.464976651100734, "loss": 0.6996, "step": 7390 }, { "epoch": 2.464976651100734, "grad_norm": 2.4126689434051514, "step": 7390 }, { "epoch": 2.464976651100734, "learning_rate": 0.00024315650326985595, "step": 7390 }, { "epoch": 2.464976651100734, "loss": 1.2089097499847412, "step": 7390 }, { "ce_loss": 0.18091559410095215, "epoch": 2.464976651100734, "step": 7390 }, { "distill_loss": 0.41868704557418823, "epoch": 2.464976651100734, "step": 7390 }, { "epoch": 2.464976651100734, "ref_ce_loss": 0.13473795354366302, "step": 7390 }, { "epoch": 2.464976651100734, "loss": 0.7378758192062378, "step": 7390 }, { "ce_loss": 0.20848192274570465, "epoch": 2.464976651100734, "step": 7390 }, { "distill_loss": 0.22565500438213348, "epoch": 2.464976651100734, "step": 7390 }, { "epoch": 2.464976651100734, "ref_ce_loss": 0.1735285222530365, "step": 7390 }, { "epoch": 2.464976651100734, "loss": 0.8218880891799927, "step": 7390 }, { "ce_loss": 0.2554478049278259, "epoch": 2.464976651100734, "step": 7390 }, { "distill_loss": 0.24467813968658447, "epoch": 2.464976651100734, "step": 7390 }, { "epoch": 2.464976651100734, "ref_ce_loss": 0.16238157451152802, "step": 7390 }, { "epoch": 2.464976651100734, "loss": 0.6124982833862305, "step": 7390 }, { "ce_loss": 0.18273940682411194, "epoch": 2.464976651100734, "step": 7390 }, { "distill_loss": 0.20060420036315918, "epoch": 2.464976651100734, "step": 7390 }, { "epoch": 2.464976651100734, "ref_ce_loss": 0.15322014689445496, "step": 7390 }, { "epoch": 2.468312208138759, "loss": 0.6726, "step": 7400 }, { "epoch": 2.468312208138759, "grad_norm": 3.222508192062378, "step": 7400 }, { "epoch": 2.468312208138759, "learning_rate": 0.00024299765555166162, "step": 7400 }, { "epoch": 2.468312208138759, "loss": 0.5121899247169495, "step": 7400 }, { "ce_loss": 0.15992094576358795, "epoch": 2.468312208138759, "step": 7400 }, { "distill_loss": 0.17074273526668549, "epoch": 2.468312208138759, "step": 7400 }, { "epoch": 2.468312208138759, "ref_ce_loss": 0.1101238951086998, "step": 7400 }, { "epoch": 2.468312208138759, "loss": 0.6162878274917603, "step": 7400 }, { "ce_loss": 0.24921391904354095, "epoch": 2.468312208138759, "step": 7400 }, { "distill_loss": 0.16301168501377106, "epoch": 2.468312208138759, "step": 7400 }, { "epoch": 2.468312208138759, "ref_ce_loss": 0.13484197854995728, "step": 7400 }, { "epoch": 2.468312208138759, "loss": 0.518480122089386, "step": 7400 }, { "ce_loss": 0.11398441344499588, "epoch": 2.468312208138759, "step": 7400 }, { "distill_loss": 0.1706867665052414, "epoch": 2.468312208138759, "step": 7400 }, { "epoch": 2.468312208138759, "ref_ce_loss": 0.09749495983123779, "step": 7400 }, { "epoch": 2.468312208138759, "loss": 1.0171473026275635, "step": 7400 }, { "ce_loss": 0.14315776526927948, "epoch": 2.468312208138759, "step": 7400 }, { "distill_loss": 0.17418572306632996, "epoch": 2.468312208138759, "step": 7400 }, { "epoch": 2.468312208138759, "ref_ce_loss": 0.11513806879520416, "step": 7400 }, { "epoch": 2.4716477651767845, "loss": 0.5981, "step": 7410 }, { "epoch": 2.4716477651767845, "grad_norm": 3.1181561946868896, "step": 7410 }, { "epoch": 2.4716477651767845, "learning_rate": 0.00024283863824242825, "step": 7410 }, { "epoch": 2.4716477651767845, "loss": 0.4941348731517792, "step": 7410 }, { "ce_loss": 0.16458964347839355, "epoch": 2.4716477651767845, "step": 7410 }, { "distill_loss": 0.1792687326669693, "epoch": 2.4716477651767845, "step": 7410 }, { "epoch": 2.4716477651767845, "ref_ce_loss": 0.14990025758743286, "step": 7410 }, { "epoch": 2.4716477651767845, "loss": 0.6867243647575378, "step": 7410 }, { "ce_loss": 0.2279725968837738, "epoch": 2.4716477651767845, "step": 7410 }, { "distill_loss": 0.27562469244003296, "epoch": 2.4716477651767845, "step": 7410 }, { "epoch": 2.4716477651767845, "ref_ce_loss": 0.1251329928636551, "step": 7410 }, { "epoch": 2.4716477651767845, "loss": 0.5477474331855774, "step": 7410 }, { "ce_loss": 0.22149042785167694, "epoch": 2.4716477651767845, "step": 7410 }, { "distill_loss": 0.16655884683132172, "epoch": 2.4716477651767845, "step": 7410 }, { "epoch": 2.4716477651767845, "ref_ce_loss": 0.11619669944047928, "step": 7410 }, { "epoch": 2.4716477651767845, "loss": 0.7532987594604492, "step": 7410 }, { "ce_loss": 0.17848901450634003, "epoch": 2.4716477651767845, "step": 7410 }, { "distill_loss": 0.4082738757133484, "epoch": 2.4716477651767845, "step": 7410 }, { "epoch": 2.4716477651767845, "ref_ce_loss": 0.12353993207216263, "step": 7410 }, { "epoch": 2.47498332221481, "loss": 0.6411, "step": 7420 }, { "epoch": 2.47498332221481, "grad_norm": 4.742797374725342, "step": 7420 }, { "epoch": 2.47498332221481, "learning_rate": 0.0002426794516321405, "step": 7420 }, { "epoch": 2.47498332221481, "loss": 0.5106517672538757, "step": 7420 }, { "ce_loss": 0.14942465722560883, "epoch": 2.47498332221481, "step": 7420 }, { "distill_loss": 0.15792769193649292, "epoch": 2.47498332221481, "step": 7420 }, { "epoch": 2.47498332221481, "ref_ce_loss": 0.1275612860918045, "step": 7420 }, { "epoch": 2.47498332221481, "loss": 0.5815585255622864, "step": 7420 }, { "ce_loss": 0.08623402565717697, "epoch": 2.47498332221481, "step": 7420 }, { "distill_loss": 0.14738768339157104, "epoch": 2.47498332221481, "step": 7420 }, { "epoch": 2.47498332221481, "ref_ce_loss": 0.12640981376171112, "step": 7420 }, { "epoch": 2.47498332221481, "loss": 0.7946469783782959, "step": 7420 }, { "ce_loss": 0.2270994633436203, "epoch": 2.47498332221481, "step": 7420 }, { "distill_loss": 0.25851958990097046, "epoch": 2.47498332221481, "step": 7420 }, { "epoch": 2.47498332221481, "ref_ce_loss": 0.13658231496810913, "step": 7420 }, { "epoch": 2.47498332221481, "loss": 0.5645567774772644, "step": 7420 }, { "ce_loss": 0.11589132994413376, "epoch": 2.47498332221481, "step": 7420 }, { "distill_loss": 0.3163776099681854, "epoch": 2.47498332221481, "step": 7420 }, { "epoch": 2.47498332221481, "ref_ce_loss": 0.13185791671276093, "step": 7420 }, { "epoch": 2.4783188792528352, "loss": 0.6218, "step": 7430 }, { "epoch": 2.4783188792528352, "grad_norm": 3.2630066871643066, "step": 7430 }, { "epoch": 2.4783188792528352, "learning_rate": 0.00024252009601109206, "step": 7430 }, { "epoch": 2.4783188792528352, "loss": 1.7518689632415771, "step": 7430 }, { "ce_loss": 0.21943138539791107, "epoch": 2.4783188792528352, "step": 7430 }, { "distill_loss": 0.16195166110992432, "epoch": 2.4783188792528352, "step": 7430 }, { "epoch": 2.4783188792528352, "ref_ce_loss": 0.13848480582237244, "step": 7430 }, { "epoch": 2.4783188792528352, "loss": 0.42734628915786743, "step": 7430 }, { "ce_loss": 0.13677547872066498, "epoch": 2.4783188792528352, "step": 7430 }, { "distill_loss": 0.14099811017513275, "epoch": 2.4783188792528352, "step": 7430 }, { "epoch": 2.4783188792528352, "ref_ce_loss": 0.09117099642753601, "step": 7430 }, { "epoch": 2.4783188792528352, "loss": 0.49946892261505127, "step": 7430 }, { "ce_loss": 0.11629904061555862, "epoch": 2.4783188792528352, "step": 7430 }, { "distill_loss": 0.1060149148106575, "epoch": 2.4783188792528352, "step": 7430 }, { "epoch": 2.4783188792528352, "ref_ce_loss": 0.10538594424724579, "step": 7430 }, { "epoch": 2.4783188792528352, "loss": 0.5545316934585571, "step": 7430 }, { "ce_loss": 0.17956024408340454, "epoch": 2.4783188792528352, "step": 7430 }, { "distill_loss": 0.16018567979335785, "epoch": 2.4783188792528352, "step": 7430 }, { "epoch": 2.4783188792528352, "ref_ce_loss": 0.11891523748636246, "step": 7430 }, { "epoch": 2.4816544362908606, "loss": 0.5819, "step": 7440 }, { "epoch": 2.4816544362908606, "grad_norm": 2.5894501209259033, "step": 7440 }, { "epoch": 2.4816544362908606, "learning_rate": 0.0002423605716698847, "step": 7440 }, { "epoch": 2.4816544362908606, "loss": 0.462877482175827, "step": 7440 }, { "ce_loss": 0.21872608363628387, "epoch": 2.4816544362908606, "step": 7440 }, { "distill_loss": 0.15316803753376007, "epoch": 2.4816544362908606, "step": 7440 }, { "epoch": 2.4816544362908606, "ref_ce_loss": 0.09063727408647537, "step": 7440 }, { "epoch": 2.4816544362908606, "loss": 0.49275457859039307, "step": 7440 }, { "ce_loss": 0.16110378503799438, "epoch": 2.4816544362908606, "step": 7440 }, { "distill_loss": 0.16137196123600006, "epoch": 2.4816544362908606, "step": 7440 }, { "epoch": 2.4816544362908606, "ref_ce_loss": 0.08860895782709122, "step": 7440 }, { "epoch": 2.4816544362908606, "loss": 0.5795860886573792, "step": 7440 }, { "ce_loss": 0.150924414396286, "epoch": 2.4816544362908606, "step": 7440 }, { "distill_loss": 0.2513508200645447, "epoch": 2.4816544362908606, "step": 7440 }, { "epoch": 2.4816544362908606, "ref_ce_loss": 0.1376534402370453, "step": 7440 }, { "epoch": 2.4816544362908606, "loss": 0.39724642038345337, "step": 7440 }, { "ce_loss": 0.14655473828315735, "epoch": 2.4816544362908606, "step": 7440 }, { "distill_loss": 0.12975682318210602, "epoch": 2.4816544362908606, "step": 7440 }, { "epoch": 2.4816544362908606, "ref_ce_loss": 0.12082649022340775, "step": 7440 }, { "epoch": 2.484989993328886, "loss": 0.6028, "step": 7450 }, { "epoch": 2.484989993328886, "grad_norm": 2.79844331741333, "step": 7450 }, { "epoch": 2.484989993328886, "learning_rate": 0.00024220087889942793, "step": 7450 }, { "epoch": 2.484989993328886, "loss": 0.46565479040145874, "step": 7450 }, { "ce_loss": 0.16986431181430817, "epoch": 2.484989993328886, "step": 7450 }, { "distill_loss": 0.12018732726573944, "epoch": 2.484989993328886, "step": 7450 }, { "epoch": 2.484989993328886, "ref_ce_loss": 0.17548438906669617, "step": 7450 }, { "epoch": 2.484989993328886, "loss": 0.6156365871429443, "step": 7450 }, { "ce_loss": 0.14211425185203552, "epoch": 2.484989993328886, "step": 7450 }, { "distill_loss": 0.3125298321247101, "epoch": 2.484989993328886, "step": 7450 }, { "epoch": 2.484989993328886, "ref_ce_loss": 0.111871138215065, "step": 7450 }, { "epoch": 2.484989993328886, "loss": 0.9736812114715576, "step": 7450 }, { "ce_loss": 0.2266884446144104, "epoch": 2.484989993328886, "step": 7450 }, { "distill_loss": 0.2732122242450714, "epoch": 2.484989993328886, "step": 7450 }, { "epoch": 2.484989993328886, "ref_ce_loss": 0.16810236871242523, "step": 7450 }, { "epoch": 2.484989993328886, "loss": 0.8139933347702026, "step": 7450 }, { "ce_loss": 0.19329741597175598, "epoch": 2.484989993328886, "step": 7450 }, { "distill_loss": 0.17764835059642792, "epoch": 2.484989993328886, "step": 7450 }, { "epoch": 2.484989993328886, "ref_ce_loss": 0.16515131294727325, "step": 7450 }, { "epoch": 2.4883255503669113, "loss": 0.5792, "step": 7460 }, { "epoch": 2.4883255503669113, "grad_norm": 2.714251756668091, "step": 7460 }, { "epoch": 2.4883255503669113, "learning_rate": 0.00024204101799093824, "step": 7460 }, { "epoch": 2.4883255503669113, "loss": 0.569564700126648, "step": 7460 }, { "ce_loss": 0.17240485548973083, "epoch": 2.4883255503669113, "step": 7460 }, { "distill_loss": 0.1441771388053894, "epoch": 2.4883255503669113, "step": 7460 }, { "epoch": 2.4883255503669113, "ref_ce_loss": 0.10838434845209122, "step": 7460 }, { "epoch": 2.4883255503669113, "loss": 0.6830180287361145, "step": 7460 }, { "ce_loss": 0.2065476030111313, "epoch": 2.4883255503669113, "step": 7460 }, { "distill_loss": 0.2618443965911865, "epoch": 2.4883255503669113, "step": 7460 }, { "epoch": 2.4883255503669113, "ref_ce_loss": 0.1021922305226326, "step": 7460 }, { "epoch": 2.4883255503669113, "loss": 0.39164695143699646, "step": 7460 }, { "ce_loss": 0.1578332930803299, "epoch": 2.4883255503669113, "step": 7460 }, { "distill_loss": 0.14780035614967346, "epoch": 2.4883255503669113, "step": 7460 }, { "epoch": 2.4883255503669113, "ref_ce_loss": 0.08552494645118713, "step": 7460 }, { "epoch": 2.4883255503669113, "loss": 0.42083337903022766, "step": 7460 }, { "ce_loss": 0.1467936635017395, "epoch": 2.4883255503669113, "step": 7460 }, { "distill_loss": 0.11076997220516205, "epoch": 2.4883255503669113, "step": 7460 }, { "epoch": 2.4883255503669113, "ref_ce_loss": 0.11539274454116821, "step": 7460 }, { "epoch": 2.4916611074049366, "loss": 0.6286, "step": 7470 }, { "epoch": 2.4916611074049366, "grad_norm": 4.215702056884766, "step": 7470 }, { "epoch": 2.4916611074049366, "learning_rate": 0.00024188098923593902, "step": 7470 }, { "epoch": 2.4916611074049366, "loss": 0.5800678133964539, "step": 7470 }, { "ce_loss": 0.16411294043064117, "epoch": 2.4916611074049366, "step": 7470 }, { "distill_loss": 0.23902928829193115, "epoch": 2.4916611074049366, "step": 7470 }, { "epoch": 2.4916611074049366, "ref_ce_loss": 0.11827875673770905, "step": 7470 }, { "epoch": 2.4916611074049366, "loss": 0.3950924873352051, "step": 7470 }, { "ce_loss": 0.061212923377752304, "epoch": 2.4916611074049366, "step": 7470 }, { "distill_loss": 0.08429202437400818, "epoch": 2.4916611074049366, "step": 7470 }, { "epoch": 2.4916611074049366, "ref_ce_loss": 0.0835026279091835, "step": 7470 }, { "epoch": 2.4916611074049366, "loss": 0.43904346227645874, "step": 7470 }, { "ce_loss": 0.10477552562952042, "epoch": 2.4916611074049366, "step": 7470 }, { "distill_loss": 0.2018274962902069, "epoch": 2.4916611074049366, "step": 7470 }, { "epoch": 2.4916611074049366, "ref_ce_loss": 0.08816604316234589, "step": 7470 }, { "epoch": 2.4916611074049366, "loss": 0.5673875212669373, "step": 7470 }, { "ce_loss": 0.18711598217487335, "epoch": 2.4916611074049366, "step": 7470 }, { "distill_loss": 0.19603317975997925, "epoch": 2.4916611074049366, "step": 7470 }, { "epoch": 2.4916611074049366, "ref_ce_loss": 0.15125808119773865, "step": 7470 }, { "epoch": 2.494996664442962, "loss": 0.7135, "step": 7480 }, { "epoch": 2.494996664442962, "grad_norm": 3.397770404815674, "step": 7480 }, { "epoch": 2.494996664442962, "learning_rate": 0.00024172079292625952, "step": 7480 }, { "epoch": 2.494996664442962, "loss": 0.4040660262107849, "step": 7480 }, { "ce_loss": 0.1057736873626709, "epoch": 2.494996664442962, "step": 7480 }, { "distill_loss": 0.17069004476070404, "epoch": 2.494996664442962, "step": 7480 }, { "epoch": 2.494996664442962, "ref_ce_loss": 0.0931447446346283, "step": 7480 }, { "epoch": 2.494996664442962, "loss": 0.6216812133789062, "step": 7480 }, { "ce_loss": 0.12649120390415192, "epoch": 2.494996664442962, "step": 7480 }, { "distill_loss": 0.20638568699359894, "epoch": 2.494996664442962, "step": 7480 }, { "epoch": 2.494996664442962, "ref_ce_loss": 0.1636555939912796, "step": 7480 }, { "epoch": 2.494996664442962, "loss": 0.5215267539024353, "step": 7480 }, { "ce_loss": 0.1331818848848343, "epoch": 2.494996664442962, "step": 7480 }, { "distill_loss": 0.21987393498420715, "epoch": 2.494996664442962, "step": 7480 }, { "epoch": 2.494996664442962, "ref_ce_loss": 0.13187724351882935, "step": 7480 }, { "epoch": 2.494996664442962, "loss": 0.6386827230453491, "step": 7480 }, { "ce_loss": 0.10811474174261093, "epoch": 2.494996664442962, "step": 7480 }, { "distill_loss": 0.24786339700222015, "epoch": 2.494996664442962, "step": 7480 }, { "epoch": 2.494996664442962, "ref_ce_loss": 0.09294579923152924, "step": 7480 }, { "epoch": 2.4983322214809873, "loss": 0.6108, "step": 7490 }, { "epoch": 2.4983322214809873, "grad_norm": 3.2522027492523193, "step": 7490 }, { "epoch": 2.4983322214809873, "learning_rate": 0.00024156042935403462, "step": 7490 }, { "epoch": 2.4983322214809873, "loss": 0.5217353701591492, "step": 7490 }, { "ce_loss": 0.20177902281284332, "epoch": 2.4983322214809873, "step": 7490 }, { "distill_loss": 0.17955923080444336, "epoch": 2.4983322214809873, "step": 7490 }, { "epoch": 2.4983322214809873, "ref_ce_loss": 0.10753311216831207, "step": 7490 }, { "epoch": 2.4983322214809873, "loss": 0.29052671790122986, "step": 7490 }, { "ce_loss": 0.07492802292108536, "epoch": 2.4983322214809873, "step": 7490 }, { "distill_loss": 0.13153742253780365, "epoch": 2.4983322214809873, "step": 7490 }, { "epoch": 2.4983322214809873, "ref_ce_loss": 0.08389332890510559, "step": 7490 }, { "epoch": 2.4983322214809873, "loss": 0.9276106357574463, "step": 7490 }, { "ce_loss": 0.21934948861598969, "epoch": 2.4983322214809873, "step": 7490 }, { "distill_loss": 0.2663779854774475, "epoch": 2.4983322214809873, "step": 7490 }, { "epoch": 2.4983322214809873, "ref_ce_loss": 0.14847496151924133, "step": 7490 }, { "epoch": 2.4983322214809873, "loss": 0.433186411857605, "step": 7490 }, { "ce_loss": 0.15955831110477448, "epoch": 2.4983322214809873, "step": 7490 }, { "distill_loss": 0.1145581305027008, "epoch": 2.4983322214809873, "step": 7490 }, { "epoch": 2.4983322214809873, "ref_ce_loss": 0.12650629878044128, "step": 7490 }, { "epoch": 2.5016677785190127, "loss": 0.539, "step": 7500 }, { "epoch": 2.5016677785190127, "grad_norm": 2.983107805252075, "step": 7500 }, { "epoch": 2.5016677785190127, "learning_rate": 0.0002413998988117042, "step": 7500 }, { "epoch": 2.5016677785190127, "loss": 0.9979486465454102, "step": 7500 }, { "ce_loss": 0.2776032090187073, "epoch": 2.5016677785190127, "step": 7500 }, { "distill_loss": 0.11760402470827103, "epoch": 2.5016677785190127, "step": 7500 }, { "epoch": 2.5016677785190127, "ref_ce_loss": 0.19638264179229736, "step": 7500 }, { "epoch": 2.5016677785190127, "loss": 0.421108603477478, "step": 7500 }, { "ce_loss": 0.16717898845672607, "epoch": 2.5016677785190127, "step": 7500 }, { "distill_loss": 0.10004810243844986, "epoch": 2.5016677785190127, "step": 7500 }, { "epoch": 2.5016677785190127, "ref_ce_loss": 0.11813834309577942, "step": 7500 }, { "epoch": 2.5016677785190127, "loss": 0.43692728877067566, "step": 7500 }, { "ce_loss": 0.22714455425739288, "epoch": 2.5016677785190127, "step": 7500 }, { "distill_loss": 0.12298592180013657, "epoch": 2.5016677785190127, "step": 7500 }, { "epoch": 2.5016677785190127, "ref_ce_loss": 0.08648133277893066, "step": 7500 }, { "epoch": 2.5016677785190127, "loss": 0.8263677358627319, "step": 7500 }, { "ce_loss": 0.2486896514892578, "epoch": 2.5016677785190127, "step": 7500 }, { "distill_loss": 0.230479896068573, "epoch": 2.5016677785190127, "step": 7500 }, { "epoch": 2.5016677785190127, "ref_ce_loss": 0.14610914885997772, "step": 7500 }, { "epoch": 2.505003335557038, "loss": 0.5267, "step": 7510 }, { "epoch": 2.505003335557038, "grad_norm": 3.099586248397827, "step": 7510 }, { "epoch": 2.505003335557038, "learning_rate": 0.00024123920159201267, "step": 7510 }, { "epoch": 2.505003335557038, "loss": 0.4549373984336853, "step": 7510 }, { "ce_loss": 0.1630726009607315, "epoch": 2.505003335557038, "step": 7510 }, { "distill_loss": 0.08730512112379074, "epoch": 2.505003335557038, "step": 7510 }, { "epoch": 2.505003335557038, "ref_ce_loss": 0.12055899947881699, "step": 7510 }, { "epoch": 2.505003335557038, "loss": 0.3760097324848175, "step": 7510 }, { "ce_loss": 0.130742609500885, "epoch": 2.505003335557038, "step": 7510 }, { "distill_loss": 0.12639303505420685, "epoch": 2.505003335557038, "step": 7510 }, { "epoch": 2.505003335557038, "ref_ce_loss": 0.11874870955944061, "step": 7510 }, { "epoch": 2.505003335557038, "loss": 0.5273082852363586, "step": 7510 }, { "ce_loss": 0.13833826780319214, "epoch": 2.505003335557038, "step": 7510 }, { "distill_loss": 0.09497985988855362, "epoch": 2.505003335557038, "step": 7510 }, { "epoch": 2.505003335557038, "ref_ce_loss": 0.14981545507907867, "step": 7510 }, { "epoch": 2.505003335557038, "loss": 0.5824294686317444, "step": 7510 }, { "ce_loss": 0.11524631083011627, "epoch": 2.505003335557038, "step": 7510 }, { "distill_loss": 0.09551960229873657, "epoch": 2.505003335557038, "step": 7510 }, { "epoch": 2.505003335557038, "ref_ce_loss": 0.10985547304153442, "step": 7510 }, { "epoch": 2.5083388925950634, "loss": 1.6356, "step": 7520 }, { "epoch": 2.5083388925950634, "grad_norm": 36.170310974121094, "step": 7520 }, { "epoch": 2.5083388925950634, "learning_rate": 0.00024107833798800836, "step": 7520 }, { "epoch": 2.5083388925950634, "loss": 4.801138877868652, "step": 7520 }, { "ce_loss": 3.2041690349578857, "epoch": 2.5083388925950634, "step": 7520 }, { "distill_loss": 0.11274446547031403, "epoch": 2.5083388925950634, "step": 7520 }, { "epoch": 2.5083388925950634, "ref_ce_loss": 1.380598783493042, "step": 7520 }, { "epoch": 2.5083388925950634, "loss": 5.07634162902832, "step": 7520 }, { "ce_loss": 3.3332977294921875, "epoch": 2.5083388925950634, "step": 7520 }, { "distill_loss": 0.09714693576097488, "epoch": 2.5083388925950634, "step": 7520 }, { "epoch": 2.5083388925950634, "ref_ce_loss": 1.598562240600586, "step": 7520 }, { "epoch": 2.5083388925950634, "loss": 5.017405986785889, "step": 7520 }, { "ce_loss": 3.367450714111328, "epoch": 2.5083388925950634, "step": 7520 }, { "distill_loss": 0.1504187136888504, "epoch": 2.5083388925950634, "step": 7520 }, { "epoch": 2.5083388925950634, "ref_ce_loss": 1.3405362367630005, "step": 7520 }, { "epoch": 2.5083388925950634, "loss": 5.071421146392822, "step": 7520 }, { "ce_loss": 3.21246337890625, "epoch": 2.5083388925950634, "step": 7520 }, { "distill_loss": 0.14449955523014069, "epoch": 2.5083388925950634, "step": 7520 }, { "epoch": 2.5083388925950634, "ref_ce_loss": 1.651926875114441, "step": 7520 }, { "epoch": 2.5116744496330887, "loss": 2.5051, "step": 7530 }, { "epoch": 2.5116744496330887, "grad_norm": 7.543781757354736, "step": 7530 }, { "epoch": 2.5116744496330887, "learning_rate": 0.00024091730829304303, "step": 7530 }, { "epoch": 2.5116744496330887, "loss": 1.0518018007278442, "step": 7530 }, { "ce_loss": 0.5136626958847046, "epoch": 2.5116744496330887, "step": 7530 }, { "distill_loss": 0.10988738387823105, "epoch": 2.5116744496330887, "step": 7530 }, { "epoch": 2.5116744496330887, "ref_ce_loss": 0.2461606115102768, "step": 7530 }, { "epoch": 2.5116744496330887, "loss": 0.971845805644989, "step": 7530 }, { "ce_loss": 0.5943147540092468, "epoch": 2.5116744496330887, "step": 7530 }, { "distill_loss": 0.12562216818332672, "epoch": 2.5116744496330887, "step": 7530 }, { "epoch": 2.5116744496330887, "ref_ce_loss": 0.2518536448478699, "step": 7530 }, { "epoch": 2.5116744496330887, "loss": 1.4921238422393799, "step": 7530 }, { "ce_loss": 0.5412873029708862, "epoch": 2.5116744496330887, "step": 7530 }, { "distill_loss": 0.11847780644893646, "epoch": 2.5116744496330887, "step": 7530 }, { "epoch": 2.5116744496330887, "ref_ce_loss": 0.322559118270874, "step": 7530 }, { "epoch": 2.5116744496330887, "loss": 0.9531009793281555, "step": 7530 }, { "ce_loss": 0.5480855107307434, "epoch": 2.5116744496330887, "step": 7530 }, { "distill_loss": 0.10795928537845612, "epoch": 2.5116744496330887, "step": 7530 }, { "epoch": 2.5116744496330887, "ref_ce_loss": 0.2576931118965149, "step": 7530 }, { "epoch": 2.515010006671114, "loss": 1.0296, "step": 7540 }, { "epoch": 2.515010006671114, "grad_norm": 4.333218097686768, "step": 7540 }, { "epoch": 2.515010006671114, "learning_rate": 0.00024075611280077134, "step": 7540 }, { "epoch": 2.515010006671114, "loss": 0.8116427659988403, "step": 7540 }, { "ce_loss": 0.30960413813591003, "epoch": 2.515010006671114, "step": 7540 }, { "distill_loss": 0.14997927844524384, "epoch": 2.515010006671114, "step": 7540 }, { "epoch": 2.515010006671114, "ref_ce_loss": 0.15476161241531372, "step": 7540 }, { "epoch": 2.515010006671114, "loss": 0.52923983335495, "step": 7540 }, { "ce_loss": 0.23071542382240295, "epoch": 2.515010006671114, "step": 7540 }, { "distill_loss": 0.12685328722000122, "epoch": 2.515010006671114, "step": 7540 }, { "epoch": 2.515010006671114, "ref_ce_loss": 0.17161113023757935, "step": 7540 }, { "epoch": 2.515010006671114, "loss": 0.6986463665962219, "step": 7540 }, { "ce_loss": 0.31322136521339417, "epoch": 2.515010006671114, "step": 7540 }, { "distill_loss": 0.11960118263959885, "epoch": 2.515010006671114, "step": 7540 }, { "epoch": 2.515010006671114, "ref_ce_loss": 0.14314311742782593, "step": 7540 }, { "epoch": 2.515010006671114, "loss": 0.6826801300048828, "step": 7540 }, { "ce_loss": 0.3476658761501312, "epoch": 2.515010006671114, "step": 7540 }, { "distill_loss": 0.11629290878772736, "epoch": 2.515010006671114, "step": 7540 }, { "epoch": 2.515010006671114, "ref_ce_loss": 0.1628488004207611, "step": 7540 }, { "epoch": 2.5183455637091394, "loss": 0.7002, "step": 7550 }, { "epoch": 2.5183455637091394, "grad_norm": 3.7075579166412354, "step": 7550 }, { "epoch": 2.5183455637091394, "learning_rate": 0.0002405947518051503, "step": 7550 }, { "epoch": 2.5183455637091394, "loss": 0.43118125200271606, "step": 7550 }, { "ce_loss": 0.15303654968738556, "epoch": 2.5183455637091394, "step": 7550 }, { "distill_loss": 0.14630776643753052, "epoch": 2.5183455637091394, "step": 7550 }, { "epoch": 2.5183455637091394, "ref_ce_loss": 0.13163325190544128, "step": 7550 }, { "epoch": 2.5183455637091394, "loss": 0.6882593631744385, "step": 7550 }, { "ce_loss": 0.20254714787006378, "epoch": 2.5183455637091394, "step": 7550 }, { "distill_loss": 0.16186225414276123, "epoch": 2.5183455637091394, "step": 7550 }, { "epoch": 2.5183455637091394, "ref_ce_loss": 0.12776897847652435, "step": 7550 }, { "epoch": 2.5183455637091394, "loss": 0.8160839080810547, "step": 7550 }, { "ce_loss": 0.24247363209724426, "epoch": 2.5183455637091394, "step": 7550 }, { "distill_loss": 0.20983321964740753, "epoch": 2.5183455637091394, "step": 7550 }, { "epoch": 2.5183455637091394, "ref_ce_loss": 0.1945839673280716, "step": 7550 }, { "epoch": 2.5183455637091394, "loss": 0.5754175186157227, "step": 7550 }, { "ce_loss": 0.1744595170021057, "epoch": 2.5183455637091394, "step": 7550 }, { "distill_loss": 0.1595647931098938, "epoch": 2.5183455637091394, "step": 7550 }, { "epoch": 2.5183455637091394, "ref_ce_loss": 0.15324027836322784, "step": 7550 }, { "epoch": 2.5216811207471648, "loss": 0.5811, "step": 7560 }, { "epoch": 2.5216811207471648, "grad_norm": 3.553694248199463, "step": 7560 }, { "epoch": 2.5216811207471648, "learning_rate": 0.00024043322560043863, "step": 7560 }, { "epoch": 2.5216811207471648, "loss": 0.5153651237487793, "step": 7560 }, { "ce_loss": 0.1536191999912262, "epoch": 2.5216811207471648, "step": 7560 }, { "distill_loss": 0.09473700821399689, "epoch": 2.5216811207471648, "step": 7560 }, { "epoch": 2.5216811207471648, "ref_ce_loss": 0.1389169543981552, "step": 7560 }, { "epoch": 2.5216811207471648, "loss": 0.4598870277404785, "step": 7560 }, { "ce_loss": 0.15863952040672302, "epoch": 2.5216811207471648, "step": 7560 }, { "distill_loss": 0.13414156436920166, "epoch": 2.5216811207471648, "step": 7560 }, { "epoch": 2.5216811207471648, "ref_ce_loss": 0.0958477035164833, "step": 7560 }, { "epoch": 2.5216811207471648, "loss": 0.47331902384757996, "step": 7560 }, { "ce_loss": 0.18015851080417633, "epoch": 2.5216811207471648, "step": 7560 }, { "distill_loss": 0.11007126420736313, "epoch": 2.5216811207471648, "step": 7560 }, { "epoch": 2.5216811207471648, "ref_ce_loss": 0.11271440237760544, "step": 7560 }, { "epoch": 2.5216811207471648, "loss": 0.6730250120162964, "step": 7560 }, { "ce_loss": 0.20672163367271423, "epoch": 2.5216811207471648, "step": 7560 }, { "distill_loss": 0.13213717937469482, "epoch": 2.5216811207471648, "step": 7560 }, { "epoch": 2.5216811207471648, "ref_ce_loss": 0.1556655317544937, "step": 7560 }, { "epoch": 2.52501667778519, "loss": 0.5522, "step": 7570 }, { "epoch": 2.52501667778519, "grad_norm": 3.1989450454711914, "step": 7570 }, { "epoch": 2.52501667778519, "learning_rate": 0.00024027153448119646, "step": 7570 }, { "epoch": 2.52501667778519, "loss": 0.3477581739425659, "step": 7570 }, { "ce_loss": 0.12741214036941528, "epoch": 2.52501667778519, "step": 7570 }, { "distill_loss": 0.11304664611816406, "epoch": 2.52501667778519, "step": 7570 }, { "epoch": 2.52501667778519, "ref_ce_loss": 0.1070604920387268, "step": 7570 }, { "epoch": 2.52501667778519, "loss": 0.6823329925537109, "step": 7570 }, { "ce_loss": 0.2807355523109436, "epoch": 2.52501667778519, "step": 7570 }, { "distill_loss": 0.11333362013101578, "epoch": 2.52501667778519, "step": 7570 }, { "epoch": 2.52501667778519, "ref_ce_loss": 0.20557934045791626, "step": 7570 }, { "epoch": 2.52501667778519, "loss": 0.6630694270133972, "step": 7570 }, { "ce_loss": 0.3418087363243103, "epoch": 2.52501667778519, "step": 7570 }, { "distill_loss": 0.11887340247631073, "epoch": 2.52501667778519, "step": 7570 }, { "epoch": 2.52501667778519, "ref_ce_loss": 0.12373942881822586, "step": 7570 }, { "epoch": 2.52501667778519, "loss": 0.5742785930633545, "step": 7570 }, { "ce_loss": 0.21951600909233093, "epoch": 2.52501667778519, "step": 7570 }, { "distill_loss": 0.14838585257530212, "epoch": 2.52501667778519, "step": 7570 }, { "epoch": 2.52501667778519, "ref_ce_loss": 0.16113975644111633, "step": 7570 }, { "epoch": 2.5283522348232155, "loss": 0.605, "step": 7580 }, { "epoch": 2.5283522348232155, "grad_norm": 3.2635841369628906, "step": 7580 }, { "epoch": 2.5283522348232155, "learning_rate": 0.0002401096787422846, "step": 7580 }, { "epoch": 2.5283522348232155, "loss": 0.6578416228294373, "step": 7580 }, { "ce_loss": 0.24671395123004913, "epoch": 2.5283522348232155, "step": 7580 }, { "distill_loss": 0.18206465244293213, "epoch": 2.5283522348232155, "step": 7580 }, { "epoch": 2.5283522348232155, "ref_ce_loss": 0.1655759960412979, "step": 7580 }, { "epoch": 2.5283522348232155, "loss": 0.3640592694282532, "step": 7580 }, { "ce_loss": 0.09979964792728424, "epoch": 2.5283522348232155, "step": 7580 }, { "distill_loss": 0.07907267659902573, "epoch": 2.5283522348232155, "step": 7580 }, { "epoch": 2.5283522348232155, "ref_ce_loss": 0.09818270802497864, "step": 7580 }, { "epoch": 2.5283522348232155, "loss": 0.4667300879955292, "step": 7580 }, { "ce_loss": 0.17414849996566772, "epoch": 2.5283522348232155, "step": 7580 }, { "distill_loss": 0.17536064982414246, "epoch": 2.5283522348232155, "step": 7580 }, { "epoch": 2.5283522348232155, "ref_ce_loss": 0.11710204184055328, "step": 7580 }, { "epoch": 2.5283522348232155, "loss": 0.5661674737930298, "step": 7580 }, { "ce_loss": 0.20824968814849854, "epoch": 2.5283522348232155, "step": 7580 }, { "distill_loss": 0.14784125983715057, "epoch": 2.5283522348232155, "step": 7580 }, { "epoch": 2.5283522348232155, "ref_ce_loss": 0.16290564835071564, "step": 7580 }, { "epoch": 2.531687791861241, "loss": 0.6413, "step": 7590 }, { "epoch": 2.531687791861241, "grad_norm": 6.0181193351745605, "step": 7590 }, { "epoch": 2.531687791861241, "learning_rate": 0.0002399476586788641, "step": 7590 }, { "epoch": 2.531687791861241, "loss": 0.512836217880249, "step": 7590 }, { "ce_loss": 0.1735915094614029, "epoch": 2.531687791861241, "step": 7590 }, { "distill_loss": 0.1299559324979782, "epoch": 2.531687791861241, "step": 7590 }, { "epoch": 2.531687791861241, "ref_ce_loss": 0.13969220221042633, "step": 7590 }, { "epoch": 2.531687791861241, "loss": 0.43017613887786865, "step": 7590 }, { "ce_loss": 0.0749160647392273, "epoch": 2.531687791861241, "step": 7590 }, { "distill_loss": 0.11237940937280655, "epoch": 2.531687791861241, "step": 7590 }, { "epoch": 2.531687791861241, "ref_ce_loss": 0.11628800630569458, "step": 7590 }, { "epoch": 2.531687791861241, "loss": 0.6326050162315369, "step": 7590 }, { "ce_loss": 0.26504430174827576, "epoch": 2.531687791861241, "step": 7590 }, { "distill_loss": 0.14532341063022614, "epoch": 2.531687791861241, "step": 7590 }, { "epoch": 2.531687791861241, "ref_ce_loss": 0.15772642195224762, "step": 7590 }, { "epoch": 2.531687791861241, "loss": 0.5163159370422363, "step": 7590 }, { "ce_loss": 0.21247656643390656, "epoch": 2.531687791861241, "step": 7590 }, { "distill_loss": 0.1484421044588089, "epoch": 2.531687791861241, "step": 7590 }, { "epoch": 2.531687791861241, "ref_ce_loss": 0.1257714480161667, "step": 7590 }, { "epoch": 2.535023348899266, "loss": 0.6002, "step": 7600 }, { "epoch": 2.535023348899266, "grad_norm": 3.680274724960327, "step": 7600 }, { "epoch": 2.535023348899266, "learning_rate": 0.00023978547458639566, "step": 7600 }, { "epoch": 2.535023348899266, "loss": 0.6189171075820923, "step": 7600 }, { "ce_loss": 0.19142690300941467, "epoch": 2.535023348899266, "step": 7600 }, { "distill_loss": 0.1640305519104004, "epoch": 2.535023348899266, "step": 7600 }, { "epoch": 2.535023348899266, "ref_ce_loss": 0.18159721791744232, "step": 7600 }, { "epoch": 2.535023348899266, "loss": 0.5700227618217468, "step": 7600 }, { "ce_loss": 0.18071216344833374, "epoch": 2.535023348899266, "step": 7600 }, { "distill_loss": 0.13341012597084045, "epoch": 2.535023348899266, "step": 7600 }, { "epoch": 2.535023348899266, "ref_ce_loss": 0.12869039177894592, "step": 7600 }, { "epoch": 2.535023348899266, "loss": 0.6764943599700928, "step": 7600 }, { "ce_loss": 0.25785961747169495, "epoch": 2.535023348899266, "step": 7600 }, { "distill_loss": 0.13872337341308594, "epoch": 2.535023348899266, "step": 7600 }, { "epoch": 2.535023348899266, "ref_ce_loss": 0.2371172457933426, "step": 7600 }, { "epoch": 2.535023348899266, "loss": 0.5429132580757141, "step": 7600 }, { "ce_loss": 0.23566880822181702, "epoch": 2.535023348899266, "step": 7600 }, { "distill_loss": 0.16433201730251312, "epoch": 2.535023348899266, "step": 7600 }, { "epoch": 2.535023348899266, "ref_ce_loss": 0.14133983850479126, "step": 7600 }, { "epoch": 2.5383589059372915, "loss": 0.5731, "step": 7610 }, { "epoch": 2.5383589059372915, "grad_norm": 1.8018347024917603, "step": 7610 }, { "epoch": 2.5383589059372915, "learning_rate": 0.00023962312676063905, "step": 7610 }, { "epoch": 2.5383589059372915, "loss": 0.5834633111953735, "step": 7610 }, { "ce_loss": 0.18504847586154938, "epoch": 2.5383589059372915, "step": 7610 }, { "distill_loss": 0.1272597312927246, "epoch": 2.5383589059372915, "step": 7610 }, { "epoch": 2.5383589059372915, "ref_ce_loss": 0.16280600428581238, "step": 7610 }, { "epoch": 2.5383589059372915, "loss": 0.4163941740989685, "step": 7610 }, { "ce_loss": 0.12109852582216263, "epoch": 2.5383589059372915, "step": 7610 }, { "distill_loss": 0.0993369072675705, "epoch": 2.5383589059372915, "step": 7610 }, { "epoch": 2.5383589059372915, "ref_ce_loss": 0.11387766897678375, "step": 7610 }, { "epoch": 2.5383589059372915, "loss": 0.6831787824630737, "step": 7610 }, { "ce_loss": 0.17585034668445587, "epoch": 2.5383589059372915, "step": 7610 }, { "distill_loss": 0.16444019973278046, "epoch": 2.5383589059372915, "step": 7610 }, { "epoch": 2.5383589059372915, "ref_ce_loss": 0.10961062461137772, "step": 7610 }, { "epoch": 2.5383589059372915, "loss": 0.48016393184661865, "step": 7610 }, { "ce_loss": 0.14712761342525482, "epoch": 2.5383589059372915, "step": 7610 }, { "distill_loss": 0.14475704729557037, "epoch": 2.5383589059372915, "step": 7610 }, { "epoch": 2.5383589059372915, "ref_ce_loss": 0.14661256968975067, "step": 7610 }, { "epoch": 2.541694462975317, "loss": 0.5923, "step": 7620 }, { "epoch": 2.541694462975317, "grad_norm": 2.1599721908569336, "step": 7620 }, { "epoch": 2.541694462975317, "learning_rate": 0.0002394606154976526, "step": 7620 }, { "epoch": 2.541694462975317, "loss": 0.7730429172515869, "step": 7620 }, { "ce_loss": 0.31173962354660034, "epoch": 2.541694462975317, "step": 7620 }, { "distill_loss": 0.14953872561454773, "epoch": 2.541694462975317, "step": 7620 }, { "epoch": 2.541694462975317, "ref_ce_loss": 0.16186164319515228, "step": 7620 }, { "epoch": 2.541694462975317, "loss": 0.5969633460044861, "step": 7620 }, { "ce_loss": 0.22048798203468323, "epoch": 2.541694462975317, "step": 7620 }, { "distill_loss": 0.143099844455719, "epoch": 2.541694462975317, "step": 7620 }, { "epoch": 2.541694462975317, "ref_ce_loss": 0.18054793775081635, "step": 7620 }, { "epoch": 2.541694462975317, "loss": 0.3590729534626007, "step": 7620 }, { "ce_loss": 0.12761783599853516, "epoch": 2.541694462975317, "step": 7620 }, { "distill_loss": 0.11471801996231079, "epoch": 2.541694462975317, "step": 7620 }, { "epoch": 2.541694462975317, "ref_ce_loss": 0.08587874472141266, "step": 7620 }, { "epoch": 2.541694462975317, "loss": 0.5616735816001892, "step": 7620 }, { "ce_loss": 0.19857123494148254, "epoch": 2.541694462975317, "step": 7620 }, { "distill_loss": 0.10939596593379974, "epoch": 2.541694462975317, "step": 7620 }, { "epoch": 2.541694462975317, "ref_ce_loss": 0.15047235786914825, "step": 7620 }, { "epoch": 2.545030020013342, "loss": 0.5276, "step": 7630 }, { "epoch": 2.545030020013342, "grad_norm": 3.97973370552063, "step": 7630 }, { "epoch": 2.545030020013342, "learning_rate": 0.00023929794109379287, "step": 7630 }, { "epoch": 2.545030020013342, "loss": 0.6210626363754272, "step": 7630 }, { "ce_loss": 0.17225535213947296, "epoch": 2.545030020013342, "step": 7630 }, { "distill_loss": 0.1483703851699829, "epoch": 2.545030020013342, "step": 7630 }, { "epoch": 2.545030020013342, "ref_ce_loss": 0.1286020129919052, "step": 7630 }, { "epoch": 2.545030020013342, "loss": 0.5207104682922363, "step": 7630 }, { "ce_loss": 0.1763487607240677, "epoch": 2.545030020013342, "step": 7630 }, { "distill_loss": 0.11557600647211075, "epoch": 2.545030020013342, "step": 7630 }, { "epoch": 2.545030020013342, "ref_ce_loss": 0.22873103618621826, "step": 7630 }, { "epoch": 2.545030020013342, "loss": 0.7825448513031006, "step": 7630 }, { "ce_loss": 0.18735192716121674, "epoch": 2.545030020013342, "step": 7630 }, { "distill_loss": 0.1284457892179489, "epoch": 2.545030020013342, "step": 7630 }, { "epoch": 2.545030020013342, "ref_ce_loss": 0.13140769302845, "step": 7630 }, { "epoch": 2.545030020013342, "loss": 0.5712705850601196, "step": 7630 }, { "ce_loss": 0.24999207258224487, "epoch": 2.545030020013342, "step": 7630 }, { "distill_loss": 0.1476946920156479, "epoch": 2.545030020013342, "step": 7630 }, { "epoch": 2.545030020013342, "ref_ce_loss": 0.14753498136997223, "step": 7630 }, { "epoch": 2.5483655770513676, "loss": 0.5822, "step": 7640 }, { "epoch": 2.5483655770513676, "grad_norm": 3.5894253253936768, "step": 7640 }, { "epoch": 2.5483655770513676, "learning_rate": 0.00023913510384571376, "step": 7640 }, { "epoch": 2.5483655770513676, "loss": 0.6420358419418335, "step": 7640 }, { "ce_loss": 0.23614124953746796, "epoch": 2.5483655770513676, "step": 7640 }, { "distill_loss": 0.11829626560211182, "epoch": 2.5483655770513676, "step": 7640 }, { "epoch": 2.5483655770513676, "ref_ce_loss": 0.15863285958766937, "step": 7640 }, { "epoch": 2.5483655770513676, "loss": 0.31583189964294434, "step": 7640 }, { "ce_loss": 0.10105039924383163, "epoch": 2.5483655770513676, "step": 7640 }, { "distill_loss": 0.09995920956134796, "epoch": 2.5483655770513676, "step": 7640 }, { "epoch": 2.5483655770513676, "ref_ce_loss": 0.10147218406200409, "step": 7640 }, { "epoch": 2.5483655770513676, "loss": 0.42695561051368713, "step": 7640 }, { "ce_loss": 0.18332314491271973, "epoch": 2.5483655770513676, "step": 7640 }, { "distill_loss": 0.12776078283786774, "epoch": 2.5483655770513676, "step": 7640 }, { "epoch": 2.5483655770513676, "ref_ce_loss": 0.11573680490255356, "step": 7640 }, { "epoch": 2.5483655770513676, "loss": 0.4857944846153259, "step": 7640 }, { "ce_loss": 0.15125729143619537, "epoch": 2.5483655770513676, "step": 7640 }, { "distill_loss": 0.10850798338651657, "epoch": 2.5483655770513676, "step": 7640 }, { "epoch": 2.5483655770513676, "ref_ce_loss": 0.11452734470367432, "step": 7640 }, { "epoch": 2.551701134089393, "loss": 0.5687, "step": 7650 }, { "epoch": 2.551701134089393, "grad_norm": 2.613729238510132, "step": 7650 }, { "epoch": 2.551701134089393, "learning_rate": 0.00023897210405036612, "step": 7650 }, { "epoch": 2.551701134089393, "loss": 0.2363915741443634, "step": 7650 }, { "ce_loss": 0.07392606139183044, "epoch": 2.551701134089393, "step": 7650 }, { "distill_loss": 0.07387639582157135, "epoch": 2.551701134089393, "step": 7650 }, { "epoch": 2.551701134089393, "ref_ce_loss": 0.08818396925926208, "step": 7650 }, { "epoch": 2.551701134089393, "loss": 0.4613707363605499, "step": 7650 }, { "ce_loss": 0.17740534245967865, "epoch": 2.551701134089393, "step": 7650 }, { "distill_loss": 0.10959557443857193, "epoch": 2.551701134089393, "step": 7650 }, { "epoch": 2.551701134089393, "ref_ce_loss": 0.12329917401075363, "step": 7650 }, { "epoch": 2.551701134089393, "loss": 0.4941861927509308, "step": 7650 }, { "ce_loss": 0.1800439953804016, "epoch": 2.551701134089393, "step": 7650 }, { "distill_loss": 0.11342333257198334, "epoch": 2.551701134089393, "step": 7650 }, { "epoch": 2.551701134089393, "ref_ce_loss": 0.15587930381298065, "step": 7650 }, { "epoch": 2.551701134089393, "loss": 0.5839400291442871, "step": 7650 }, { "ce_loss": 0.2504223883152008, "epoch": 2.551701134089393, "step": 7650 }, { "distill_loss": 0.13013054430484772, "epoch": 2.551701134089393, "step": 7650 }, { "epoch": 2.551701134089393, "ref_ce_loss": 0.15354035794734955, "step": 7650 }, { "epoch": 2.5550366911274183, "loss": 0.5413, "step": 7660 }, { "epoch": 2.5550366911274183, "grad_norm": 2.720123052597046, "step": 7660 }, { "epoch": 2.5550366911274183, "learning_rate": 0.00023880894200499733, "step": 7660 }, { "epoch": 2.5550366911274183, "loss": 0.5325741767883301, "step": 7660 }, { "ce_loss": 0.1940358281135559, "epoch": 2.5550366911274183, "step": 7660 }, { "distill_loss": 0.09372791647911072, "epoch": 2.5550366911274183, "step": 7660 }, { "epoch": 2.5550366911274183, "ref_ce_loss": 0.1709611415863037, "step": 7660 }, { "epoch": 2.5550366911274183, "loss": 0.3856140375137329, "step": 7660 }, { "ce_loss": 0.10936636477708817, "epoch": 2.5550366911274183, "step": 7660 }, { "distill_loss": 0.08666469901800156, "epoch": 2.5550366911274183, "step": 7660 }, { "epoch": 2.5550366911274183, "ref_ce_loss": 0.12398192286491394, "step": 7660 }, { "epoch": 2.5550366911274183, "loss": 0.494606614112854, "step": 7660 }, { "ce_loss": 0.16153691709041595, "epoch": 2.5550366911274183, "step": 7660 }, { "distill_loss": 0.09463351219892502, "epoch": 2.5550366911274183, "step": 7660 }, { "epoch": 2.5550366911274183, "ref_ce_loss": 0.1084618866443634, "step": 7660 }, { "epoch": 2.5550366911274183, "loss": 0.4320124387741089, "step": 7660 }, { "ce_loss": 0.18520474433898926, "epoch": 2.5550366911274183, "step": 7660 }, { "distill_loss": 0.1259734332561493, "epoch": 2.5550366911274183, "step": 7660 }, { "epoch": 2.5550366911274183, "ref_ce_loss": 0.12075541168451309, "step": 7660 }, { "epoch": 2.5583722481654436, "loss": 0.518, "step": 7670 }, { "epoch": 2.5583722481654436, "grad_norm": 4.928752899169922, "step": 7670 }, { "epoch": 2.5583722481654436, "learning_rate": 0.00023864561800715064, "step": 7670 }, { "epoch": 2.5583722481654436, "loss": 0.419145405292511, "step": 7670 }, { "ce_loss": 0.09538471698760986, "epoch": 2.5583722481654436, "step": 7670 }, { "distill_loss": 0.10431893914937973, "epoch": 2.5583722481654436, "step": 7670 }, { "epoch": 2.5583722481654436, "ref_ce_loss": 0.07912413030862808, "step": 7670 }, { "epoch": 2.5583722481654436, "loss": 0.42450791597366333, "step": 7670 }, { "ce_loss": 0.1916923075914383, "epoch": 2.5583722481654436, "step": 7670 }, { "distill_loss": 0.09768582880496979, "epoch": 2.5583722481654436, "step": 7670 }, { "epoch": 2.5583722481654436, "ref_ce_loss": 0.13494430482387543, "step": 7670 }, { "epoch": 2.5583722481654436, "loss": 0.4638766944408417, "step": 7670 }, { "ce_loss": 0.16185693442821503, "epoch": 2.5583722481654436, "step": 7670 }, { "distill_loss": 0.11444159597158432, "epoch": 2.5583722481654436, "step": 7670 }, { "epoch": 2.5583722481654436, "ref_ce_loss": 0.07675541192293167, "step": 7670 }, { "epoch": 2.5583722481654436, "loss": 0.3174169063568115, "step": 7670 }, { "ce_loss": 0.12475244700908661, "epoch": 2.5583722481654436, "step": 7670 }, { "distill_loss": 0.09721045196056366, "epoch": 2.5583722481654436, "step": 7670 }, { "epoch": 2.5583722481654436, "ref_ce_loss": 0.09538126736879349, "step": 7670 }, { "epoch": 2.561707805203469, "loss": 0.5129, "step": 7680 }, { "epoch": 2.561707805203469, "grad_norm": 2.8273167610168457, "step": 7680 }, { "epoch": 2.561707805203469, "learning_rate": 0.00023848213235466446, "step": 7680 }, { "epoch": 2.561707805203469, "loss": 0.41403496265411377, "step": 7680 }, { "ce_loss": 0.1246468648314476, "epoch": 2.561707805203469, "step": 7680 }, { "distill_loss": 0.10440246015787125, "epoch": 2.561707805203469, "step": 7680 }, { "epoch": 2.561707805203469, "ref_ce_loss": 0.11642558127641678, "step": 7680 }, { "epoch": 2.561707805203469, "loss": 0.4793902337551117, "step": 7680 }, { "ce_loss": 0.20326383411884308, "epoch": 2.561707805203469, "step": 7680 }, { "distill_loss": 0.1064993366599083, "epoch": 2.561707805203469, "step": 7680 }, { "epoch": 2.561707805203469, "ref_ce_loss": 0.16955675184726715, "step": 7680 }, { "epoch": 2.561707805203469, "loss": 0.5131489038467407, "step": 7680 }, { "ce_loss": 0.1686738282442093, "epoch": 2.561707805203469, "step": 7680 }, { "distill_loss": 0.1163623034954071, "epoch": 2.561707805203469, "step": 7680 }, { "epoch": 2.561707805203469, "ref_ce_loss": 0.10202865302562714, "step": 7680 }, { "epoch": 2.561707805203469, "loss": 0.5387371182441711, "step": 7680 }, { "ce_loss": 0.250522643327713, "epoch": 2.561707805203469, "step": 7680 }, { "distill_loss": 0.11985168606042862, "epoch": 2.561707805203469, "step": 7680 }, { "epoch": 2.561707805203469, "ref_ce_loss": 0.12084297090768814, "step": 7680 }, { "epoch": 2.5650433622414943, "loss": 0.525, "step": 7690 }, { "epoch": 2.5650433622414943, "grad_norm": 3.9550886154174805, "step": 7690 }, { "epoch": 2.5650433622414943, "learning_rate": 0.0002383184853456723, "step": 7690 }, { "epoch": 2.5650433622414943, "loss": 0.35102054476737976, "step": 7690 }, { "ce_loss": 0.10862553864717484, "epoch": 2.5650433622414943, "step": 7690 }, { "distill_loss": 0.1106899157166481, "epoch": 2.5650433622414943, "step": 7690 }, { "epoch": 2.5650433622414943, "ref_ce_loss": 0.08591561019420624, "step": 7690 }, { "epoch": 2.5650433622414943, "loss": 0.5251890420913696, "step": 7690 }, { "ce_loss": 0.1936178356409073, "epoch": 2.5650433622414943, "step": 7690 }, { "distill_loss": 0.12350133061408997, "epoch": 2.5650433622414943, "step": 7690 }, { "epoch": 2.5650433622414943, "ref_ce_loss": 0.14425380527973175, "step": 7690 }, { "epoch": 2.5650433622414943, "loss": 0.8277546763420105, "step": 7690 }, { "ce_loss": 0.2607184648513794, "epoch": 2.5650433622414943, "step": 7690 }, { "distill_loss": 0.10593435913324356, "epoch": 2.5650433622414943, "step": 7690 }, { "epoch": 2.5650433622414943, "ref_ce_loss": 0.16472357511520386, "step": 7690 }, { "epoch": 2.5650433622414943, "loss": 1.047260046005249, "step": 7690 }, { "ce_loss": 0.3077158033847809, "epoch": 2.5650433622414943, "step": 7690 }, { "distill_loss": 0.14151984453201294, "epoch": 2.5650433622414943, "step": 7690 }, { "epoch": 2.5650433622414943, "ref_ce_loss": 0.12401144206523895, "step": 7690 }, { "epoch": 2.5683789192795197, "loss": 0.5345, "step": 7700 }, { "epoch": 2.5683789192795197, "grad_norm": 2.5511090755462646, "step": 7700 }, { "epoch": 2.5683789192795197, "learning_rate": 0.00023815467727860163, "step": 7700 }, { "epoch": 2.5683789192795197, "loss": 0.4556001126766205, "step": 7700 }, { "ce_loss": 0.10793557018041611, "epoch": 2.5683789192795197, "step": 7700 }, { "distill_loss": 0.09077011793851852, "epoch": 2.5683789192795197, "step": 7700 }, { "epoch": 2.5683789192795197, "ref_ce_loss": 0.1416168063879013, "step": 7700 }, { "epoch": 2.5683789192795197, "loss": 0.3976340591907501, "step": 7700 }, { "ce_loss": 0.1789923459291458, "epoch": 2.5683789192795197, "step": 7700 }, { "distill_loss": 0.08997838199138641, "epoch": 2.5683789192795197, "step": 7700 }, { "epoch": 2.5683789192795197, "ref_ce_loss": 0.08977121859788895, "step": 7700 }, { "epoch": 2.5683789192795197, "loss": 0.43043792247772217, "step": 7700 }, { "ce_loss": 0.10186466574668884, "epoch": 2.5683789192795197, "step": 7700 }, { "distill_loss": 0.0814705491065979, "epoch": 2.5683789192795197, "step": 7700 }, { "epoch": 2.5683789192795197, "ref_ce_loss": 0.11523747444152832, "step": 7700 }, { "epoch": 2.5683789192795197, "loss": 0.7596093416213989, "step": 7700 }, { "ce_loss": 0.3001655638217926, "epoch": 2.5683789192795197, "step": 7700 }, { "distill_loss": 0.13424482941627502, "epoch": 2.5683789192795197, "step": 7700 }, { "epoch": 2.5683789192795197, "ref_ce_loss": 0.19055742025375366, "step": 7700 }, { "epoch": 2.571714476317545, "loss": 0.5771, "step": 7710 }, { "epoch": 2.571714476317545, "grad_norm": 3.4679062366485596, "step": 7710 }, { "epoch": 2.571714476317545, "learning_rate": 0.00023799070845217381, "step": 7710 }, { "epoch": 2.571714476317545, "loss": 0.33432021737098694, "step": 7710 }, { "ce_loss": 0.1009223535656929, "epoch": 2.571714476317545, "step": 7710 }, { "distill_loss": 0.0869167223572731, "epoch": 2.571714476317545, "step": 7710 }, { "epoch": 2.571714476317545, "ref_ce_loss": 0.08801834285259247, "step": 7710 }, { "epoch": 2.571714476317545, "loss": 0.4419139325618744, "step": 7710 }, { "ce_loss": 0.1773403137922287, "epoch": 2.571714476317545, "step": 7710 }, { "distill_loss": 0.12590724229812622, "epoch": 2.571714476317545, "step": 7710 }, { "epoch": 2.571714476317545, "ref_ce_loss": 0.137704998254776, "step": 7710 }, { "epoch": 2.571714476317545, "loss": 0.505905032157898, "step": 7710 }, { "ce_loss": 0.21214526891708374, "epoch": 2.571714476317545, "step": 7710 }, { "distill_loss": 0.14007237553596497, "epoch": 2.571714476317545, "step": 7710 }, { "epoch": 2.571714476317545, "ref_ce_loss": 0.1139286682009697, "step": 7710 }, { "epoch": 2.571714476317545, "loss": 0.47575730085372925, "step": 7710 }, { "ce_loss": 0.11864303797483444, "epoch": 2.571714476317545, "step": 7710 }, { "distill_loss": 0.12276129424571991, "epoch": 2.571714476317545, "step": 7710 }, { "epoch": 2.571714476317545, "ref_ce_loss": 0.12149628251791, "step": 7710 }, { "epoch": 2.5750500333555704, "loss": 0.5578, "step": 7720 }, { "epoch": 2.5750500333555704, "grad_norm": 4.5611138343811035, "step": 7720 }, { "epoch": 2.5750500333555704, "learning_rate": 0.00023782657916540325, "step": 7720 }, { "epoch": 2.5750500333555704, "loss": 0.6805188655853271, "step": 7720 }, { "ce_loss": 0.2107529491186142, "epoch": 2.5750500333555704, "step": 7720 }, { "distill_loss": 0.12623649835586548, "epoch": 2.5750500333555704, "step": 7720 }, { "epoch": 2.5750500333555704, "ref_ce_loss": 0.146802619099617, "step": 7720 }, { "epoch": 2.5750500333555704, "loss": 0.6122359037399292, "step": 7720 }, { "ce_loss": 0.23602664470672607, "epoch": 2.5750500333555704, "step": 7720 }, { "distill_loss": 0.1204591616988182, "epoch": 2.5750500333555704, "step": 7720 }, { "epoch": 2.5750500333555704, "ref_ce_loss": 0.17816196382045746, "step": 7720 }, { "epoch": 2.5750500333555704, "loss": 0.5088629722595215, "step": 7720 }, { "ce_loss": 0.16029979288578033, "epoch": 2.5750500333555704, "step": 7720 }, { "distill_loss": 0.11510752141475677, "epoch": 2.5750500333555704, "step": 7720 }, { "epoch": 2.5750500333555704, "ref_ce_loss": 0.11622966080904007, "step": 7720 }, { "epoch": 2.5750500333555704, "loss": 0.4893638491630554, "step": 7720 }, { "ce_loss": 0.09988599270582199, "epoch": 2.5750500333555704, "step": 7720 }, { "distill_loss": 0.12076612561941147, "epoch": 2.5750500333555704, "step": 7720 }, { "epoch": 2.5750500333555704, "ref_ce_loss": 0.1032547876238823, "step": 7720 }, { "epoch": 2.5783855903935957, "loss": 0.5313, "step": 7730 }, { "epoch": 2.5783855903935957, "grad_norm": 3.259249687194824, "step": 7730 }, { "epoch": 2.5783855903935957, "learning_rate": 0.00023766228971759706, "step": 7730 }, { "epoch": 2.5783855903935957, "loss": 0.4709855914115906, "step": 7730 }, { "ce_loss": 0.17486350238323212, "epoch": 2.5783855903935957, "step": 7730 }, { "distill_loss": 0.11171098798513412, "epoch": 2.5783855903935957, "step": 7730 }, { "epoch": 2.5783855903935957, "ref_ce_loss": 0.13182541728019714, "step": 7730 }, { "epoch": 2.5783855903935957, "loss": 0.4668689966201782, "step": 7730 }, { "ce_loss": 0.18540440499782562, "epoch": 2.5783855903935957, "step": 7730 }, { "distill_loss": 0.15899895131587982, "epoch": 2.5783855903935957, "step": 7730 }, { "epoch": 2.5783855903935957, "ref_ce_loss": 0.08990299701690674, "step": 7730 }, { "epoch": 2.5783855903935957, "loss": 0.48005780577659607, "step": 7730 }, { "ce_loss": 0.21282799541950226, "epoch": 2.5783855903935957, "step": 7730 }, { "distill_loss": 0.15303613245487213, "epoch": 2.5783855903935957, "step": 7730 }, { "epoch": 2.5783855903935957, "ref_ce_loss": 0.09080526977777481, "step": 7730 }, { "epoch": 2.5783855903935957, "loss": 0.6611260175704956, "step": 7730 }, { "ce_loss": 0.18075811862945557, "epoch": 2.5783855903935957, "step": 7730 }, { "distill_loss": 0.17533832788467407, "epoch": 2.5783855903935957, "step": 7730 }, { "epoch": 2.5783855903935957, "ref_ce_loss": 0.1334233433008194, "step": 7730 }, { "epoch": 2.581721147431621, "loss": 0.5627, "step": 7740 }, { "epoch": 2.581721147431621, "grad_norm": 3.2500455379486084, "step": 7740 }, { "epoch": 2.581721147431621, "learning_rate": 0.00023749784040835438, "step": 7740 }, { "epoch": 2.581721147431621, "loss": 0.6345797777175903, "step": 7740 }, { "ce_loss": 0.21962440013885498, "epoch": 2.581721147431621, "step": 7740 }, { "distill_loss": 0.10905186831951141, "epoch": 2.581721147431621, "step": 7740 }, { "epoch": 2.581721147431621, "ref_ce_loss": 0.14250461757183075, "step": 7740 }, { "epoch": 2.581721147431621, "loss": 0.34530290961265564, "step": 7740 }, { "ce_loss": 0.1335408091545105, "epoch": 2.581721147431621, "step": 7740 }, { "distill_loss": 0.09475085884332657, "epoch": 2.581721147431621, "step": 7740 }, { "epoch": 2.581721147431621, "ref_ce_loss": 0.11538613587617874, "step": 7740 }, { "epoch": 2.581721147431621, "loss": 0.5248610973358154, "step": 7740 }, { "ce_loss": 0.19531312584877014, "epoch": 2.581721147431621, "step": 7740 }, { "distill_loss": 0.13128426671028137, "epoch": 2.581721147431621, "step": 7740 }, { "epoch": 2.581721147431621, "ref_ce_loss": 0.14382454752922058, "step": 7740 }, { "epoch": 2.581721147431621, "loss": 0.5610891580581665, "step": 7740 }, { "ce_loss": 0.18045581877231598, "epoch": 2.581721147431621, "step": 7740 }, { "distill_loss": 0.11995457112789154, "epoch": 2.581721147431621, "step": 7740 }, { "epoch": 2.581721147431621, "ref_ce_loss": 0.16676151752471924, "step": 7740 }, { "epoch": 2.5850567044696464, "loss": 0.5274, "step": 7750 }, { "epoch": 2.5850567044696464, "grad_norm": 2.872438669204712, "step": 7750 }, { "epoch": 2.5850567044696464, "learning_rate": 0.00023733323153756587, "step": 7750 }, { "epoch": 2.5850567044696464, "loss": 0.717305064201355, "step": 7750 }, { "ce_loss": 0.1562374085187912, "epoch": 2.5850567044696464, "step": 7750 }, { "distill_loss": 0.09478169679641724, "epoch": 2.5850567044696464, "step": 7750 }, { "epoch": 2.5850567044696464, "ref_ce_loss": 0.14355888962745667, "step": 7750 }, { "epoch": 2.5850567044696464, "loss": 0.5134044289588928, "step": 7750 }, { "ce_loss": 0.18214938044548035, "epoch": 2.5850567044696464, "step": 7750 }, { "distill_loss": 0.12028078734874725, "epoch": 2.5850567044696464, "step": 7750 }, { "epoch": 2.5850567044696464, "ref_ce_loss": 0.12155014276504517, "step": 7750 }, { "epoch": 2.5850567044696464, "loss": 0.6256336569786072, "step": 7750 }, { "ce_loss": 0.06659332662820816, "epoch": 2.5850567044696464, "step": 7750 }, { "distill_loss": 0.07671971619129181, "epoch": 2.5850567044696464, "step": 7750 }, { "epoch": 2.5850567044696464, "ref_ce_loss": 0.09547872841358185, "step": 7750 }, { "epoch": 2.5850567044696464, "loss": 0.7374293804168701, "step": 7750 }, { "ce_loss": 0.24218425154685974, "epoch": 2.5850567044696464, "step": 7750 }, { "distill_loss": 0.12191884964704514, "epoch": 2.5850567044696464, "step": 7750 }, { "epoch": 2.5850567044696464, "ref_ce_loss": 0.1148589551448822, "step": 7750 }, { "epoch": 2.5883922615076718, "loss": 0.5772, "step": 7760 }, { "epoch": 2.5883922615076718, "grad_norm": 4.004894256591797, "step": 7760 }, { "epoch": 2.5883922615076718, "learning_rate": 0.00023716846340541317, "step": 7760 }, { "epoch": 2.5883922615076718, "loss": 0.6149783134460449, "step": 7760 }, { "ce_loss": 0.13599584996700287, "epoch": 2.5883922615076718, "step": 7760 }, { "distill_loss": 0.09181113541126251, "epoch": 2.5883922615076718, "step": 7760 }, { "epoch": 2.5883922615076718, "ref_ce_loss": 0.0897497609257698, "step": 7760 }, { "epoch": 2.5883922615076718, "loss": 0.5708010196685791, "step": 7760 }, { "ce_loss": 0.22451792657375336, "epoch": 2.5883922615076718, "step": 7760 }, { "distill_loss": 0.1451644003391266, "epoch": 2.5883922615076718, "step": 7760 }, { "epoch": 2.5883922615076718, "ref_ce_loss": 0.14718560874462128, "step": 7760 }, { "epoch": 2.5883922615076718, "loss": 0.28180474042892456, "step": 7760 }, { "ce_loss": 0.07587552815675735, "epoch": 2.5883922615076718, "step": 7760 }, { "distill_loss": 0.08645644783973694, "epoch": 2.5883922615076718, "step": 7760 }, { "epoch": 2.5883922615076718, "ref_ce_loss": 0.08731941133737564, "step": 7760 }, { "epoch": 2.5883922615076718, "loss": 0.5717758536338806, "step": 7760 }, { "ce_loss": 0.14076487720012665, "epoch": 2.5883922615076718, "step": 7760 }, { "distill_loss": 0.11443852633237839, "epoch": 2.5883922615076718, "step": 7760 }, { "epoch": 2.5883922615076718, "ref_ce_loss": 0.08949923515319824, "step": 7760 }, { "epoch": 2.591727818545697, "loss": 0.5554, "step": 7770 }, { "epoch": 2.591727818545697, "grad_norm": 3.8314449787139893, "step": 7770 }, { "epoch": 2.591727818545697, "learning_rate": 0.00023700353631236838, "step": 7770 }, { "epoch": 2.591727818545697, "loss": 0.4185023307800293, "step": 7770 }, { "ce_loss": 0.15791524946689606, "epoch": 2.591727818545697, "step": 7770 }, { "distill_loss": 0.12973660230636597, "epoch": 2.591727818545697, "step": 7770 }, { "epoch": 2.591727818545697, "ref_ce_loss": 0.13063375651836395, "step": 7770 }, { "epoch": 2.591727818545697, "loss": 0.610988438129425, "step": 7770 }, { "ce_loss": 0.20933449268341064, "epoch": 2.591727818545697, "step": 7770 }, { "distill_loss": 0.10694743692874908, "epoch": 2.591727818545697, "step": 7770 }, { "epoch": 2.591727818545697, "ref_ce_loss": 0.15038563311100006, "step": 7770 }, { "epoch": 2.591727818545697, "loss": 0.43110722303390503, "step": 7770 }, { "ce_loss": 0.17192485928535461, "epoch": 2.591727818545697, "step": 7770 }, { "distill_loss": 0.13432137668132782, "epoch": 2.591727818545697, "step": 7770 }, { "epoch": 2.591727818545697, "ref_ce_loss": 0.12439984828233719, "step": 7770 }, { "epoch": 2.591727818545697, "loss": 0.3572756052017212, "step": 7770 }, { "ce_loss": 0.11541720479726791, "epoch": 2.591727818545697, "step": 7770 }, { "distill_loss": 0.09471176564693451, "epoch": 2.591727818545697, "step": 7770 }, { "epoch": 2.591727818545697, "ref_ce_loss": 0.07893867790699005, "step": 7770 }, { "epoch": 2.5950633755837225, "loss": 0.5271, "step": 7780 }, { "epoch": 2.5950633755837225, "grad_norm": 2.763399839401245, "step": 7780 }, { "epoch": 2.5950633755837225, "learning_rate": 0.00023683845055919348, "step": 7780 }, { "epoch": 2.5950633755837225, "loss": 0.8820345997810364, "step": 7780 }, { "ce_loss": 0.15303368866443634, "epoch": 2.5950633755837225, "step": 7780 }, { "distill_loss": 0.10918956249952316, "epoch": 2.5950633755837225, "step": 7780 }, { "epoch": 2.5950633755837225, "ref_ce_loss": 0.1406395137310028, "step": 7780 }, { "epoch": 2.5950633755837225, "loss": 0.5381264686584473, "step": 7780 }, { "ce_loss": 0.09407369047403336, "epoch": 2.5950633755837225, "step": 7780 }, { "distill_loss": 0.09477350115776062, "epoch": 2.5950633755837225, "step": 7780 }, { "epoch": 2.5950633755837225, "ref_ce_loss": 0.10281086713075638, "step": 7780 }, { "epoch": 2.5950633755837225, "loss": 0.5149239301681519, "step": 7780 }, { "ce_loss": 0.16611416637897491, "epoch": 2.5950633755837225, "step": 7780 }, { "distill_loss": 0.10362285375595093, "epoch": 2.5950633755837225, "step": 7780 }, { "epoch": 2.5950633755837225, "ref_ce_loss": 0.16059048473834991, "step": 7780 }, { "epoch": 2.5950633755837225, "loss": 0.34199070930480957, "step": 7780 }, { "ce_loss": 0.1551412045955658, "epoch": 2.5950633755837225, "step": 7780 }, { "distill_loss": 0.09238369017839432, "epoch": 2.5950633755837225, "step": 7780 }, { "epoch": 2.5950633755837225, "ref_ce_loss": 0.06881777942180634, "step": 7780 }, { "epoch": 2.598398932621748, "loss": 0.5522, "step": 7790 }, { "epoch": 2.598398932621748, "grad_norm": 5.211056232452393, "step": 7790 }, { "epoch": 2.598398932621748, "learning_rate": 0.00023667320644693972, "step": 7790 }, { "epoch": 2.598398932621748, "loss": 0.4736732244491577, "step": 7790 }, { "ce_loss": 0.16912175714969635, "epoch": 2.598398932621748, "step": 7790 }, { "distill_loss": 0.13457514345645905, "epoch": 2.598398932621748, "step": 7790 }, { "epoch": 2.598398932621748, "ref_ce_loss": 0.10892794281244278, "step": 7790 }, { "epoch": 2.598398932621748, "loss": 0.7895961999893188, "step": 7790 }, { "ce_loss": 0.12324633449316025, "epoch": 2.598398932621748, "step": 7790 }, { "distill_loss": 0.13549686968326569, "epoch": 2.598398932621748, "step": 7790 }, { "epoch": 2.598398932621748, "ref_ce_loss": 0.13592009246349335, "step": 7790 }, { "epoch": 2.598398932621748, "loss": 0.42667579650878906, "step": 7790 }, { "ce_loss": 0.15414555370807648, "epoch": 2.598398932621748, "step": 7790 }, { "distill_loss": 0.1340002715587616, "epoch": 2.598398932621748, "step": 7790 }, { "epoch": 2.598398932621748, "ref_ce_loss": 0.10248945653438568, "step": 7790 }, { "epoch": 2.598398932621748, "loss": 0.6181153059005737, "step": 7790 }, { "ce_loss": 0.20727142691612244, "epoch": 2.598398932621748, "step": 7790 }, { "distill_loss": 0.1465279459953308, "epoch": 2.598398932621748, "step": 7790 }, { "epoch": 2.598398932621748, "ref_ce_loss": 0.10699298232793808, "step": 7790 }, { "epoch": 2.601734489659773, "loss": 0.5686, "step": 7800 }, { "epoch": 2.601734489659773, "grad_norm": 2.9544246196746826, "step": 7800 }, { "epoch": 2.601734489659773, "learning_rate": 0.0002365078042769472, "step": 7800 }, { "epoch": 2.601734489659773, "loss": 0.7977356910705566, "step": 7800 }, { "ce_loss": 0.14952455461025238, "epoch": 2.601734489659773, "step": 7800 }, { "distill_loss": 0.13543793559074402, "epoch": 2.601734489659773, "step": 7800 }, { "epoch": 2.601734489659773, "ref_ce_loss": 0.13750240206718445, "step": 7800 }, { "epoch": 2.601734489659773, "loss": 0.5125139355659485, "step": 7800 }, { "ce_loss": 0.11975100636482239, "epoch": 2.601734489659773, "step": 7800 }, { "distill_loss": 0.11853191256523132, "epoch": 2.601734489659773, "step": 7800 }, { "epoch": 2.601734489659773, "ref_ce_loss": 0.11989688873291016, "step": 7800 }, { "epoch": 2.601734489659773, "loss": 0.6820304989814758, "step": 7800 }, { "ce_loss": 0.25920236110687256, "epoch": 2.601734489659773, "step": 7800 }, { "distill_loss": 0.16221410036087036, "epoch": 2.601734489659773, "step": 7800 }, { "epoch": 2.601734489659773, "ref_ce_loss": 0.13963061571121216, "step": 7800 }, { "epoch": 2.601734489659773, "loss": 0.48601311445236206, "step": 7800 }, { "ce_loss": 0.16878481209278107, "epoch": 2.601734489659773, "step": 7800 }, { "distill_loss": 0.12551510334014893, "epoch": 2.601734489659773, "step": 7800 }, { "epoch": 2.601734489659773, "ref_ce_loss": 0.13169850409030914, "step": 7800 }, { "epoch": 2.6050700466977985, "loss": 0.5393, "step": 7810 }, { "epoch": 2.6050700466977985, "grad_norm": 3.2493793964385986, "step": 7810 }, { "epoch": 2.6050700466977985, "learning_rate": 0.00023634224435084417, "step": 7810 }, { "epoch": 2.6050700466977985, "loss": 0.6498173475265503, "step": 7810 }, { "ce_loss": 0.22611533105373383, "epoch": 2.6050700466977985, "step": 7810 }, { "distill_loss": 0.22686871886253357, "epoch": 2.6050700466977985, "step": 7810 }, { "epoch": 2.6050700466977985, "ref_ce_loss": 0.17064939439296722, "step": 7810 }, { "epoch": 2.6050700466977985, "loss": 0.624129056930542, "step": 7810 }, { "ce_loss": 0.23449084162712097, "epoch": 2.6050700466977985, "step": 7810 }, { "distill_loss": 0.19048425555229187, "epoch": 2.6050700466977985, "step": 7810 }, { "epoch": 2.6050700466977985, "ref_ce_loss": 0.1580566167831421, "step": 7810 }, { "epoch": 2.6050700466977985, "loss": 0.5485934019088745, "step": 7810 }, { "ce_loss": 0.21335983276367188, "epoch": 2.6050700466977985, "step": 7810 }, { "distill_loss": 0.17096811532974243, "epoch": 2.6050700466977985, "step": 7810 }, { "epoch": 2.6050700466977985, "ref_ce_loss": 0.12088602781295776, "step": 7810 }, { "epoch": 2.6050700466977985, "loss": 0.32192063331604004, "step": 7810 }, { "ce_loss": 0.08201882988214493, "epoch": 2.6050700466977985, "step": 7810 }, { "distill_loss": 0.15331301093101501, "epoch": 2.6050700466977985, "step": 7810 }, { "epoch": 2.6050700466977985, "ref_ce_loss": 0.08554382622241974, "step": 7810 }, { "epoch": 2.608405603735824, "loss": 0.586, "step": 7820 }, { "epoch": 2.608405603735824, "grad_norm": 3.0376055240631104, "step": 7820 }, { "epoch": 2.608405603735824, "learning_rate": 0.00023617652697054673, "step": 7820 }, { "epoch": 2.608405603735824, "loss": 0.6930296421051025, "step": 7820 }, { "ce_loss": 0.2070154845714569, "epoch": 2.608405603735824, "step": 7820 }, { "distill_loss": 0.13998956978321075, "epoch": 2.608405603735824, "step": 7820 }, { "epoch": 2.608405603735824, "ref_ce_loss": 0.1303083449602127, "step": 7820 }, { "epoch": 2.608405603735824, "loss": 0.5493748784065247, "step": 7820 }, { "ce_loss": 0.11979985982179642, "epoch": 2.608405603735824, "step": 7820 }, { "distill_loss": 0.11463198810815811, "epoch": 2.608405603735824, "step": 7820 }, { "epoch": 2.608405603735824, "ref_ce_loss": 0.10523556917905807, "step": 7820 }, { "epoch": 2.608405603735824, "loss": 0.4238487482070923, "step": 7820 }, { "ce_loss": 0.1376940906047821, "epoch": 2.608405603735824, "step": 7820 }, { "distill_loss": 0.1326865404844284, "epoch": 2.608405603735824, "step": 7820 }, { "epoch": 2.608405603735824, "ref_ce_loss": 0.12547308206558228, "step": 7820 }, { "epoch": 2.608405603735824, "loss": 0.639647901058197, "step": 7820 }, { "ce_loss": 0.30324020981788635, "epoch": 2.608405603735824, "step": 7820 }, { "distill_loss": 0.11521129310131073, "epoch": 2.608405603735824, "step": 7820 }, { "epoch": 2.608405603735824, "ref_ce_loss": 0.22053390741348267, "step": 7820 }, { "epoch": 2.611741160773849, "loss": 0.5546, "step": 7830 }, { "epoch": 2.611741160773849, "grad_norm": 4.630809783935547, "step": 7830 }, { "epoch": 2.611741160773849, "learning_rate": 0.00023601065243825795, "step": 7830 }, { "epoch": 2.611741160773849, "loss": 0.4851466715335846, "step": 7830 }, { "ce_loss": 0.18493600189685822, "epoch": 2.611741160773849, "step": 7830 }, { "distill_loss": 0.12290176749229431, "epoch": 2.611741160773849, "step": 7830 }, { "epoch": 2.611741160773849, "ref_ce_loss": 0.11735208332538605, "step": 7830 }, { "epoch": 2.611741160773849, "loss": 0.5059221982955933, "step": 7830 }, { "ce_loss": 0.21578659117221832, "epoch": 2.611741160773849, "step": 7830 }, { "distill_loss": 0.17096026241779327, "epoch": 2.611741160773849, "step": 7830 }, { "epoch": 2.611741160773849, "ref_ce_loss": 0.11815796047449112, "step": 7830 }, { "epoch": 2.611741160773849, "loss": 0.5575016140937805, "step": 7830 }, { "ce_loss": 0.18043434619903564, "epoch": 2.611741160773849, "step": 7830 }, { "distill_loss": 0.12575297057628632, "epoch": 2.611741160773849, "step": 7830 }, { "epoch": 2.611741160773849, "ref_ce_loss": 0.15635573863983154, "step": 7830 }, { "epoch": 2.611741160773849, "loss": 0.31332072615623474, "step": 7830 }, { "ce_loss": 0.09760003536939621, "epoch": 2.611741160773849, "step": 7830 }, { "distill_loss": 0.10402275621891022, "epoch": 2.611741160773849, "step": 7830 }, { "epoch": 2.611741160773849, "ref_ce_loss": 0.11161261796951294, "step": 7830 }, { "epoch": 2.6150767178118746, "loss": 0.5233, "step": 7840 }, { "epoch": 2.6150767178118746, "grad_norm": 2.5343832969665527, "step": 7840 }, { "epoch": 2.6150767178118746, "learning_rate": 0.00023584462105646754, "step": 7840 }, { "epoch": 2.6150767178118746, "loss": 0.5742359757423401, "step": 7840 }, { "ce_loss": 0.22729800641536713, "epoch": 2.6150767178118746, "step": 7840 }, { "distill_loss": 0.1504513919353485, "epoch": 2.6150767178118746, "step": 7840 }, { "epoch": 2.6150767178118746, "ref_ce_loss": 0.14994561672210693, "step": 7840 }, { "epoch": 2.6150767178118746, "loss": 0.5492827892303467, "step": 7840 }, { "ce_loss": 0.14900290966033936, "epoch": 2.6150767178118746, "step": 7840 }, { "distill_loss": 0.13081350922584534, "epoch": 2.6150767178118746, "step": 7840 }, { "epoch": 2.6150767178118746, "ref_ce_loss": 0.10538212954998016, "step": 7840 }, { "epoch": 2.6150767178118746, "loss": 0.3085349500179291, "step": 7840 }, { "ce_loss": 0.09896077960729599, "epoch": 2.6150767178118746, "step": 7840 }, { "distill_loss": 0.10900798439979553, "epoch": 2.6150767178118746, "step": 7840 }, { "epoch": 2.6150767178118746, "ref_ce_loss": 0.10040585696697235, "step": 7840 }, { "epoch": 2.6150767178118746, "loss": 0.3425389528274536, "step": 7840 }, { "ce_loss": 0.1181720420718193, "epoch": 2.6150767178118746, "step": 7840 }, { "distill_loss": 0.0888860896229744, "epoch": 2.6150767178118746, "step": 7840 }, { "epoch": 2.6150767178118746, "ref_ce_loss": 0.1350540816783905, "step": 7840 }, { "epoch": 2.6184122748499, "loss": 0.5468, "step": 7850 }, { "epoch": 2.6184122748499, "grad_norm": 3.092991828918457, "step": 7850 }, { "epoch": 2.6184122748499, "learning_rate": 0.0002356784331279513, "step": 7850 }, { "epoch": 2.6184122748499, "loss": 0.3807421922683716, "step": 7850 }, { "ce_loss": 0.1497032344341278, "epoch": 2.6184122748499, "step": 7850 }, { "distill_loss": 0.13074158132076263, "epoch": 2.6184122748499, "step": 7850 }, { "epoch": 2.6184122748499, "ref_ce_loss": 0.10015086084604263, "step": 7850 }, { "epoch": 2.6184122748499, "loss": 0.5140164494514465, "step": 7850 }, { "ce_loss": 0.07548467069864273, "epoch": 2.6184122748499, "step": 7850 }, { "distill_loss": 0.09762059152126312, "epoch": 2.6184122748499, "step": 7850 }, { "epoch": 2.6184122748499, "ref_ce_loss": 0.09445083141326904, "step": 7850 }, { "epoch": 2.6184122748499, "loss": 0.5912425518035889, "step": 7850 }, { "ce_loss": 0.15175721049308777, "epoch": 2.6184122748499, "step": 7850 }, { "distill_loss": 0.1264921873807907, "epoch": 2.6184122748499, "step": 7850 }, { "epoch": 2.6184122748499, "ref_ce_loss": 0.15066170692443848, "step": 7850 }, { "epoch": 2.6184122748499, "loss": 0.5041841864585876, "step": 7850 }, { "ce_loss": 0.19449378550052643, "epoch": 2.6184122748499, "step": 7850 }, { "distill_loss": 0.12563063204288483, "epoch": 2.6184122748499, "step": 7850 }, { "epoch": 2.6184122748499, "ref_ce_loss": 0.1535726934671402, "step": 7850 }, { "epoch": 2.6217478318879253, "loss": 0.4922, "step": 7860 }, { "epoch": 2.6217478318879253, "grad_norm": 2.572794198989868, "step": 7860 }, { "epoch": 2.6217478318879253, "learning_rate": 0.00023551208895577038, "step": 7860 }, { "epoch": 2.6217478318879253, "loss": 0.3305208086967468, "step": 7860 }, { "ce_loss": 0.07853977382183075, "epoch": 2.6217478318879253, "step": 7860 }, { "distill_loss": 0.11406944692134857, "epoch": 2.6217478318879253, "step": 7860 }, { "epoch": 2.6217478318879253, "ref_ce_loss": 0.07194974273443222, "step": 7860 }, { "epoch": 2.6217478318879253, "loss": 0.40856409072875977, "step": 7860 }, { "ce_loss": 0.12992922961711884, "epoch": 2.6217478318879253, "step": 7860 }, { "distill_loss": 0.1046760231256485, "epoch": 2.6217478318879253, "step": 7860 }, { "epoch": 2.6217478318879253, "ref_ce_loss": 0.126336470246315, "step": 7860 }, { "epoch": 2.6217478318879253, "loss": 0.5514043569564819, "step": 7860 }, { "ce_loss": 0.2061086893081665, "epoch": 2.6217478318879253, "step": 7860 }, { "distill_loss": 0.10851980000734329, "epoch": 2.6217478318879253, "step": 7860 }, { "epoch": 2.6217478318879253, "ref_ce_loss": 0.181810200214386, "step": 7860 }, { "epoch": 2.6217478318879253, "loss": 0.6222173571586609, "step": 7860 }, { "ce_loss": 0.1734030842781067, "epoch": 2.6217478318879253, "step": 7860 }, { "distill_loss": 0.1488630175590515, "epoch": 2.6217478318879253, "step": 7860 }, { "epoch": 2.6217478318879253, "ref_ce_loss": 0.12546145915985107, "step": 7860 }, { "epoch": 2.6250833889259506, "loss": 0.5458, "step": 7870 }, { "epoch": 2.6250833889259506, "grad_norm": 2.7927324771881104, "step": 7870 }, { "epoch": 2.6250833889259506, "learning_rate": 0.000235345588843271, "step": 7870 }, { "epoch": 2.6250833889259506, "loss": 0.32617244124412537, "step": 7870 }, { "ce_loss": 0.052165593951940536, "epoch": 2.6250833889259506, "step": 7870 }, { "distill_loss": 0.09367629140615463, "epoch": 2.6250833889259506, "step": 7870 }, { "epoch": 2.6250833889259506, "ref_ce_loss": 0.09321639686822891, "step": 7870 }, { "epoch": 2.6250833889259506, "loss": 0.6275125741958618, "step": 7870 }, { "ce_loss": 0.15996789932250977, "epoch": 2.6250833889259506, "step": 7870 }, { "distill_loss": 0.10561716556549072, "epoch": 2.6250833889259506, "step": 7870 }, { "epoch": 2.6250833889259506, "ref_ce_loss": 0.11221954226493835, "step": 7870 }, { "epoch": 2.6250833889259506, "loss": 0.43381965160369873, "step": 7870 }, { "ce_loss": 0.1376059651374817, "epoch": 2.6250833889259506, "step": 7870 }, { "distill_loss": 0.1076732650399208, "epoch": 2.6250833889259506, "step": 7870 }, { "epoch": 2.6250833889259506, "ref_ce_loss": 0.15326763689517975, "step": 7870 }, { "epoch": 2.6250833889259506, "loss": 0.4509509801864624, "step": 7870 }, { "ce_loss": 0.1815311163663864, "epoch": 2.6250833889259506, "step": 7870 }, { "distill_loss": 0.09893777966499329, "epoch": 2.6250833889259506, "step": 7870 }, { "epoch": 2.6250833889259506, "ref_ce_loss": 0.14123815298080444, "step": 7870 }, { "epoch": 2.628418945963976, "loss": 0.5622, "step": 7880 }, { "epoch": 2.628418945963976, "grad_norm": 3.051100492477417, "step": 7880 }, { "epoch": 2.628418945963976, "learning_rate": 0.0002351789330940836, "step": 7880 }, { "epoch": 2.628418945963976, "loss": 0.4621231257915497, "step": 7880 }, { "ce_loss": 0.20068912208080292, "epoch": 2.628418945963976, "step": 7880 }, { "distill_loss": 0.10784181952476501, "epoch": 2.628418945963976, "step": 7880 }, { "epoch": 2.628418945963976, "ref_ce_loss": 0.10929391533136368, "step": 7880 }, { "epoch": 2.628418945963976, "loss": 0.5110466480255127, "step": 7880 }, { "ce_loss": 0.14794176816940308, "epoch": 2.628418945963976, "step": 7880 }, { "distill_loss": 0.08582356572151184, "epoch": 2.628418945963976, "step": 7880 }, { "epoch": 2.628418945963976, "ref_ce_loss": 0.09186380356550217, "step": 7880 }, { "epoch": 2.628418945963976, "loss": 0.31455156207084656, "step": 7880 }, { "ce_loss": 0.09946168214082718, "epoch": 2.628418945963976, "step": 7880 }, { "distill_loss": 0.08664387464523315, "epoch": 2.628418945963976, "step": 7880 }, { "epoch": 2.628418945963976, "ref_ce_loss": 0.09416986256837845, "step": 7880 }, { "epoch": 2.628418945963976, "loss": 0.7443742752075195, "step": 7880 }, { "ce_loss": 0.17656545341014862, "epoch": 2.628418945963976, "step": 7880 }, { "distill_loss": 0.11620085686445236, "epoch": 2.628418945963976, "step": 7880 }, { "epoch": 2.628418945963976, "ref_ce_loss": 0.09211236983537674, "step": 7880 }, { "epoch": 2.6317545030020013, "loss": 0.5989, "step": 7890 }, { "epoch": 2.6317545030020013, "grad_norm": 2.1629552841186523, "step": 7890 }, { "epoch": 2.6317545030020013, "learning_rate": 0.00023501212201212262, "step": 7890 }, { "epoch": 2.6317545030020013, "loss": 0.5096309185028076, "step": 7890 }, { "ce_loss": 0.09982051700353622, "epoch": 2.6317545030020013, "step": 7890 }, { "distill_loss": 0.11478620767593384, "epoch": 2.6317545030020013, "step": 7890 }, { "epoch": 2.6317545030020013, "ref_ce_loss": 0.14122353494167328, "step": 7890 }, { "epoch": 2.6317545030020013, "loss": 0.608100414276123, "step": 7890 }, { "ce_loss": 0.20234572887420654, "epoch": 2.6317545030020013, "step": 7890 }, { "distill_loss": 0.13174159824848175, "epoch": 2.6317545030020013, "step": 7890 }, { "epoch": 2.6317545030020013, "ref_ce_loss": 0.14330333471298218, "step": 7890 }, { "epoch": 2.6317545030020013, "loss": 0.5019533634185791, "step": 7890 }, { "ce_loss": 0.18130330741405487, "epoch": 2.6317545030020013, "step": 7890 }, { "distill_loss": 0.10709169507026672, "epoch": 2.6317545030020013, "step": 7890 }, { "epoch": 2.6317545030020013, "ref_ce_loss": 0.15589384734630585, "step": 7890 }, { "epoch": 2.6317545030020013, "loss": 0.4099520742893219, "step": 7890 }, { "ce_loss": 0.1622970849275589, "epoch": 2.6317545030020013, "step": 7890 }, { "distill_loss": 0.13596194982528687, "epoch": 2.6317545030020013, "step": 7890 }, { "epoch": 2.6317545030020013, "ref_ce_loss": 0.11146480590105057, "step": 7890 }, { "epoch": 2.6350900600400267, "loss": 0.5707, "step": 7900 }, { "epoch": 2.6350900600400267, "grad_norm": 2.643676996231079, "step": 7900 }, { "epoch": 2.6350900600400267, "learning_rate": 0.00023484515590158566, "step": 7900 }, { "epoch": 2.6350900600400267, "loss": 0.37032589316368103, "step": 7900 }, { "ce_loss": 0.1284305900335312, "epoch": 2.6350900600400267, "step": 7900 }, { "distill_loss": 0.10827270150184631, "epoch": 2.6350900600400267, "step": 7900 }, { "epoch": 2.6350900600400267, "ref_ce_loss": 0.08193215727806091, "step": 7900 }, { "epoch": 2.6350900600400267, "loss": 0.3987683951854706, "step": 7900 }, { "ce_loss": 0.1566096395254135, "epoch": 2.6350900600400267, "step": 7900 }, { "distill_loss": 0.14688178896903992, "epoch": 2.6350900600400267, "step": 7900 }, { "epoch": 2.6350900600400267, "ref_ce_loss": 0.09464366734027863, "step": 7900 }, { "epoch": 2.6350900600400267, "loss": 0.9638165235519409, "step": 7900 }, { "ce_loss": 0.27305978536605835, "epoch": 2.6350900600400267, "step": 7900 }, { "distill_loss": 0.18029692769050598, "epoch": 2.6350900600400267, "step": 7900 }, { "epoch": 2.6350900600400267, "ref_ce_loss": 0.15340293943881989, "step": 7900 }, { "epoch": 2.6350900600400267, "loss": 0.4966169595718384, "step": 7900 }, { "ce_loss": 0.13691580295562744, "epoch": 2.6350900600400267, "step": 7900 }, { "distill_loss": 0.11072009056806564, "epoch": 2.6350900600400267, "step": 7900 }, { "epoch": 2.6350900600400267, "ref_ce_loss": 0.14098653197288513, "step": 7900 }, { "epoch": 2.638425617078052, "loss": 0.5957, "step": 7910 }, { "epoch": 2.638425617078052, "grad_norm": 1.870453953742981, "step": 7910 }, { "epoch": 2.638425617078052, "learning_rate": 0.00023467803506695305, "step": 7910 }, { "epoch": 2.638425617078052, "loss": 0.3408009707927704, "step": 7910 }, { "ce_loss": 0.1164073795080185, "epoch": 2.638425617078052, "step": 7910 }, { "distill_loss": 0.11458929628133774, "epoch": 2.638425617078052, "step": 7910 }, { "epoch": 2.638425617078052, "ref_ce_loss": 0.0619593970477581, "step": 7910 }, { "epoch": 2.638425617078052, "loss": 0.6032022833824158, "step": 7910 }, { "ce_loss": 0.13740438222885132, "epoch": 2.638425617078052, "step": 7910 }, { "distill_loss": 0.18423528969287872, "epoch": 2.638425617078052, "step": 7910 }, { "epoch": 2.638425617078052, "ref_ce_loss": 0.15096089243888855, "step": 7910 }, { "epoch": 2.638425617078052, "loss": 0.3866853713989258, "step": 7910 }, { "ce_loss": 0.11736882477998734, "epoch": 2.638425617078052, "step": 7910 }, { "distill_loss": 0.1122303456068039, "epoch": 2.638425617078052, "step": 7910 }, { "epoch": 2.638425617078052, "ref_ce_loss": 0.10660815238952637, "step": 7910 }, { "epoch": 2.638425617078052, "loss": 0.4587579369544983, "step": 7910 }, { "ce_loss": 0.15818382799625397, "epoch": 2.638425617078052, "step": 7910 }, { "distill_loss": 0.12847086787223816, "epoch": 2.638425617078052, "step": 7910 }, { "epoch": 2.638425617078052, "ref_ce_loss": 0.11664664000272751, "step": 7910 }, { "epoch": 2.6417611741160774, "loss": 0.5464, "step": 7920 }, { "epoch": 2.6417611741160774, "grad_norm": 2.7910468578338623, "step": 7920 }, { "epoch": 2.6417611741160774, "learning_rate": 0.00023451075981298716, "step": 7920 }, { "epoch": 2.6417611741160774, "loss": 0.4135196805000305, "step": 7920 }, { "ce_loss": 0.162226140499115, "epoch": 2.6417611741160774, "step": 7920 }, { "distill_loss": 0.12755383551120758, "epoch": 2.6417611741160774, "step": 7920 }, { "epoch": 2.6417611741160774, "ref_ce_loss": 0.092817023396492, "step": 7920 }, { "epoch": 2.6417611741160774, "loss": 0.96152663230896, "step": 7920 }, { "ce_loss": 0.183192178606987, "epoch": 2.6417611741160774, "step": 7920 }, { "distill_loss": 0.17160940170288086, "epoch": 2.6417611741160774, "step": 7920 }, { "epoch": 2.6417611741160774, "ref_ce_loss": 0.09496646374464035, "step": 7920 }, { "epoch": 2.6417611741160774, "loss": 0.5214636921882629, "step": 7920 }, { "ce_loss": 0.14947235584259033, "epoch": 2.6417611741160774, "step": 7920 }, { "distill_loss": 0.11900770664215088, "epoch": 2.6417611741160774, "step": 7920 }, { "epoch": 2.6417611741160774, "ref_ce_loss": 0.11163744330406189, "step": 7920 }, { "epoch": 2.6417611741160774, "loss": 0.2927347719669342, "step": 7920 }, { "ce_loss": 0.08152108639478683, "epoch": 2.6417611741160774, "step": 7920 }, { "distill_loss": 0.10856892913579941, "epoch": 2.6417611741160774, "step": 7920 }, { "epoch": 2.6417611741160774, "ref_ce_loss": 0.10249659419059753, "step": 7920 }, { "epoch": 2.6450967311541027, "loss": 0.5723, "step": 7930 }, { "epoch": 2.6450967311541027, "grad_norm": 4.5653533935546875, "step": 7930 }, { "epoch": 2.6450967311541027, "learning_rate": 0.00023434333044473215, "step": 7930 }, { "epoch": 2.6450967311541027, "loss": 0.5235077738761902, "step": 7930 }, { "ce_loss": 0.13343635201454163, "epoch": 2.6450967311541027, "step": 7930 }, { "distill_loss": 0.257224977016449, "epoch": 2.6450967311541027, "step": 7930 }, { "epoch": 2.6450967311541027, "ref_ce_loss": 0.10070498287677765, "step": 7930 }, { "epoch": 2.6450967311541027, "loss": 0.4883321225643158, "step": 7930 }, { "ce_loss": 0.130856454372406, "epoch": 2.6450967311541027, "step": 7930 }, { "distill_loss": 0.15088599920272827, "epoch": 2.6450967311541027, "step": 7930 }, { "epoch": 2.6450967311541027, "ref_ce_loss": 0.16542506217956543, "step": 7930 }, { "epoch": 2.6450967311541027, "loss": 0.7994807958602905, "step": 7930 }, { "ce_loss": 0.1741330325603485, "epoch": 2.6450967311541027, "step": 7930 }, { "distill_loss": 0.3023594319820404, "epoch": 2.6450967311541027, "step": 7930 }, { "epoch": 2.6450967311541027, "ref_ce_loss": 0.11508870869874954, "step": 7930 }, { "epoch": 2.6450967311541027, "loss": 0.5890263319015503, "step": 7930 }, { "ce_loss": 0.17294292151927948, "epoch": 2.6450967311541027, "step": 7930 }, { "distill_loss": 0.20210260152816772, "epoch": 2.6450967311541027, "step": 7930 }, { "epoch": 2.6450967311541027, "ref_ce_loss": 0.15082262456417084, "step": 7930 }, { "epoch": 2.648432288192128, "loss": 0.6198, "step": 7940 }, { "epoch": 2.648432288192128, "grad_norm": 2.13840913772583, "step": 7940 }, { "epoch": 2.648432288192128, "learning_rate": 0.00023417574726751318, "step": 7940 }, { "epoch": 2.648432288192128, "loss": 0.7031892538070679, "step": 7940 }, { "ce_loss": 0.13717004656791687, "epoch": 2.648432288192128, "step": 7940 }, { "distill_loss": 0.14210714399814606, "epoch": 2.648432288192128, "step": 7940 }, { "epoch": 2.648432288192128, "ref_ce_loss": 0.11063428968191147, "step": 7940 }, { "epoch": 2.648432288192128, "loss": 0.7125575542449951, "step": 7940 }, { "ce_loss": 0.20230668783187866, "epoch": 2.648432288192128, "step": 7940 }, { "distill_loss": 0.25111111998558044, "epoch": 2.648432288192128, "step": 7940 }, { "epoch": 2.648432288192128, "ref_ce_loss": 0.2036392092704773, "step": 7940 }, { "epoch": 2.648432288192128, "loss": 0.5634617805480957, "step": 7940 }, { "ce_loss": 0.20257918536663055, "epoch": 2.648432288192128, "step": 7940 }, { "distill_loss": 0.17622129619121552, "epoch": 2.648432288192128, "step": 7940 }, { "epoch": 2.648432288192128, "ref_ce_loss": 0.13330873847007751, "step": 7940 }, { "epoch": 2.648432288192128, "loss": 0.7062669396400452, "step": 7940 }, { "ce_loss": 0.08234570175409317, "epoch": 2.648432288192128, "step": 7940 }, { "distill_loss": 0.24133436381816864, "epoch": 2.648432288192128, "step": 7940 }, { "epoch": 2.648432288192128, "ref_ce_loss": 0.10071347653865814, "step": 7940 }, { "epoch": 2.6517678452301534, "loss": 0.5858, "step": 7950 }, { "epoch": 2.6517678452301534, "grad_norm": 2.677126884460449, "step": 7950 }, { "epoch": 2.6517678452301534, "learning_rate": 0.0002340080105869358, "step": 7950 }, { "epoch": 2.6517678452301534, "loss": 0.5237668752670288, "step": 7950 }, { "ce_loss": 0.20540541410446167, "epoch": 2.6517678452301534, "step": 7950 }, { "distill_loss": 0.11389851570129395, "epoch": 2.6517678452301534, "step": 7950 }, { "epoch": 2.6517678452301534, "ref_ce_loss": 0.13045738637447357, "step": 7950 }, { "epoch": 2.6517678452301534, "loss": 0.5186854600906372, "step": 7950 }, { "ce_loss": 0.18487921357154846, "epoch": 2.6517678452301534, "step": 7950 }, { "distill_loss": 0.14122274518013, "epoch": 2.6517678452301534, "step": 7950 }, { "epoch": 2.6517678452301534, "ref_ce_loss": 0.15152312815189362, "step": 7950 }, { "epoch": 2.6517678452301534, "loss": 0.49603432416915894, "step": 7950 }, { "ce_loss": 0.1827073097229004, "epoch": 2.6517678452301534, "step": 7950 }, { "distill_loss": 0.1372450590133667, "epoch": 2.6517678452301534, "step": 7950 }, { "epoch": 2.6517678452301534, "ref_ce_loss": 0.12451291084289551, "step": 7950 }, { "epoch": 2.6517678452301534, "loss": 0.6272179484367371, "step": 7950 }, { "ce_loss": 0.27008381485939026, "epoch": 2.6517678452301534, "step": 7950 }, { "distill_loss": 0.1309342086315155, "epoch": 2.6517678452301534, "step": 7950 }, { "epoch": 2.6517678452301534, "ref_ce_loss": 0.1763468235731125, "step": 7950 }, { "epoch": 2.6551034022681788, "loss": 0.6062, "step": 7960 }, { "epoch": 2.6551034022681788, "grad_norm": 1.8981552124023438, "step": 7960 }, { "epoch": 2.6551034022681788, "learning_rate": 0.00023384012070888557, "step": 7960 }, { "epoch": 2.6551034022681788, "loss": 0.6313665509223938, "step": 7960 }, { "ce_loss": 0.12360011041164398, "epoch": 2.6551034022681788, "step": 7960 }, { "distill_loss": 0.1303849071264267, "epoch": 2.6551034022681788, "step": 7960 }, { "epoch": 2.6551034022681788, "ref_ce_loss": 0.13423965871334076, "step": 7960 }, { "epoch": 2.6551034022681788, "loss": 0.4521801173686981, "step": 7960 }, { "ce_loss": 0.20214208960533142, "epoch": 2.6551034022681788, "step": 7960 }, { "distill_loss": 0.13033844530582428, "epoch": 2.6551034022681788, "step": 7960 }, { "epoch": 2.6551034022681788, "ref_ce_loss": 0.11876943707466125, "step": 7960 }, { "epoch": 2.6551034022681788, "loss": 0.7230217456817627, "step": 7960 }, { "ce_loss": 0.20204299688339233, "epoch": 2.6551034022681788, "step": 7960 }, { "distill_loss": 0.14705559611320496, "epoch": 2.6551034022681788, "step": 7960 }, { "epoch": 2.6551034022681788, "ref_ce_loss": 0.12485090643167496, "step": 7960 }, { "epoch": 2.6551034022681788, "loss": 0.33352601528167725, "step": 7960 }, { "ce_loss": 0.08580795675516129, "epoch": 2.6551034022681788, "step": 7960 }, { "distill_loss": 0.08311134576797485, "epoch": 2.6551034022681788, "step": 7960 }, { "epoch": 2.6551034022681788, "ref_ce_loss": 0.0709758847951889, "step": 7960 }, { "epoch": 2.658438959306204, "loss": 0.5841, "step": 7970 }, { "epoch": 2.658438959306204, "grad_norm": 7.362756729125977, "step": 7970 }, { "epoch": 2.658438959306204, "learning_rate": 0.00023367207793952737, "step": 7970 }, { "epoch": 2.658438959306204, "loss": 0.38419678807258606, "step": 7970 }, { "ce_loss": 0.09084519743919373, "epoch": 2.658438959306204, "step": 7970 }, { "distill_loss": 0.13838332891464233, "epoch": 2.658438959306204, "step": 7970 }, { "epoch": 2.658438959306204, "ref_ce_loss": 0.12994177639484406, "step": 7970 }, { "epoch": 2.658438959306204, "loss": 0.5091409683227539, "step": 7970 }, { "ce_loss": 0.17732951045036316, "epoch": 2.658438959306204, "step": 7970 }, { "distill_loss": 0.12083851546049118, "epoch": 2.658438959306204, "step": 7970 }, { "epoch": 2.658438959306204, "ref_ce_loss": 0.13549870252609253, "step": 7970 }, { "epoch": 2.658438959306204, "loss": 0.5226490497589111, "step": 7970 }, { "ce_loss": 0.17717614769935608, "epoch": 2.658438959306204, "step": 7970 }, { "distill_loss": 0.131042018532753, "epoch": 2.658438959306204, "step": 7970 }, { "epoch": 2.658438959306204, "ref_ce_loss": 0.15869306027889252, "step": 7970 }, { "epoch": 2.658438959306204, "loss": 0.5586010813713074, "step": 7970 }, { "ce_loss": 0.16853399574756622, "epoch": 2.658438959306204, "step": 7970 }, { "distill_loss": 0.21672941744327545, "epoch": 2.658438959306204, "step": 7970 }, { "epoch": 2.658438959306204, "ref_ce_loss": 0.121253103017807, "step": 7970 }, { "epoch": 2.6617745163442295, "loss": 0.5451, "step": 7980 }, { "epoch": 2.6617745163442295, "grad_norm": 2.5755162239074707, "step": 7980 }, { "epoch": 2.6617745163442295, "learning_rate": 0.00023350388258530497, "step": 7980 }, { "epoch": 2.6617745163442295, "loss": 0.48963046073913574, "step": 7980 }, { "ce_loss": 0.16120007634162903, "epoch": 2.6617745163442295, "step": 7980 }, { "distill_loss": 0.20963573455810547, "epoch": 2.6617745163442295, "step": 7980 }, { "epoch": 2.6617745163442295, "ref_ce_loss": 0.118538036942482, "step": 7980 }, { "epoch": 2.6617745163442295, "loss": 0.5607742071151733, "step": 7980 }, { "ce_loss": 0.12266329675912857, "epoch": 2.6617745163442295, "step": 7980 }, { "distill_loss": 0.13333964347839355, "epoch": 2.6617745163442295, "step": 7980 }, { "epoch": 2.6617745163442295, "ref_ce_loss": 0.10077905654907227, "step": 7980 }, { "epoch": 2.6617745163442295, "loss": 0.364191472530365, "step": 7980 }, { "ce_loss": 0.07570328563451767, "epoch": 2.6617745163442295, "step": 7980 }, { "distill_loss": 0.08897201716899872, "epoch": 2.6617745163442295, "step": 7980 }, { "epoch": 2.6617745163442295, "ref_ce_loss": 0.08781538903713226, "step": 7980 }, { "epoch": 2.6617745163442295, "loss": 0.5703341364860535, "step": 7980 }, { "ce_loss": 0.13742899894714355, "epoch": 2.6617745163442295, "step": 7980 }, { "distill_loss": 0.16016842424869537, "epoch": 2.6617745163442295, "step": 7980 }, { "epoch": 2.6617745163442295, "ref_ce_loss": 0.11559545993804932, "step": 7980 }, { "epoch": 2.665110073382255, "loss": 0.5638, "step": 7990 }, { "epoch": 2.665110073382255, "grad_norm": 4.504335880279541, "step": 7990 }, { "epoch": 2.665110073382255, "learning_rate": 0.0002333355349529403, "step": 7990 }, { "epoch": 2.665110073382255, "loss": 0.7399834394454956, "step": 7990 }, { "ce_loss": 0.1619812250137329, "epoch": 2.665110073382255, "step": 7990 }, { "distill_loss": 0.1331026256084442, "epoch": 2.665110073382255, "step": 7990 }, { "epoch": 2.665110073382255, "ref_ce_loss": 0.11494561284780502, "step": 7990 }, { "epoch": 2.665110073382255, "loss": 0.4661237597465515, "step": 7990 }, { "ce_loss": 0.07970134913921356, "epoch": 2.665110073382255, "step": 7990 }, { "distill_loss": 0.11089581996202469, "epoch": 2.665110073382255, "step": 7990 }, { "epoch": 2.665110073382255, "ref_ce_loss": 0.09639148414134979, "step": 7990 }, { "epoch": 2.665110073382255, "loss": 0.8837971091270447, "step": 7990 }, { "ce_loss": 0.12881000339984894, "epoch": 2.665110073382255, "step": 7990 }, { "distill_loss": 0.13882938027381897, "epoch": 2.665110073382255, "step": 7990 }, { "epoch": 2.665110073382255, "ref_ce_loss": 0.10138195753097534, "step": 7990 }, { "epoch": 2.665110073382255, "loss": 0.6575859785079956, "step": 7990 }, { "ce_loss": 0.17015022039413452, "epoch": 2.665110073382255, "step": 7990 }, { "distill_loss": 0.1299816071987152, "epoch": 2.665110073382255, "step": 7990 }, { "epoch": 2.665110073382255, "ref_ce_loss": 0.11390574276447296, "step": 7990 }, { "epoch": 2.66844563042028, "loss": 0.5827, "step": 8000 }, { "epoch": 2.66844563042028, "grad_norm": 2.684920310974121, "step": 8000 }, { "epoch": 2.66844563042028, "learning_rate": 0.0002331670353494331, "step": 8000 }, { "epoch": 2.66844563042028, "loss": 1.0271755456924438, "step": 8000 }, { "ce_loss": 0.20616409182548523, "epoch": 2.66844563042028, "step": 8000 }, { "distill_loss": 0.13934825360774994, "epoch": 2.66844563042028, "step": 8000 }, { "epoch": 2.66844563042028, "ref_ce_loss": 0.1512710601091385, "step": 8000 }, { "epoch": 2.66844563042028, "loss": 0.4182243347167969, "step": 8000 }, { "ce_loss": 0.1766296625137329, "epoch": 2.66844563042028, "step": 8000 }, { "distill_loss": 0.13832080364227295, "epoch": 2.66844563042028, "step": 8000 }, { "epoch": 2.66844563042028, "ref_ce_loss": 0.10316318273544312, "step": 8000 }, { "epoch": 2.66844563042028, "loss": 0.4399473965167999, "step": 8000 }, { "ce_loss": 0.17161087691783905, "epoch": 2.66844563042028, "step": 8000 }, { "distill_loss": 0.1129341796040535, "epoch": 2.66844563042028, "step": 8000 }, { "epoch": 2.66844563042028, "ref_ce_loss": 0.10985776782035828, "step": 8000 }, { "epoch": 2.66844563042028, "loss": 0.4889751076698303, "step": 8000 }, { "ce_loss": 0.16917948424816132, "epoch": 2.66844563042028, "step": 8000 }, { "distill_loss": 0.16153065860271454, "epoch": 2.66844563042028, "step": 8000 }, { "epoch": 2.66844563042028, "ref_ce_loss": 0.12116784602403641, "step": 8000 }, { "epoch": 2.6717811874583055, "loss": 0.5545, "step": 8010 }, { "epoch": 2.6717811874583055, "grad_norm": 3.7803685665130615, "step": 8010 }, { "epoch": 2.6717811874583055, "learning_rate": 0.00023299838408206015, "step": 8010 }, { "epoch": 2.6717811874583055, "loss": 0.5689238905906677, "step": 8010 }, { "ce_loss": 0.17813092470169067, "epoch": 2.6717811874583055, "step": 8010 }, { "distill_loss": 0.22044917941093445, "epoch": 2.6717811874583055, "step": 8010 }, { "epoch": 2.6717811874583055, "ref_ce_loss": 0.08347884565591812, "step": 8010 }, { "epoch": 2.6717811874583055, "loss": 0.888495922088623, "step": 8010 }, { "ce_loss": 0.10351214557886124, "epoch": 2.6717811874583055, "step": 8010 }, { "distill_loss": 0.11769212782382965, "epoch": 2.6717811874583055, "step": 8010 }, { "epoch": 2.6717811874583055, "ref_ce_loss": 0.0803191289305687, "step": 8010 }, { "epoch": 2.6717811874583055, "loss": 0.4115409553050995, "step": 8010 }, { "ce_loss": 0.1512393355369568, "epoch": 2.6717811874583055, "step": 8010 }, { "distill_loss": 0.126437708735466, "epoch": 2.6717811874583055, "step": 8010 }, { "epoch": 2.6717811874583055, "ref_ce_loss": 0.10095108300447464, "step": 8010 }, { "epoch": 2.6717811874583055, "loss": 1.0289809703826904, "step": 8010 }, { "ce_loss": 0.2210640162229538, "epoch": 2.6717811874583055, "step": 8010 }, { "distill_loss": 0.14857910573482513, "epoch": 2.6717811874583055, "step": 8010 }, { "epoch": 2.6717811874583055, "ref_ce_loss": 0.12264930456876755, "step": 8010 }, { "epoch": 2.675116744496331, "loss": 0.6117, "step": 8020 }, { "epoch": 2.675116744496331, "grad_norm": 2.5475668907165527, "step": 8020 }, { "epoch": 2.675116744496331, "learning_rate": 0.00023282958145837477, "step": 8020 }, { "epoch": 2.675116744496331, "loss": 0.4965721666812897, "step": 8020 }, { "ce_loss": 0.1637820154428482, "epoch": 2.675116744496331, "step": 8020 }, { "distill_loss": 0.12568223476409912, "epoch": 2.675116744496331, "step": 8020 }, { "epoch": 2.675116744496331, "ref_ce_loss": 0.13360291719436646, "step": 8020 }, { "epoch": 2.675116744496331, "loss": 0.28336021304130554, "step": 8020 }, { "ce_loss": 0.07376381009817123, "epoch": 2.675116744496331, "step": 8020 }, { "distill_loss": 0.1208600327372551, "epoch": 2.675116744496331, "step": 8020 }, { "epoch": 2.675116744496331, "ref_ce_loss": 0.08844497054815292, "step": 8020 }, { "epoch": 2.675116744496331, "loss": 0.5349476933479309, "step": 8020 }, { "ce_loss": 0.16752538084983826, "epoch": 2.675116744496331, "step": 8020 }, { "distill_loss": 0.13089683651924133, "epoch": 2.675116744496331, "step": 8020 }, { "epoch": 2.675116744496331, "ref_ce_loss": 0.07995325326919556, "step": 8020 }, { "epoch": 2.675116744496331, "loss": 0.6904729604721069, "step": 8020 }, { "ce_loss": 0.13420693576335907, "epoch": 2.675116744496331, "step": 8020 }, { "distill_loss": 0.17605559527873993, "epoch": 2.675116744496331, "step": 8020 }, { "epoch": 2.675116744496331, "ref_ce_loss": 0.08207875490188599, "step": 8020 }, { "epoch": 2.678452301534356, "loss": 0.5091, "step": 8030 }, { "epoch": 2.678452301534356, "grad_norm": 2.528420925140381, "step": 8030 }, { "epoch": 2.678452301534356, "learning_rate": 0.00023266062778620647, "step": 8030 }, { "epoch": 2.678452301534356, "loss": 0.4364303946495056, "step": 8030 }, { "ce_loss": 0.15353445708751678, "epoch": 2.678452301534356, "step": 8030 }, { "distill_loss": 0.13547055423259735, "epoch": 2.678452301534356, "step": 8030 }, { "epoch": 2.678452301534356, "ref_ce_loss": 0.10869955271482468, "step": 8030 }, { "epoch": 2.678452301534356, "loss": 0.49167340993881226, "step": 8030 }, { "ce_loss": 0.14930841326713562, "epoch": 2.678452301534356, "step": 8030 }, { "distill_loss": 0.11774240434169769, "epoch": 2.678452301534356, "step": 8030 }, { "epoch": 2.678452301534356, "ref_ce_loss": 0.10742420703172684, "step": 8030 }, { "epoch": 2.678452301534356, "loss": 0.6432998776435852, "step": 8030 }, { "ce_loss": 0.2559523284435272, "epoch": 2.678452301534356, "step": 8030 }, { "distill_loss": 0.14143937826156616, "epoch": 2.678452301534356, "step": 8030 }, { "epoch": 2.678452301534356, "ref_ce_loss": 0.18032102286815643, "step": 8030 }, { "epoch": 2.678452301534356, "loss": 0.44476890563964844, "step": 8030 }, { "ce_loss": 0.08300906419754028, "epoch": 2.678452301534356, "step": 8030 }, { "distill_loss": 0.11552564054727554, "epoch": 2.678452301534356, "step": 8030 }, { "epoch": 2.678452301534356, "ref_ce_loss": 0.07628441601991653, "step": 8030 }, { "epoch": 2.6817878585723816, "loss": 0.5758, "step": 8040 }, { "epoch": 2.6817878585723816, "grad_norm": 2.6837093830108643, "step": 8040 }, { "epoch": 2.6817878585723816, "learning_rate": 0.00023249152337366, "step": 8040 }, { "epoch": 2.6817878585723816, "loss": 0.6911327838897705, "step": 8040 }, { "ce_loss": 0.08783482015132904, "epoch": 2.6817878585723816, "step": 8040 }, { "distill_loss": 0.1832849681377411, "epoch": 2.6817878585723816, "step": 8040 }, { "epoch": 2.6817878585723816, "ref_ce_loss": 0.1677989512681961, "step": 8040 }, { "epoch": 2.6817878585723816, "loss": 0.508490800857544, "step": 8040 }, { "ce_loss": 0.1933222860097885, "epoch": 2.6817878585723816, "step": 8040 }, { "distill_loss": 0.1114601269364357, "epoch": 2.6817878585723816, "step": 8040 }, { "epoch": 2.6817878585723816, "ref_ce_loss": 0.15833140909671783, "step": 8040 }, { "epoch": 2.6817878585723816, "loss": 0.3330030143260956, "step": 8040 }, { "ce_loss": 0.11011120676994324, "epoch": 2.6817878585723816, "step": 8040 }, { "distill_loss": 0.1153588593006134, "epoch": 2.6817878585723816, "step": 8040 }, { "epoch": 2.6817878585723816, "ref_ce_loss": 0.10705897212028503, "step": 8040 }, { "epoch": 2.6817878585723816, "loss": 0.6220952868461609, "step": 8040 }, { "ce_loss": 0.2026032656431198, "epoch": 2.6817878585723816, "step": 8040 }, { "distill_loss": 0.12767234444618225, "epoch": 2.6817878585723816, "step": 8040 }, { "epoch": 2.6817878585723816, "ref_ce_loss": 0.13173550367355347, "step": 8040 }, { "epoch": 2.685123415610407, "loss": 0.489, "step": 8050 }, { "epoch": 2.685123415610407, "grad_norm": 3.3889288902282715, "step": 8050 }, { "epoch": 2.685123415610407, "learning_rate": 0.0002323222685291152, "step": 8050 }, { "epoch": 2.685123415610407, "loss": 0.6236046552658081, "step": 8050 }, { "ce_loss": 0.1862117052078247, "epoch": 2.685123415610407, "step": 8050 }, { "distill_loss": 0.15372738242149353, "epoch": 2.685123415610407, "step": 8050 }, { "epoch": 2.685123415610407, "ref_ce_loss": 0.11308329552412033, "step": 8050 }, { "epoch": 2.685123415610407, "loss": 0.8643714189529419, "step": 8050 }, { "ce_loss": 0.29614153504371643, "epoch": 2.685123415610407, "step": 8050 }, { "distill_loss": 0.1857837736606598, "epoch": 2.685123415610407, "step": 8050 }, { "epoch": 2.685123415610407, "ref_ce_loss": 0.15275658667087555, "step": 8050 }, { "epoch": 2.685123415610407, "loss": 0.40307721495628357, "step": 8050 }, { "ce_loss": 0.12835247814655304, "epoch": 2.685123415610407, "step": 8050 }, { "distill_loss": 0.13174714148044586, "epoch": 2.685123415610407, "step": 8050 }, { "epoch": 2.685123415610407, "ref_ce_loss": 0.11546413600444794, "step": 8050 }, { "epoch": 2.685123415610407, "loss": 1.036539077758789, "step": 8050 }, { "ce_loss": 0.22426684200763702, "epoch": 2.685123415610407, "step": 8050 }, { "distill_loss": 0.15441244840621948, "epoch": 2.685123415610407, "step": 8050 }, { "epoch": 2.685123415610407, "ref_ce_loss": 0.15819288790225983, "step": 8050 }, { "epoch": 2.6884589726484323, "loss": 0.5733, "step": 8060 }, { "epoch": 2.6884589726484323, "grad_norm": 3.0571844577789307, "step": 8060 }, { "epoch": 2.6884589726484323, "learning_rate": 0.000232152863561226, "step": 8060 }, { "epoch": 2.6884589726484323, "loss": 0.4697049856185913, "step": 8060 }, { "ce_loss": 0.1788596659898758, "epoch": 2.6884589726484323, "step": 8060 }, { "distill_loss": 0.12807488441467285, "epoch": 2.6884589726484323, "step": 8060 }, { "epoch": 2.6884589726484323, "ref_ce_loss": 0.08363891392946243, "step": 8060 }, { "epoch": 2.6884589726484323, "loss": 0.38888436555862427, "step": 8060 }, { "ce_loss": 0.04981951415538788, "epoch": 2.6884589726484323, "step": 8060 }, { "distill_loss": 0.10500740259885788, "epoch": 2.6884589726484323, "step": 8060 }, { "epoch": 2.6884589726484323, "ref_ce_loss": 0.08457271754741669, "step": 8060 }, { "epoch": 2.6884589726484323, "loss": 0.5909742712974548, "step": 8060 }, { "ce_loss": 0.20369437336921692, "epoch": 2.6884589726484323, "step": 8060 }, { "distill_loss": 0.16423441469669342, "epoch": 2.6884589726484323, "step": 8060 }, { "epoch": 2.6884589726484323, "ref_ce_loss": 0.10368082672357559, "step": 8060 }, { "epoch": 2.6884589726484323, "loss": 0.6657927632331848, "step": 8060 }, { "ce_loss": 0.27945613861083984, "epoch": 2.6884589726484323, "step": 8060 }, { "distill_loss": 0.1881733387708664, "epoch": 2.6884589726484323, "step": 8060 }, { "epoch": 2.6884589726484323, "ref_ce_loss": 0.1429987996816635, "step": 8060 }, { "epoch": 2.6917945296864576, "loss": 0.5357, "step": 8070 }, { "epoch": 2.6917945296864576, "grad_norm": 5.027276039123535, "step": 8070 }, { "epoch": 2.6917945296864576, "learning_rate": 0.0002319833087789204, "step": 8070 }, { "epoch": 2.6917945296864576, "loss": 0.39156532287597656, "step": 8070 }, { "ce_loss": 0.12410548329353333, "epoch": 2.6917945296864576, "step": 8070 }, { "distill_loss": 0.10390803962945938, "epoch": 2.6917945296864576, "step": 8070 }, { "epoch": 2.6917945296864576, "ref_ce_loss": 0.10742595791816711, "step": 8070 }, { "epoch": 2.6917945296864576, "loss": 0.3632297217845917, "step": 8070 }, { "ce_loss": 0.12438895553350449, "epoch": 2.6917945296864576, "step": 8070 }, { "distill_loss": 0.11003535240888596, "epoch": 2.6917945296864576, "step": 8070 }, { "epoch": 2.6917945296864576, "ref_ce_loss": 0.08501872420310974, "step": 8070 }, { "epoch": 2.6917945296864576, "loss": 0.5452225208282471, "step": 8070 }, { "ce_loss": 0.1348339021205902, "epoch": 2.6917945296864576, "step": 8070 }, { "distill_loss": 0.1079145073890686, "epoch": 2.6917945296864576, "step": 8070 }, { "epoch": 2.6917945296864576, "ref_ce_loss": 0.12742206454277039, "step": 8070 }, { "epoch": 2.6917945296864576, "loss": 0.5933875441551208, "step": 8070 }, { "ce_loss": 0.13612274825572968, "epoch": 2.6917945296864576, "step": 8070 }, { "distill_loss": 0.1913990080356598, "epoch": 2.6917945296864576, "step": 8070 }, { "epoch": 2.6917945296864576, "ref_ce_loss": 0.08002374321222305, "step": 8070 }, { "epoch": 2.695130086724483, "loss": 0.559, "step": 8080 }, { "epoch": 2.695130086724483, "grad_norm": 2.6058237552642822, "step": 8080 }, { "epoch": 2.695130086724483, "learning_rate": 0.00023181360449139936, "step": 8080 }, { "epoch": 2.695130086724483, "loss": 0.5928658246994019, "step": 8080 }, { "ce_loss": 0.16962631046772003, "epoch": 2.695130086724483, "step": 8080 }, { "distill_loss": 0.1335127204656601, "epoch": 2.695130086724483, "step": 8080 }, { "epoch": 2.695130086724483, "ref_ce_loss": 0.10307545959949493, "step": 8080 }, { "epoch": 2.695130086724483, "loss": 0.31053030490875244, "step": 8080 }, { "ce_loss": 0.1039867177605629, "epoch": 2.695130086724483, "step": 8080 }, { "distill_loss": 0.09887486696243286, "epoch": 2.695130086724483, "step": 8080 }, { "epoch": 2.695130086724483, "ref_ce_loss": 0.07038954645395279, "step": 8080 }, { "epoch": 2.695130086724483, "loss": 0.4417012631893158, "step": 8080 }, { "ce_loss": 0.10867048054933548, "epoch": 2.695130086724483, "step": 8080 }, { "distill_loss": 0.11061249673366547, "epoch": 2.695130086724483, "step": 8080 }, { "epoch": 2.695130086724483, "ref_ce_loss": 0.1626960039138794, "step": 8080 }, { "epoch": 2.695130086724483, "loss": 0.5297577977180481, "step": 8080 }, { "ce_loss": 0.13766439259052277, "epoch": 2.695130086724483, "step": 8080 }, { "distill_loss": 0.1209607645869255, "epoch": 2.695130086724483, "step": 8080 }, { "epoch": 2.695130086724483, "ref_ce_loss": 0.09336847811937332, "step": 8080 }, { "epoch": 2.6984656437625083, "loss": 0.5166, "step": 8090 }, { "epoch": 2.6984656437625083, "grad_norm": 4.001850605010986, "step": 8090 }, { "epoch": 2.6984656437625083, "learning_rate": 0.00023164375100813656, "step": 8090 }, { "epoch": 2.6984656437625083, "loss": 0.5130573511123657, "step": 8090 }, { "ce_loss": 0.09124461561441422, "epoch": 2.6984656437625083, "step": 8090 }, { "distill_loss": 0.1190020740032196, "epoch": 2.6984656437625083, "step": 8090 }, { "epoch": 2.6984656437625083, "ref_ce_loss": 0.0816359743475914, "step": 8090 }, { "epoch": 2.6984656437625083, "loss": 0.5037546157836914, "step": 8090 }, { "ce_loss": 0.16275212168693542, "epoch": 2.6984656437625083, "step": 8090 }, { "distill_loss": 0.11124013364315033, "epoch": 2.6984656437625083, "step": 8090 }, { "epoch": 2.6984656437625083, "ref_ce_loss": 0.11581555753946304, "step": 8090 }, { "epoch": 2.6984656437625083, "loss": 0.37682679295539856, "step": 8090 }, { "ce_loss": 0.09656783938407898, "epoch": 2.6984656437625083, "step": 8090 }, { "distill_loss": 0.12740042805671692, "epoch": 2.6984656437625083, "step": 8090 }, { "epoch": 2.6984656437625083, "ref_ce_loss": 0.08813125640153885, "step": 8090 }, { "epoch": 2.6984656437625083, "loss": 0.41288870573043823, "step": 8090 }, { "ce_loss": 0.09926676005125046, "epoch": 2.6984656437625083, "step": 8090 }, { "distill_loss": 0.13379068672657013, "epoch": 2.6984656437625083, "step": 8090 }, { "epoch": 2.6984656437625083, "ref_ce_loss": 0.08153785765171051, "step": 8090 }, { "epoch": 2.7018012008005337, "loss": 0.5768, "step": 8100 }, { "epoch": 2.7018012008005337, "grad_norm": 3.228931427001953, "step": 8100 }, { "epoch": 2.7018012008005337, "learning_rate": 0.00023147374863887772, "step": 8100 }, { "epoch": 2.7018012008005337, "loss": 0.4492276906967163, "step": 8100 }, { "ce_loss": 0.056972142308950424, "epoch": 2.7018012008005337, "step": 8100 }, { "distill_loss": 0.11906185001134872, "epoch": 2.7018012008005337, "step": 8100 }, { "epoch": 2.7018012008005337, "ref_ce_loss": 0.09090115875005722, "step": 8100 }, { "epoch": 2.7018012008005337, "loss": 0.3158268332481384, "step": 8100 }, { "ce_loss": 0.07716682553291321, "epoch": 2.7018012008005337, "step": 8100 }, { "distill_loss": 0.128421813249588, "epoch": 2.7018012008005337, "step": 8100 }, { "epoch": 2.7018012008005337, "ref_ce_loss": 0.11011645942926407, "step": 8100 }, { "epoch": 2.7018012008005337, "loss": 0.7365747690200806, "step": 8100 }, { "ce_loss": 0.17575332522392273, "epoch": 2.7018012008005337, "step": 8100 }, { "distill_loss": 0.1499844342470169, "epoch": 2.7018012008005337, "step": 8100 }, { "epoch": 2.7018012008005337, "ref_ce_loss": 0.12130838632583618, "step": 8100 }, { "epoch": 2.7018012008005337, "loss": 0.629986047744751, "step": 8100 }, { "ce_loss": 0.17869074642658234, "epoch": 2.7018012008005337, "step": 8100 }, { "distill_loss": 0.18918591737747192, "epoch": 2.7018012008005337, "step": 8100 }, { "epoch": 2.7018012008005337, "ref_ce_loss": 0.12489745765924454, "step": 8100 }, { "epoch": 2.705136757838559, "loss": 0.5244, "step": 8110 }, { "epoch": 2.705136757838559, "grad_norm": 1.915163278579712, "step": 8110 }, { "epoch": 2.705136757838559, "learning_rate": 0.00023130359769364016, "step": 8110 }, { "epoch": 2.705136757838559, "loss": 0.3695540726184845, "step": 8110 }, { "ce_loss": 0.1474255472421646, "epoch": 2.705136757838559, "step": 8110 }, { "distill_loss": 0.12527930736541748, "epoch": 2.705136757838559, "step": 8110 }, { "epoch": 2.705136757838559, "ref_ce_loss": 0.07345467805862427, "step": 8110 }, { "epoch": 2.705136757838559, "loss": 0.45366865396499634, "step": 8110 }, { "ce_loss": 0.1327705830335617, "epoch": 2.705136757838559, "step": 8110 }, { "distill_loss": 0.11853218078613281, "epoch": 2.705136757838559, "step": 8110 }, { "epoch": 2.705136757838559, "ref_ce_loss": 0.142308309674263, "step": 8110 }, { "epoch": 2.705136757838559, "loss": 0.7480238080024719, "step": 8110 }, { "ce_loss": 0.17574192583560944, "epoch": 2.705136757838559, "step": 8110 }, { "distill_loss": 0.17148953676223755, "epoch": 2.705136757838559, "step": 8110 }, { "epoch": 2.705136757838559, "ref_ce_loss": 0.1413096785545349, "step": 8110 }, { "epoch": 2.705136757838559, "loss": 0.5942070484161377, "step": 8110 }, { "ce_loss": 0.18245285749435425, "epoch": 2.705136757838559, "step": 8110 }, { "distill_loss": 0.12027274817228317, "epoch": 2.705136757838559, "step": 8110 }, { "epoch": 2.705136757838559, "ref_ce_loss": 0.11873797327280045, "step": 8110 }, { "epoch": 2.7084723148765844, "loss": 0.5295, "step": 8120 }, { "epoch": 2.7084723148765844, "grad_norm": 2.6392195224761963, "step": 8120 }, { "epoch": 2.7084723148765844, "learning_rate": 0.00023113329848271203, "step": 8120 }, { "epoch": 2.7084723148765844, "loss": 0.46645209193229675, "step": 8120 }, { "ce_loss": 0.18638621270656586, "epoch": 2.7084723148765844, "step": 8120 }, { "distill_loss": 0.12224185466766357, "epoch": 2.7084723148765844, "step": 8120 }, { "epoch": 2.7084723148765844, "ref_ce_loss": 0.11621517688035965, "step": 8120 }, { "epoch": 2.7084723148765844, "loss": 0.47254177927970886, "step": 8120 }, { "ce_loss": 0.20265349745750427, "epoch": 2.7084723148765844, "step": 8120 }, { "distill_loss": 0.12762364745140076, "epoch": 2.7084723148765844, "step": 8120 }, { "epoch": 2.7084723148765844, "ref_ce_loss": 0.10792994499206543, "step": 8120 }, { "epoch": 2.7084723148765844, "loss": 0.7535987496376038, "step": 8120 }, { "ce_loss": 0.2545333504676819, "epoch": 2.7084723148765844, "step": 8120 }, { "distill_loss": 0.17720168828964233, "epoch": 2.7084723148765844, "step": 8120 }, { "epoch": 2.7084723148765844, "ref_ce_loss": 0.12154602259397507, "step": 8120 }, { "epoch": 2.7084723148765844, "loss": 0.5472800731658936, "step": 8120 }, { "ce_loss": 0.24012410640716553, "epoch": 2.7084723148765844, "step": 8120 }, { "distill_loss": 0.14520911872386932, "epoch": 2.7084723148765844, "step": 8120 }, { "epoch": 2.7084723148765844, "ref_ce_loss": 0.09373628348112106, "step": 8120 }, { "epoch": 2.7118078719146097, "loss": 0.5461, "step": 8130 }, { "epoch": 2.7118078719146097, "grad_norm": 2.5390615463256836, "step": 8130 }, { "epoch": 2.7118078719146097, "learning_rate": 0.00023096285131665197, "step": 8130 }, { "epoch": 2.7118078719146097, "loss": 0.4835377335548401, "step": 8130 }, { "ce_loss": 0.12601253390312195, "epoch": 2.7118078719146097, "step": 8130 }, { "distill_loss": 0.10771089792251587, "epoch": 2.7118078719146097, "step": 8130 }, { "epoch": 2.7118078719146097, "ref_ce_loss": 0.09932011365890503, "step": 8130 }, { "epoch": 2.7118078719146097, "loss": 0.2281503975391388, "step": 8130 }, { "ce_loss": 0.04772244021296501, "epoch": 2.7118078719146097, "step": 8130 }, { "distill_loss": 0.08223338425159454, "epoch": 2.7118078719146097, "step": 8130 }, { "epoch": 2.7118078719146097, "ref_ce_loss": 0.06510313600301743, "step": 8130 }, { "epoch": 2.7118078719146097, "loss": 0.4231301546096802, "step": 8130 }, { "ce_loss": 0.1172071024775505, "epoch": 2.7118078719146097, "step": 8130 }, { "distill_loss": 0.08207279443740845, "epoch": 2.7118078719146097, "step": 8130 }, { "epoch": 2.7118078719146097, "ref_ce_loss": 0.07831268012523651, "step": 8130 }, { "epoch": 2.7118078719146097, "loss": 0.4807983934879303, "step": 8130 }, { "ce_loss": 0.17932242155075073, "epoch": 2.7118078719146097, "step": 8130 }, { "distill_loss": 0.0898590013384819, "epoch": 2.7118078719146097, "step": 8130 }, { "epoch": 2.7118078719146097, "ref_ce_loss": 0.12309905141592026, "step": 8130 }, { "epoch": 2.715143428952635, "loss": 0.4989, "step": 8140 }, { "epoch": 2.715143428952635, "grad_norm": 2.704934597015381, "step": 8140 }, { "epoch": 2.715143428952635, "learning_rate": 0.00023079225650628836, "step": 8140 }, { "epoch": 2.715143428952635, "loss": 0.6727844476699829, "step": 8140 }, { "ce_loss": 0.223181813955307, "epoch": 2.715143428952635, "step": 8140 }, { "distill_loss": 0.15185865759849548, "epoch": 2.715143428952635, "step": 8140 }, { "epoch": 2.715143428952635, "ref_ce_loss": 0.11520495265722275, "step": 8140 }, { "epoch": 2.715143428952635, "loss": 0.46417367458343506, "step": 8140 }, { "ce_loss": 0.14673767983913422, "epoch": 2.715143428952635, "step": 8140 }, { "distill_loss": 0.12357361614704132, "epoch": 2.715143428952635, "step": 8140 }, { "epoch": 2.715143428952635, "ref_ce_loss": 0.10955695062875748, "step": 8140 }, { "epoch": 2.715143428952635, "loss": 0.6096020936965942, "step": 8140 }, { "ce_loss": 0.19598866999149323, "epoch": 2.715143428952635, "step": 8140 }, { "distill_loss": 0.1323723942041397, "epoch": 2.715143428952635, "step": 8140 }, { "epoch": 2.715143428952635, "ref_ce_loss": 0.15653467178344727, "step": 8140 }, { "epoch": 2.715143428952635, "loss": 0.37559083104133606, "step": 8140 }, { "ce_loss": 0.12198042124509811, "epoch": 2.715143428952635, "step": 8140 }, { "distill_loss": 0.11107443273067474, "epoch": 2.715143428952635, "step": 8140 }, { "epoch": 2.715143428952635, "ref_ce_loss": 0.14184345304965973, "step": 8140 }, { "epoch": 2.7184789859906604, "loss": 0.4915, "step": 8150 }, { "epoch": 2.7184789859906604, "grad_norm": 2.9659900665283203, "step": 8150 }, { "epoch": 2.7184789859906604, "learning_rate": 0.00023062151436271876, "step": 8150 }, { "epoch": 2.7184789859906604, "loss": 0.5116956830024719, "step": 8150 }, { "ce_loss": 0.21129418909549713, "epoch": 2.7184789859906604, "step": 8150 }, { "distill_loss": 0.11995405703783035, "epoch": 2.7184789859906604, "step": 8150 }, { "epoch": 2.7184789859906604, "ref_ce_loss": 0.12191393226385117, "step": 8150 }, { "epoch": 2.7184789859906604, "loss": 0.4112318754196167, "step": 8150 }, { "ce_loss": 0.16436515748500824, "epoch": 2.7184789859906604, "step": 8150 }, { "distill_loss": 0.11617519706487656, "epoch": 2.7184789859906604, "step": 8150 }, { "epoch": 2.7184789859906604, "ref_ce_loss": 0.0827714130282402, "step": 8150 }, { "epoch": 2.7184789859906604, "loss": 0.32239362597465515, "step": 8150 }, { "ce_loss": 0.08988597244024277, "epoch": 2.7184789859906604, "step": 8150 }, { "distill_loss": 0.11604972928762436, "epoch": 2.7184789859906604, "step": 8150 }, { "epoch": 2.7184789859906604, "ref_ce_loss": 0.08602083474397659, "step": 8150 }, { "epoch": 2.7184789859906604, "loss": 0.2883583903312683, "step": 8150 }, { "ce_loss": 0.03574028238654137, "epoch": 2.7184789859906604, "step": 8150 }, { "distill_loss": 0.0718480795621872, "epoch": 2.7184789859906604, "step": 8150 }, { "epoch": 2.7184789859906604, "ref_ce_loss": 0.06966337561607361, "step": 8150 }, { "epoch": 2.7218145430286858, "loss": 0.5195, "step": 8160 }, { "epoch": 2.7218145430286858, "grad_norm": 2.757476329803467, "step": 8160 }, { "epoch": 2.7218145430286858, "learning_rate": 0.0002304506251973096, "step": 8160 }, { "epoch": 2.7218145430286858, "loss": 0.42667993903160095, "step": 8160 }, { "ce_loss": 0.15945284068584442, "epoch": 2.7218145430286858, "step": 8160 }, { "distill_loss": 0.15174272656440735, "epoch": 2.7218145430286858, "step": 8160 }, { "epoch": 2.7218145430286858, "ref_ce_loss": 0.1153591200709343, "step": 8160 }, { "epoch": 2.7218145430286858, "loss": 0.47556787729263306, "step": 8160 }, { "ce_loss": 0.11233219504356384, "epoch": 2.7218145430286858, "step": 8160 }, { "distill_loss": 0.12334011495113373, "epoch": 2.7218145430286858, "step": 8160 }, { "epoch": 2.7218145430286858, "ref_ce_loss": 0.12544937431812286, "step": 8160 }, { "epoch": 2.7218145430286858, "loss": 0.5468754172325134, "step": 8160 }, { "ce_loss": 0.11044180393218994, "epoch": 2.7218145430286858, "step": 8160 }, { "distill_loss": 0.10472964495420456, "epoch": 2.7218145430286858, "step": 8160 }, { "epoch": 2.7218145430286858, "ref_ce_loss": 0.15179766714572906, "step": 8160 }, { "epoch": 2.7218145430286858, "loss": 0.4966447949409485, "step": 8160 }, { "ce_loss": 0.035888370126485825, "epoch": 2.7218145430286858, "step": 8160 }, { "distill_loss": 0.08420006185770035, "epoch": 2.7218145430286858, "step": 8160 }, { "epoch": 2.7218145430286858, "ref_ce_loss": 0.06462906301021576, "step": 8160 }, { "epoch": 2.725150100066711, "loss": 0.5431, "step": 8170 }, { "epoch": 2.725150100066711, "grad_norm": 2.061748504638672, "step": 8170 }, { "epoch": 2.725150100066711, "learning_rate": 0.0002302795893216953, "step": 8170 }, { "epoch": 2.725150100066711, "loss": 0.3870605230331421, "step": 8170 }, { "ce_loss": 0.14253829419612885, "epoch": 2.725150100066711, "step": 8170 }, { "distill_loss": 0.11652734875679016, "epoch": 2.725150100066711, "step": 8170 }, { "epoch": 2.725150100066711, "ref_ce_loss": 0.08488757163286209, "step": 8170 }, { "epoch": 2.725150100066711, "loss": 0.49011707305908203, "step": 8170 }, { "ce_loss": 0.15634660422801971, "epoch": 2.725150100066711, "step": 8170 }, { "distill_loss": 0.16235539317131042, "epoch": 2.725150100066711, "step": 8170 }, { "epoch": 2.725150100066711, "ref_ce_loss": 0.13577237725257874, "step": 8170 }, { "epoch": 2.725150100066711, "loss": 0.3121011555194855, "step": 8170 }, { "ce_loss": 0.11659318208694458, "epoch": 2.725150100066711, "step": 8170 }, { "distill_loss": 0.10463510453701019, "epoch": 2.725150100066711, "step": 8170 }, { "epoch": 2.725150100066711, "ref_ce_loss": 0.09063059836626053, "step": 8170 }, { "epoch": 2.725150100066711, "loss": 0.46912479400634766, "step": 8170 }, { "ce_loss": 0.13982565701007843, "epoch": 2.725150100066711, "step": 8170 }, { "distill_loss": 0.14655548334121704, "epoch": 2.725150100066711, "step": 8170 }, { "epoch": 2.725150100066711, "ref_ce_loss": 0.0921240821480751, "step": 8170 }, { "epoch": 2.7284856571047365, "loss": 0.5177, "step": 8180 }, { "epoch": 2.7284856571047365, "grad_norm": 2.698974132537842, "step": 8180 }, { "epoch": 2.7284856571047365, "learning_rate": 0.00023010840704777773, "step": 8180 }, { "epoch": 2.7284856571047365, "loss": 0.3805660307407379, "step": 8180 }, { "ce_loss": 0.14469732344150543, "epoch": 2.7284856571047365, "step": 8180 }, { "distill_loss": 0.1307322382926941, "epoch": 2.7284856571047365, "step": 8180 }, { "epoch": 2.7284856571047365, "ref_ce_loss": 0.10495664924383163, "step": 8180 }, { "epoch": 2.7284856571047365, "loss": 0.4237486720085144, "step": 8180 }, { "ce_loss": 0.11302044242620468, "epoch": 2.7284856571047365, "step": 8180 }, { "distill_loss": 0.18344856798648834, "epoch": 2.7284856571047365, "step": 8180 }, { "epoch": 2.7284856571047365, "ref_ce_loss": 0.10202555358409882, "step": 8180 }, { "epoch": 2.7284856571047365, "loss": 0.9929975271224976, "step": 8180 }, { "ce_loss": 0.14125335216522217, "epoch": 2.7284856571047365, "step": 8180 }, { "distill_loss": 0.16645291447639465, "epoch": 2.7284856571047365, "step": 8180 }, { "epoch": 2.7284856571047365, "ref_ce_loss": 0.12964120507240295, "step": 8180 }, { "epoch": 2.7284856571047365, "loss": 0.36327147483825684, "step": 8180 }, { "ce_loss": 0.07714872062206268, "epoch": 2.7284856571047365, "step": 8180 }, { "distill_loss": 0.09731505066156387, "epoch": 2.7284856571047365, "step": 8180 }, { "epoch": 2.7284856571047365, "ref_ce_loss": 0.07459522783756256, "step": 8180 }, { "epoch": 2.731821214142762, "loss": 0.5482, "step": 8190 }, { "epoch": 2.731821214142762, "grad_norm": 3.1269516944885254, "step": 8190 }, { "epoch": 2.731821214142762, "learning_rate": 0.0002299370786877259, "step": 8190 }, { "epoch": 2.731821214142762, "loss": 0.31505829095840454, "step": 8190 }, { "ce_loss": 0.0993107482790947, "epoch": 2.731821214142762, "step": 8190 }, { "distill_loss": 0.08881866931915283, "epoch": 2.731821214142762, "step": 8190 }, { "epoch": 2.731821214142762, "ref_ce_loss": 0.08592408150434494, "step": 8190 }, { "epoch": 2.731821214142762, "loss": 0.518826961517334, "step": 8190 }, { "ce_loss": 0.20128180086612701, "epoch": 2.731821214142762, "step": 8190 }, { "distill_loss": 0.11471576988697052, "epoch": 2.731821214142762, "step": 8190 }, { "epoch": 2.731821214142762, "ref_ce_loss": 0.11057179421186447, "step": 8190 }, { "epoch": 2.731821214142762, "loss": 0.7147469520568848, "step": 8190 }, { "ce_loss": 0.2774195969104767, "epoch": 2.731821214142762, "step": 8190 }, { "distill_loss": 0.14111217856407166, "epoch": 2.731821214142762, "step": 8190 }, { "epoch": 2.731821214142762, "ref_ce_loss": 0.15037333965301514, "step": 8190 }, { "epoch": 2.731821214142762, "loss": 0.5206362009048462, "step": 8190 }, { "ce_loss": 0.12582939863204956, "epoch": 2.731821214142762, "step": 8190 }, { "distill_loss": 0.11385433375835419, "epoch": 2.731821214142762, "step": 8190 }, { "epoch": 2.731821214142762, "ref_ce_loss": 0.090955950319767, "step": 8190 }, { "epoch": 2.735156771180787, "loss": 0.5921, "step": 8200 }, { "epoch": 2.735156771180787, "grad_norm": 3.280219793319702, "step": 8200 }, { "epoch": 2.735156771180787, "learning_rate": 0.00022976560455397518, "step": 8200 }, { "epoch": 2.735156771180787, "loss": 0.5218157172203064, "step": 8200 }, { "ce_loss": 0.10749214887619019, "epoch": 2.735156771180787, "step": 8200 }, { "distill_loss": 0.10494686663150787, "epoch": 2.735156771180787, "step": 8200 }, { "epoch": 2.735156771180787, "ref_ce_loss": 0.12046821415424347, "step": 8200 }, { "epoch": 2.735156771180787, "loss": 0.35924088954925537, "step": 8200 }, { "ce_loss": 0.13240492343902588, "epoch": 2.735156771180787, "step": 8200 }, { "distill_loss": 0.10641516745090485, "epoch": 2.735156771180787, "step": 8200 }, { "epoch": 2.735156771180787, "ref_ce_loss": 0.12035661935806274, "step": 8200 }, { "epoch": 2.735156771180787, "loss": 0.5238202214241028, "step": 8200 }, { "ce_loss": 0.24740082025527954, "epoch": 2.735156771180787, "step": 8200 }, { "distill_loss": 0.10919897258281708, "epoch": 2.735156771180787, "step": 8200 }, { "epoch": 2.735156771180787, "ref_ce_loss": 0.13922780752182007, "step": 8200 }, { "epoch": 2.735156771180787, "loss": 0.6976079940795898, "step": 8200 }, { "ce_loss": 0.11878570169210434, "epoch": 2.735156771180787, "step": 8200 }, { "distill_loss": 0.10893769562244415, "epoch": 2.735156771180787, "step": 8200 }, { "epoch": 2.735156771180787, "ref_ce_loss": 0.059214212000370026, "step": 8200 }, { "epoch": 2.7384923282188125, "loss": 0.6152, "step": 8210 }, { "epoch": 2.7384923282188125, "grad_norm": 3.968153953552246, "step": 8210 }, { "epoch": 2.7384923282188125, "learning_rate": 0.00022959398495922667, "step": 8210 }, { "epoch": 2.7384923282188125, "loss": 0.5968283414840698, "step": 8210 }, { "ce_loss": 0.24610240757465363, "epoch": 2.7384923282188125, "step": 8210 }, { "distill_loss": 0.18845278024673462, "epoch": 2.7384923282188125, "step": 8210 }, { "epoch": 2.7384923282188125, "ref_ce_loss": 0.16213765740394592, "step": 8210 }, { "epoch": 2.7384923282188125, "loss": 0.609846830368042, "step": 8210 }, { "ce_loss": 0.21443380415439606, "epoch": 2.7384923282188125, "step": 8210 }, { "distill_loss": 0.1477302461862564, "epoch": 2.7384923282188125, "step": 8210 }, { "epoch": 2.7384923282188125, "ref_ce_loss": 0.12607622146606445, "step": 8210 }, { "epoch": 2.7384923282188125, "loss": 0.6559703946113586, "step": 8210 }, { "ce_loss": 0.15517359972000122, "epoch": 2.7384923282188125, "step": 8210 }, { "distill_loss": 0.1313042789697647, "epoch": 2.7384923282188125, "step": 8210 }, { "epoch": 2.7384923282188125, "ref_ce_loss": 0.14865265786647797, "step": 8210 }, { "epoch": 2.7384923282188125, "loss": 0.42476436495780945, "step": 8210 }, { "ce_loss": 0.11427508294582367, "epoch": 2.7384923282188125, "step": 8210 }, { "distill_loss": 0.11243674904108047, "epoch": 2.7384923282188125, "step": 8210 }, { "epoch": 2.7384923282188125, "ref_ce_loss": 0.07395554333925247, "step": 8210 }, { "epoch": 2.741827885256838, "loss": 0.5614, "step": 8220 }, { "epoch": 2.741827885256838, "grad_norm": 2.2540650367736816, "step": 8220 }, { "epoch": 2.741827885256838, "learning_rate": 0.00022942222021644693, "step": 8220 }, { "epoch": 2.741827885256838, "loss": 0.3177460730075836, "step": 8220 }, { "ce_loss": 0.09816362708806992, "epoch": 2.741827885256838, "step": 8220 }, { "distill_loss": 0.1109558641910553, "epoch": 2.741827885256838, "step": 8220 }, { "epoch": 2.741827885256838, "ref_ce_loss": 0.10840912908315659, "step": 8220 }, { "epoch": 2.741827885256838, "loss": 0.3551897406578064, "step": 8220 }, { "ce_loss": 0.14152449369430542, "epoch": 2.741827885256838, "step": 8220 }, { "distill_loss": 0.11100196838378906, "epoch": 2.741827885256838, "step": 8220 }, { "epoch": 2.741827885256838, "ref_ce_loss": 0.07071300595998764, "step": 8220 }, { "epoch": 2.741827885256838, "loss": 0.7158865332603455, "step": 8220 }, { "ce_loss": 0.23116154968738556, "epoch": 2.741827885256838, "step": 8220 }, { "distill_loss": 0.132449209690094, "epoch": 2.741827885256838, "step": 8220 }, { "epoch": 2.741827885256838, "ref_ce_loss": 0.1193322092294693, "step": 8220 }, { "epoch": 2.741827885256838, "loss": 0.9159722328186035, "step": 8220 }, { "ce_loss": 0.08125410228967667, "epoch": 2.741827885256838, "step": 8220 }, { "distill_loss": 0.10034830868244171, "epoch": 2.741827885256838, "step": 8220 }, { "epoch": 2.741827885256838, "ref_ce_loss": 0.10371002554893494, "step": 8220 }, { "epoch": 2.745163442294863, "loss": 0.5366, "step": 8230 }, { "epoch": 2.745163442294863, "grad_norm": 2.331559181213379, "step": 8230 }, { "epoch": 2.745163442294863, "learning_rate": 0.00022925031063886694, "step": 8230 }, { "epoch": 2.745163442294863, "loss": 0.437788724899292, "step": 8230 }, { "ce_loss": 0.16893485188484192, "epoch": 2.745163442294863, "step": 8230 }, { "distill_loss": 0.12921595573425293, "epoch": 2.745163442294863, "step": 8230 }, { "epoch": 2.745163442294863, "ref_ce_loss": 0.10241150856018066, "step": 8230 }, { "epoch": 2.745163442294863, "loss": 0.30514878034591675, "step": 8230 }, { "ce_loss": 0.12337757647037506, "epoch": 2.745163442294863, "step": 8230 }, { "distill_loss": 0.10096579790115356, "epoch": 2.745163442294863, "step": 8230 }, { "epoch": 2.745163442294863, "ref_ce_loss": 0.07967543601989746, "step": 8230 }, { "epoch": 2.745163442294863, "loss": 0.35450685024261475, "step": 8230 }, { "ce_loss": 0.14865685999393463, "epoch": 2.745163442294863, "step": 8230 }, { "distill_loss": 0.11411884427070618, "epoch": 2.745163442294863, "step": 8230 }, { "epoch": 2.745163442294863, "ref_ce_loss": 0.09148301184177399, "step": 8230 }, { "epoch": 2.745163442294863, "loss": 0.4512288570404053, "step": 8230 }, { "ce_loss": 0.1382657140493393, "epoch": 2.745163442294863, "step": 8230 }, { "distill_loss": 0.10865326970815659, "epoch": 2.745163442294863, "step": 8230 }, { "epoch": 2.745163442294863, "ref_ce_loss": 0.13190177083015442, "step": 8230 }, { "epoch": 2.7484989993328885, "loss": 0.5102, "step": 8240 }, { "epoch": 2.7484989993328885, "grad_norm": 2.7061686515808105, "step": 8240 }, { "epoch": 2.7484989993328885, "learning_rate": 0.00022907825653998212, "step": 8240 }, { "epoch": 2.7484989993328885, "loss": 0.5048376321792603, "step": 8240 }, { "ce_loss": 0.12719407677650452, "epoch": 2.7484989993328885, "step": 8240 }, { "distill_loss": 0.1403333842754364, "epoch": 2.7484989993328885, "step": 8240 }, { "epoch": 2.7484989993328885, "ref_ce_loss": 0.16057303547859192, "step": 8240 }, { "epoch": 2.7484989993328885, "loss": 0.5376629829406738, "step": 8240 }, { "ce_loss": 0.10417266190052032, "epoch": 2.7484989993328885, "step": 8240 }, { "distill_loss": 0.11442543566226959, "epoch": 2.7484989993328885, "step": 8240 }, { "epoch": 2.7484989993328885, "ref_ce_loss": 0.07191190123558044, "step": 8240 }, { "epoch": 2.7484989993328885, "loss": 0.37093302607536316, "step": 8240 }, { "ce_loss": 0.10794583708047867, "epoch": 2.7484989993328885, "step": 8240 }, { "distill_loss": 0.10681041330099106, "epoch": 2.7484989993328885, "step": 8240 }, { "epoch": 2.7484989993328885, "ref_ce_loss": 0.1053520068526268, "step": 8240 }, { "epoch": 2.7484989993328885, "loss": 0.42045828700065613, "step": 8240 }, { "ce_loss": 0.14314033091068268, "epoch": 2.7484989993328885, "step": 8240 }, { "distill_loss": 0.15592116117477417, "epoch": 2.7484989993328885, "step": 8240 }, { "epoch": 2.7484989993328885, "ref_ce_loss": 0.09708252549171448, "step": 8240 }, { "epoch": 2.751834556370914, "loss": 0.5688, "step": 8250 }, { "epoch": 2.751834556370914, "grad_norm": 4.964454650878906, "step": 8250 }, { "epoch": 2.751834556370914, "learning_rate": 0.00022890605823355117, "step": 8250 }, { "epoch": 2.751834556370914, "loss": 0.48830074071884155, "step": 8250 }, { "ce_loss": 0.1908160150051117, "epoch": 2.751834556370914, "step": 8250 }, { "distill_loss": 0.14335381984710693, "epoch": 2.751834556370914, "step": 8250 }, { "epoch": 2.751834556370914, "ref_ce_loss": 0.15347762405872345, "step": 8250 }, { "epoch": 2.751834556370914, "loss": 0.34461137652397156, "step": 8250 }, { "ce_loss": 0.11253272742033005, "epoch": 2.751834556370914, "step": 8250 }, { "distill_loss": 0.10419323295354843, "epoch": 2.751834556370914, "step": 8250 }, { "epoch": 2.751834556370914, "ref_ce_loss": 0.07173644006252289, "step": 8250 }, { "epoch": 2.751834556370914, "loss": 0.48704829812049866, "step": 8250 }, { "ce_loss": 0.22168782353401184, "epoch": 2.751834556370914, "step": 8250 }, { "distill_loss": 0.11275321990251541, "epoch": 2.751834556370914, "step": 8250 }, { "epoch": 2.751834556370914, "ref_ce_loss": 0.12069100886583328, "step": 8250 }, { "epoch": 2.751834556370914, "loss": 0.253851979970932, "step": 8250 }, { "ce_loss": 0.08856004476547241, "epoch": 2.751834556370914, "step": 8250 }, { "distill_loss": 0.09050433337688446, "epoch": 2.751834556370914, "step": 8250 }, { "epoch": 2.751834556370914, "ref_ce_loss": 0.07416380196809769, "step": 8250 }, { "epoch": 2.7551701134089392, "loss": 0.509, "step": 8260 }, { "epoch": 2.7551701134089392, "grad_norm": 2.1522469520568848, "step": 8260 }, { "epoch": 2.7551701134089392, "learning_rate": 0.00022873371603359587, "step": 8260 }, { "epoch": 2.7551701134089392, "loss": 0.3628401756286621, "step": 8260 }, { "ce_loss": 0.06882809102535248, "epoch": 2.7551701134089392, "step": 8260 }, { "distill_loss": 0.11147104203701019, "epoch": 2.7551701134089392, "step": 8260 }, { "epoch": 2.7551701134089392, "ref_ce_loss": 0.1081618219614029, "step": 8260 }, { "epoch": 2.7551701134089392, "loss": 0.35818222165107727, "step": 8260 }, { "ce_loss": 0.09174513816833496, "epoch": 2.7551701134089392, "step": 8260 }, { "distill_loss": 0.11023850739002228, "epoch": 2.7551701134089392, "step": 8260 }, { "epoch": 2.7551701134089392, "ref_ce_loss": 0.10617993026971817, "step": 8260 }, { "epoch": 2.7551701134089392, "loss": 0.5432717800140381, "step": 8260 }, { "ce_loss": 0.19231824576854706, "epoch": 2.7551701134089392, "step": 8260 }, { "distill_loss": 0.15208996832370758, "epoch": 2.7551701134089392, "step": 8260 }, { "epoch": 2.7551701134089392, "ref_ce_loss": 0.12890948355197906, "step": 8260 }, { "epoch": 2.7551701134089392, "loss": 0.41913700103759766, "step": 8260 }, { "ce_loss": 0.14565613865852356, "epoch": 2.7551701134089392, "step": 8260 }, { "distill_loss": 0.11267106235027313, "epoch": 2.7551701134089392, "step": 8260 }, { "epoch": 2.7551701134089392, "ref_ce_loss": 0.11973301321268082, "step": 8260 }, { "epoch": 2.7585056704469646, "loss": 0.501, "step": 8270 }, { "epoch": 2.7585056704469646, "grad_norm": 2.99421763420105, "step": 8270 }, { "epoch": 2.7585056704469646, "learning_rate": 0.00022856123025440046, "step": 8270 }, { "epoch": 2.7585056704469646, "loss": 0.9480876326560974, "step": 8270 }, { "ce_loss": 0.17917704582214355, "epoch": 2.7585056704469646, "step": 8270 }, { "distill_loss": 0.16749121248722076, "epoch": 2.7585056704469646, "step": 8270 }, { "epoch": 2.7585056704469646, "ref_ce_loss": 0.15291234850883484, "step": 8270 }, { "epoch": 2.7585056704469646, "loss": 0.5085225105285645, "step": 8270 }, { "ce_loss": 0.19998572766780853, "epoch": 2.7585056704469646, "step": 8270 }, { "distill_loss": 0.15323159098625183, "epoch": 2.7585056704469646, "step": 8270 }, { "epoch": 2.7585056704469646, "ref_ce_loss": 0.15509232878684998, "step": 8270 }, { "epoch": 2.7585056704469646, "loss": 0.640496015548706, "step": 8270 }, { "ce_loss": 0.19523316621780396, "epoch": 2.7585056704469646, "step": 8270 }, { "distill_loss": 0.17683634161949158, "epoch": 2.7585056704469646, "step": 8270 }, { "epoch": 2.7585056704469646, "ref_ce_loss": 0.13698916137218475, "step": 8270 }, { "epoch": 2.7585056704469646, "loss": 0.47830960154533386, "step": 8270 }, { "ce_loss": 0.12321220338344574, "epoch": 2.7585056704469646, "step": 8270 }, { "distill_loss": 0.1424240916967392, "epoch": 2.7585056704469646, "step": 8270 }, { "epoch": 2.7585056704469646, "ref_ce_loss": 0.17582586407661438, "step": 8270 }, { "epoch": 2.76184122748499, "loss": 0.587, "step": 8280 }, { "epoch": 2.76184122748499, "grad_norm": 2.557399272918701, "step": 8280 }, { "epoch": 2.76184122748499, "learning_rate": 0.00022838860121051098, "step": 8280 }, { "epoch": 2.76184122748499, "loss": 0.5317304134368896, "step": 8280 }, { "ce_loss": 0.12961804866790771, "epoch": 2.76184122748499, "step": 8280 }, { "distill_loss": 0.1425638645887375, "epoch": 2.76184122748499, "step": 8280 }, { "epoch": 2.76184122748499, "ref_ce_loss": 0.10886183381080627, "step": 8280 }, { "epoch": 2.76184122748499, "loss": 0.6594823002815247, "step": 8280 }, { "ce_loss": 0.13466475903987885, "epoch": 2.76184122748499, "step": 8280 }, { "distill_loss": 0.1664295643568039, "epoch": 2.76184122748499, "step": 8280 }, { "epoch": 2.76184122748499, "ref_ce_loss": 0.10923914611339569, "step": 8280 }, { "epoch": 2.76184122748499, "loss": 0.3929463028907776, "step": 8280 }, { "ce_loss": 0.117137610912323, "epoch": 2.76184122748499, "step": 8280 }, { "distill_loss": 0.13499777019023895, "epoch": 2.76184122748499, "step": 8280 }, { "epoch": 2.76184122748499, "ref_ce_loss": 0.09361618757247925, "step": 8280 }, { "epoch": 2.76184122748499, "loss": 0.4497656226158142, "step": 8280 }, { "ce_loss": 0.11880599707365036, "epoch": 2.76184122748499, "step": 8280 }, { "distill_loss": 0.1401321291923523, "epoch": 2.76184122748499, "step": 8280 }, { "epoch": 2.76184122748499, "ref_ce_loss": 0.1344946324825287, "step": 8280 }, { "epoch": 2.7651767845230153, "loss": 0.5636, "step": 8290 }, { "epoch": 2.7651767845230153, "grad_norm": 2.3959028720855713, "step": 8290 }, { "epoch": 2.7651767845230153, "learning_rate": 0.0002282158292167346, "step": 8290 }, { "epoch": 2.7651767845230153, "loss": 0.3913949728012085, "step": 8290 }, { "ce_loss": 0.08157963305711746, "epoch": 2.7651767845230153, "step": 8290 }, { "distill_loss": 0.14165902137756348, "epoch": 2.7651767845230153, "step": 8290 }, { "epoch": 2.7651767845230153, "ref_ce_loss": 0.11059600859880447, "step": 8290 }, { "epoch": 2.7651767845230153, "loss": 0.6704709529876709, "step": 8290 }, { "ce_loss": 0.20668728649616241, "epoch": 2.7651767845230153, "step": 8290 }, { "distill_loss": 0.18045960366725922, "epoch": 2.7651767845230153, "step": 8290 }, { "epoch": 2.7651767845230153, "ref_ce_loss": 0.10442976653575897, "step": 8290 }, { "epoch": 2.7651767845230153, "loss": 0.5117354393005371, "step": 8290 }, { "ce_loss": 0.12252218276262283, "epoch": 2.7651767845230153, "step": 8290 }, { "distill_loss": 0.16555725038051605, "epoch": 2.7651767845230153, "step": 8290 }, { "epoch": 2.7651767845230153, "ref_ce_loss": 0.07605114579200745, "step": 8290 }, { "epoch": 2.7651767845230153, "loss": 0.4941989481449127, "step": 8290 }, { "ce_loss": 0.14770467579364777, "epoch": 2.7651767845230153, "step": 8290 }, { "distill_loss": 0.15609993040561676, "epoch": 2.7651767845230153, "step": 8290 }, { "epoch": 2.7651767845230153, "ref_ce_loss": 0.09984522312879562, "step": 8290 }, { "epoch": 2.7685123415610406, "loss": 0.5792, "step": 8300 }, { "epoch": 2.7685123415610406, "grad_norm": 2.6024227142333984, "step": 8300 }, { "epoch": 2.7685123415610406, "learning_rate": 0.0002280429145881394, "step": 8300 }, { "epoch": 2.7685123415610406, "loss": 0.3573533296585083, "step": 8300 }, { "ce_loss": 0.10002897679805756, "epoch": 2.7685123415610406, "step": 8300 }, { "distill_loss": 0.1016823798418045, "epoch": 2.7685123415610406, "step": 8300 }, { "epoch": 2.7685123415610406, "ref_ce_loss": 0.1555853933095932, "step": 8300 }, { "epoch": 2.7685123415610406, "loss": 0.35432833433151245, "step": 8300 }, { "ce_loss": 0.15280495584011078, "epoch": 2.7685123415610406, "step": 8300 }, { "distill_loss": 0.10303471982479095, "epoch": 2.7685123415610406, "step": 8300 }, { "epoch": 2.7685123415610406, "ref_ce_loss": 0.07455757260322571, "step": 8300 }, { "epoch": 2.7685123415610406, "loss": 0.5166060924530029, "step": 8300 }, { "ce_loss": 0.19417652487754822, "epoch": 2.7685123415610406, "step": 8300 }, { "distill_loss": 0.1766192615032196, "epoch": 2.7685123415610406, "step": 8300 }, { "epoch": 2.7685123415610406, "ref_ce_loss": 0.11216067522764206, "step": 8300 }, { "epoch": 2.7685123415610406, "loss": 0.3201092779636383, "step": 8300 }, { "ce_loss": 0.11336085945367813, "epoch": 2.7685123415610406, "step": 8300 }, { "distill_loss": 0.09292483329772949, "epoch": 2.7685123415610406, "step": 8300 }, { "epoch": 2.7685123415610406, "ref_ce_loss": 0.09047871828079224, "step": 8300 }, { "epoch": 2.771847898599066, "loss": 0.4947, "step": 8310 }, { "epoch": 2.771847898599066, "grad_norm": 2.7137691974639893, "step": 8310 }, { "epoch": 2.771847898599066, "learning_rate": 0.00022786985764005344, "step": 8310 }, { "epoch": 2.771847898599066, "loss": 0.7143236398696899, "step": 8310 }, { "ce_loss": 0.26414355635643005, "epoch": 2.771847898599066, "step": 8310 }, { "distill_loss": 0.1972856968641281, "epoch": 2.771847898599066, "step": 8310 }, { "epoch": 2.771847898599066, "ref_ce_loss": 0.21074679493904114, "step": 8310 }, { "epoch": 2.771847898599066, "loss": 0.3790608048439026, "step": 8310 }, { "ce_loss": 0.10393885523080826, "epoch": 2.771847898599066, "step": 8310 }, { "distill_loss": 0.14341627061367035, "epoch": 2.771847898599066, "step": 8310 }, { "epoch": 2.771847898599066, "ref_ce_loss": 0.08679898828268051, "step": 8310 }, { "epoch": 2.771847898599066, "loss": 0.6864500045776367, "step": 8310 }, { "ce_loss": 0.23026344180107117, "epoch": 2.771847898599066, "step": 8310 }, { "distill_loss": 0.23133046925067902, "epoch": 2.771847898599066, "step": 8310 }, { "epoch": 2.771847898599066, "ref_ce_loss": 0.13559125363826752, "step": 8310 }, { "epoch": 2.771847898599066, "loss": 0.5029471516609192, "step": 8310 }, { "ce_loss": 0.045588862150907516, "epoch": 2.771847898599066, "step": 8310 }, { "distill_loss": 0.11035850644111633, "epoch": 2.771847898599066, "step": 8310 }, { "epoch": 2.771847898599066, "ref_ce_loss": 0.05270203575491905, "step": 8310 }, { "epoch": 2.7751834556370913, "loss": 0.6118, "step": 8320 }, { "epoch": 2.7751834556370913, "grad_norm": 2.5758280754089355, "step": 8320 }, { "epoch": 2.7751834556370913, "learning_rate": 0.0002276966586880642, "step": 8320 }, { "epoch": 2.7751834556370913, "loss": 0.5190055966377258, "step": 8320 }, { "ce_loss": 0.17583709955215454, "epoch": 2.7751834556370913, "step": 8320 }, { "distill_loss": 0.1923707276582718, "epoch": 2.7751834556370913, "step": 8320 }, { "epoch": 2.7751834556370913, "ref_ce_loss": 0.15066786110401154, "step": 8320 }, { "epoch": 2.7751834556370913, "loss": 0.43477120995521545, "step": 8320 }, { "ce_loss": 0.12779302895069122, "epoch": 2.7751834556370913, "step": 8320 }, { "distill_loss": 0.1646592915058136, "epoch": 2.7751834556370913, "step": 8320 }, { "epoch": 2.7751834556370913, "ref_ce_loss": 0.14228765666484833, "step": 8320 }, { "epoch": 2.7751834556370913, "loss": 0.5125862956047058, "step": 8320 }, { "ce_loss": 0.22525674104690552, "epoch": 2.7751834556370913, "step": 8320 }, { "distill_loss": 0.15953534841537476, "epoch": 2.7751834556370913, "step": 8320 }, { "epoch": 2.7751834556370913, "ref_ce_loss": 0.12673068046569824, "step": 8320 }, { "epoch": 2.7751834556370913, "loss": 0.47959643602371216, "step": 8320 }, { "ce_loss": 0.14553089439868927, "epoch": 2.7751834556370913, "step": 8320 }, { "distill_loss": 0.15800155699253082, "epoch": 2.7751834556370913, "step": 8320 }, { "epoch": 2.7751834556370913, "ref_ce_loss": 0.07305330038070679, "step": 8320 }, { "epoch": 2.7785190126751167, "loss": 0.6519, "step": 8330 }, { "epoch": 2.7785190126751167, "grad_norm": 31.961301803588867, "step": 8330 }, { "epoch": 2.7785190126751167, "learning_rate": 0.00022752331804801843, "step": 8330 }, { "epoch": 2.7785190126751167, "loss": 0.3530633747577667, "step": 8330 }, { "ce_loss": 0.1247534304857254, "epoch": 2.7785190126751167, "step": 8330 }, { "distill_loss": 0.0954834520816803, "epoch": 2.7785190126751167, "step": 8330 }, { "epoch": 2.7785190126751167, "ref_ce_loss": 0.13278992474079132, "step": 8330 }, { "epoch": 2.7785190126751167, "loss": 0.5396177172660828, "step": 8330 }, { "ce_loss": 0.233219712972641, "epoch": 2.7785190126751167, "step": 8330 }, { "distill_loss": 0.11943552643060684, "epoch": 2.7785190126751167, "step": 8330 }, { "epoch": 2.7785190126751167, "ref_ce_loss": 0.14556007087230682, "step": 8330 }, { "epoch": 2.7785190126751167, "loss": 0.814037561416626, "step": 8330 }, { "ce_loss": 0.34219565987586975, "epoch": 2.7785190126751167, "step": 8330 }, { "distill_loss": 0.1418711394071579, "epoch": 2.7785190126751167, "step": 8330 }, { "epoch": 2.7785190126751167, "ref_ce_loss": 0.17148993909358978, "step": 8330 }, { "epoch": 2.7785190126751167, "loss": 0.6103036403656006, "step": 8330 }, { "ce_loss": 0.2459089308977127, "epoch": 2.7785190126751167, "step": 8330 }, { "distill_loss": 0.12533476948738098, "epoch": 2.7785190126751167, "step": 8330 }, { "epoch": 2.7785190126751167, "ref_ce_loss": 0.11649178713560104, "step": 8330 }, { "epoch": 2.781854569713142, "loss": 0.5584, "step": 8340 }, { "epoch": 2.781854569713142, "grad_norm": 2.948256492614746, "step": 8340 }, { "epoch": 2.781854569713142, "learning_rate": 0.000227349836036021, "step": 8340 }, { "epoch": 2.781854569713142, "loss": 0.8184687495231628, "step": 8340 }, { "ce_loss": 0.15261517465114594, "epoch": 2.781854569713142, "step": 8340 }, { "distill_loss": 0.15436916053295135, "epoch": 2.781854569713142, "step": 8340 }, { "epoch": 2.781854569713142, "ref_ce_loss": 0.08771957457065582, "step": 8340 }, { "epoch": 2.781854569713142, "loss": 0.32824552059173584, "step": 8340 }, { "ce_loss": 0.13067448139190674, "epoch": 2.781854569713142, "step": 8340 }, { "distill_loss": 0.1223086267709732, "epoch": 2.781854569713142, "step": 8340 }, { "epoch": 2.781854569713142, "ref_ce_loss": 0.07519922405481339, "step": 8340 }, { "epoch": 2.781854569713142, "loss": 0.4334113299846649, "step": 8340 }, { "ce_loss": 0.1789129078388214, "epoch": 2.781854569713142, "step": 8340 }, { "distill_loss": 0.1433398723602295, "epoch": 2.781854569713142, "step": 8340 }, { "epoch": 2.781854569713142, "ref_ce_loss": 0.08752244710922241, "step": 8340 }, { "epoch": 2.781854569713142, "loss": 0.585452139377594, "step": 8340 }, { "ce_loss": 0.2676447629928589, "epoch": 2.781854569713142, "step": 8340 }, { "distill_loss": 0.1586453765630722, "epoch": 2.781854569713142, "step": 8340 }, { "epoch": 2.781854569713142, "ref_ce_loss": 0.10227011144161224, "step": 8340 }, { "epoch": 2.7851901267511674, "loss": 0.5426, "step": 8350 }, { "epoch": 2.7851901267511674, "grad_norm": 3.5256121158599854, "step": 8350 }, { "epoch": 2.7851901267511674, "learning_rate": 0.0002271762129684346, "step": 8350 }, { "epoch": 2.7851901267511674, "loss": 0.5500671863555908, "step": 8350 }, { "ce_loss": 0.1510569006204605, "epoch": 2.7851901267511674, "step": 8350 }, { "distill_loss": 0.12426368147134781, "epoch": 2.7851901267511674, "step": 8350 }, { "epoch": 2.7851901267511674, "ref_ce_loss": 0.177829310297966, "step": 8350 }, { "epoch": 2.7851901267511674, "loss": 0.4813498258590698, "step": 8350 }, { "ce_loss": 0.15770815312862396, "epoch": 2.7851901267511674, "step": 8350 }, { "distill_loss": 0.11759153753519058, "epoch": 2.7851901267511674, "step": 8350 }, { "epoch": 2.7851901267511674, "ref_ce_loss": 0.10133638978004456, "step": 8350 }, { "epoch": 2.7851901267511674, "loss": 0.6662505865097046, "step": 8350 }, { "ce_loss": 0.11393112689256668, "epoch": 2.7851901267511674, "step": 8350 }, { "distill_loss": 0.10494688153266907, "epoch": 2.7851901267511674, "step": 8350 }, { "epoch": 2.7851901267511674, "ref_ce_loss": 0.08218897879123688, "step": 8350 }, { "epoch": 2.7851901267511674, "loss": 0.5542718172073364, "step": 8350 }, { "ce_loss": 0.15418872237205505, "epoch": 2.7851901267511674, "step": 8350 }, { "distill_loss": 0.1185954362154007, "epoch": 2.7851901267511674, "step": 8350 }, { "epoch": 2.7851901267511674, "ref_ce_loss": 0.16383036971092224, "step": 8350 }, { "epoch": 2.7885256837891927, "loss": 0.5317, "step": 8360 }, { "epoch": 2.7885256837891927, "grad_norm": 2.987302303314209, "step": 8360 }, { "epoch": 2.7885256837891927, "learning_rate": 0.00022700244916187934, "step": 8360 }, { "epoch": 2.7885256837891927, "loss": 0.5641549229621887, "step": 8360 }, { "ce_loss": 0.1565759927034378, "epoch": 2.7885256837891927, "step": 8360 }, { "distill_loss": 0.14667509496212006, "epoch": 2.7885256837891927, "step": 8360 }, { "epoch": 2.7885256837891927, "ref_ce_loss": 0.15718014538288116, "step": 8360 }, { "epoch": 2.7885256837891927, "loss": 0.2849089503288269, "step": 8360 }, { "ce_loss": 0.07774198800325394, "epoch": 2.7885256837891927, "step": 8360 }, { "distill_loss": 0.09604795277118683, "epoch": 2.7885256837891927, "step": 8360 }, { "epoch": 2.7885256837891927, "ref_ce_loss": 0.05384528264403343, "step": 8360 }, { "epoch": 2.7885256837891927, "loss": 0.5093336701393127, "step": 8360 }, { "ce_loss": 0.20802956819534302, "epoch": 2.7885256837891927, "step": 8360 }, { "distill_loss": 0.15040580928325653, "epoch": 2.7885256837891927, "step": 8360 }, { "epoch": 2.7885256837891927, "ref_ce_loss": 0.15086127817630768, "step": 8360 }, { "epoch": 2.7885256837891927, "loss": 0.6868488788604736, "step": 8360 }, { "ce_loss": 0.10189005732536316, "epoch": 2.7885256837891927, "step": 8360 }, { "distill_loss": 0.142466738820076, "epoch": 2.7885256837891927, "step": 8360 }, { "epoch": 2.7885256837891927, "ref_ce_loss": 0.13379935920238495, "step": 8360 }, { "epoch": 2.791861240827218, "loss": 0.5346, "step": 8370 }, { "epoch": 2.791861240827218, "grad_norm": 3.1927130222320557, "step": 8370 }, { "epoch": 2.791861240827218, "learning_rate": 0.0002268285449332317, "step": 8370 }, { "epoch": 2.791861240827218, "loss": 0.9847922921180725, "step": 8370 }, { "ce_loss": 0.12005869299173355, "epoch": 2.791861240827218, "step": 8370 }, { "distill_loss": 0.1328408420085907, "epoch": 2.791861240827218, "step": 8370 }, { "epoch": 2.791861240827218, "ref_ce_loss": 0.1106012612581253, "step": 8370 }, { "epoch": 2.791861240827218, "loss": 0.3865183889865875, "step": 8370 }, { "ce_loss": 0.16827566921710968, "epoch": 2.791861240827218, "step": 8370 }, { "distill_loss": 0.10213863849639893, "epoch": 2.791861240827218, "step": 8370 }, { "epoch": 2.791861240827218, "ref_ce_loss": 0.11604784429073334, "step": 8370 }, { "epoch": 2.791861240827218, "loss": 0.7096348404884338, "step": 8370 }, { "ce_loss": 0.13443057239055634, "epoch": 2.791861240827218, "step": 8370 }, { "distill_loss": 0.11132686585187912, "epoch": 2.791861240827218, "step": 8370 }, { "epoch": 2.791861240827218, "ref_ce_loss": 0.10445878654718399, "step": 8370 }, { "epoch": 2.791861240827218, "loss": 0.6106353998184204, "step": 8370 }, { "ce_loss": 0.25030606985092163, "epoch": 2.791861240827218, "step": 8370 }, { "distill_loss": 0.1418341100215912, "epoch": 2.791861240827218, "step": 8370 }, { "epoch": 2.791861240827218, "ref_ce_loss": 0.1690225601196289, "step": 8370 }, { "epoch": 2.7951967978652434, "loss": 0.566, "step": 8380 }, { "epoch": 2.7951967978652434, "grad_norm": 2.1361160278320312, "step": 8380 }, { "epoch": 2.7951967978652434, "learning_rate": 0.00022665450059962457, "step": 8380 }, { "epoch": 2.7951967978652434, "loss": 0.547217845916748, "step": 8380 }, { "ce_loss": 0.13093866407871246, "epoch": 2.7951967978652434, "step": 8380 }, { "distill_loss": 0.13123691082000732, "epoch": 2.7951967978652434, "step": 8380 }, { "epoch": 2.7951967978652434, "ref_ce_loss": 0.18233349919319153, "step": 8380 }, { "epoch": 2.7951967978652434, "loss": 0.47471001744270325, "step": 8380 }, { "ce_loss": 0.1193649023771286, "epoch": 2.7951967978652434, "step": 8380 }, { "distill_loss": 0.13772958517074585, "epoch": 2.7951967978652434, "step": 8380 }, { "epoch": 2.7951967978652434, "ref_ce_loss": 0.14166118204593658, "step": 8380 }, { "epoch": 2.7951967978652434, "loss": 0.5118974447250366, "step": 8380 }, { "ce_loss": 0.19769543409347534, "epoch": 2.7951967978652434, "step": 8380 }, { "distill_loss": 0.109315425157547, "epoch": 2.7951967978652434, "step": 8380 }, { "epoch": 2.7951967978652434, "ref_ce_loss": 0.13785843551158905, "step": 8380 }, { "epoch": 2.7951967978652434, "loss": 0.27766746282577515, "step": 8380 }, { "ce_loss": 0.06863567233085632, "epoch": 2.7951967978652434, "step": 8380 }, { "distill_loss": 0.09863778203725815, "epoch": 2.7951967978652434, "step": 8380 }, { "epoch": 2.7951967978652434, "ref_ce_loss": 0.11031028628349304, "step": 8380 }, { "epoch": 2.798532354903269, "loss": 0.5368, "step": 8390 }, { "epoch": 2.798532354903269, "grad_norm": 3.499161958694458, "step": 8390 }, { "epoch": 2.798532354903269, "learning_rate": 0.00022648031647844606, "step": 8390 }, { "epoch": 2.798532354903269, "loss": 0.5296912789344788, "step": 8390 }, { "ce_loss": 0.12095022201538086, "epoch": 2.798532354903269, "step": 8390 }, { "distill_loss": 0.27068638801574707, "epoch": 2.798532354903269, "step": 8390 }, { "epoch": 2.798532354903269, "ref_ce_loss": 0.053802739828825, "step": 8390 }, { "epoch": 2.798532354903269, "loss": 0.5279688239097595, "step": 8390 }, { "ce_loss": 0.17168818414211273, "epoch": 2.798532354903269, "step": 8390 }, { "distill_loss": 0.1919613629579544, "epoch": 2.798532354903269, "step": 8390 }, { "epoch": 2.798532354903269, "ref_ce_loss": 0.14083920419216156, "step": 8390 }, { "epoch": 2.798532354903269, "loss": 0.5897954106330872, "step": 8390 }, { "ce_loss": 0.16287176311016083, "epoch": 2.798532354903269, "step": 8390 }, { "distill_loss": 0.24030818045139313, "epoch": 2.798532354903269, "step": 8390 }, { "epoch": 2.798532354903269, "ref_ce_loss": 0.10864845663309097, "step": 8390 }, { "epoch": 2.798532354903269, "loss": 0.5427618622779846, "step": 8390 }, { "ce_loss": 0.16201457381248474, "epoch": 2.798532354903269, "step": 8390 }, { "distill_loss": 0.28070592880249023, "epoch": 2.798532354903269, "step": 8390 }, { "epoch": 2.798532354903269, "ref_ce_loss": 0.09987376630306244, "step": 8390 }, { "epoch": 2.801867911941294, "loss": 0.562, "step": 8400 }, { "epoch": 2.801867911941294, "grad_norm": 2.6044695377349854, "step": 8400 }, { "epoch": 2.801867911941294, "learning_rate": 0.0002263059928873393, "step": 8400 }, { "epoch": 2.801867911941294, "loss": 0.5035500526428223, "step": 8400 }, { "ce_loss": 0.14720116555690765, "epoch": 2.801867911941294, "step": 8400 }, { "distill_loss": 0.1462814062833786, "epoch": 2.801867911941294, "step": 8400 }, { "epoch": 2.801867911941294, "ref_ce_loss": 0.10444889962673187, "step": 8400 }, { "epoch": 2.801867911941294, "loss": 0.4154942035675049, "step": 8400 }, { "ce_loss": 0.09291025251150131, "epoch": 2.801867911941294, "step": 8400 }, { "distill_loss": 0.12177328765392303, "epoch": 2.801867911941294, "step": 8400 }, { "epoch": 2.801867911941294, "ref_ce_loss": 0.11366147547960281, "step": 8400 }, { "epoch": 2.801867911941294, "loss": 0.7797332406044006, "step": 8400 }, { "ce_loss": 0.2732406556606293, "epoch": 2.801867911941294, "step": 8400 }, { "distill_loss": 0.3229169547557831, "epoch": 2.801867911941294, "step": 8400 }, { "epoch": 2.801867911941294, "ref_ce_loss": 0.18208087980747223, "step": 8400 }, { "epoch": 2.801867911941294, "loss": 0.37686821818351746, "step": 8400 }, { "ce_loss": 0.10438577085733414, "epoch": 2.801867911941294, "step": 8400 }, { "distill_loss": 0.19975148141384125, "epoch": 2.801867911941294, "step": 8400 }, { "epoch": 2.801867911941294, "ref_ce_loss": 0.07263541966676712, "step": 8400 }, { "epoch": 2.8052034689793195, "loss": 0.5641, "step": 8410 }, { "epoch": 2.8052034689793195, "grad_norm": 3.1431491374969482, "step": 8410 }, { "epoch": 2.8052034689793195, "learning_rate": 0.0002261315301442018, "step": 8410 }, { "epoch": 2.8052034689793195, "loss": 0.316400945186615, "step": 8410 }, { "ce_loss": 0.0983375534415245, "epoch": 2.8052034689793195, "step": 8410 }, { "distill_loss": 0.11273324489593506, "epoch": 2.8052034689793195, "step": 8410 }, { "epoch": 2.8052034689793195, "ref_ce_loss": 0.10528012365102768, "step": 8410 }, { "epoch": 2.8052034689793195, "loss": 0.3205793797969818, "step": 8410 }, { "ce_loss": 0.07206525653600693, "epoch": 2.8052034689793195, "step": 8410 }, { "distill_loss": 0.11558610200881958, "epoch": 2.8052034689793195, "step": 8410 }, { "epoch": 2.8052034689793195, "ref_ce_loss": 0.1002664789557457, "step": 8410 }, { "epoch": 2.8052034689793195, "loss": 0.44273141026496887, "step": 8410 }, { "ce_loss": 0.14200718700885773, "epoch": 2.8052034689793195, "step": 8410 }, { "distill_loss": 0.16369619965553284, "epoch": 2.8052034689793195, "step": 8410 }, { "epoch": 2.8052034689793195, "ref_ce_loss": 0.13701307773590088, "step": 8410 }, { "epoch": 2.8052034689793195, "loss": 0.3176065683364868, "step": 8410 }, { "ce_loss": 0.09353161603212357, "epoch": 2.8052034689793195, "step": 8410 }, { "distill_loss": 0.12242200970649719, "epoch": 2.8052034689793195, "step": 8410 }, { "epoch": 2.8052034689793195, "ref_ce_loss": 0.07452066242694855, "step": 8410 }, { "epoch": 2.808539026017345, "loss": 0.5706, "step": 8420 }, { "epoch": 2.808539026017345, "grad_norm": 4.882355213165283, "step": 8420 }, { "epoch": 2.808539026017345, "learning_rate": 0.00022595692856718474, "step": 8420 }, { "epoch": 2.808539026017345, "loss": 0.5670602321624756, "step": 8420 }, { "ce_loss": 0.15604117512702942, "epoch": 2.808539026017345, "step": 8420 }, { "distill_loss": 0.15158408880233765, "epoch": 2.808539026017345, "step": 8420 }, { "epoch": 2.808539026017345, "ref_ce_loss": 0.15006743371486664, "step": 8420 }, { "epoch": 2.808539026017345, "loss": 0.43875157833099365, "step": 8420 }, { "ce_loss": 0.10357891023159027, "epoch": 2.808539026017345, "step": 8420 }, { "distill_loss": 0.09662986546754837, "epoch": 2.808539026017345, "step": 8420 }, { "epoch": 2.808539026017345, "ref_ce_loss": 0.12920770049095154, "step": 8420 }, { "epoch": 2.808539026017345, "loss": 0.723873496055603, "step": 8420 }, { "ce_loss": 0.15056408941745758, "epoch": 2.808539026017345, "step": 8420 }, { "distill_loss": 0.12403174489736557, "epoch": 2.808539026017345, "step": 8420 }, { "epoch": 2.808539026017345, "ref_ce_loss": 0.11592728644609451, "step": 8420 }, { "epoch": 2.808539026017345, "loss": 0.34784793853759766, "step": 8420 }, { "ce_loss": 0.12006805092096329, "epoch": 2.808539026017345, "step": 8420 }, { "distill_loss": 0.07333897799253464, "epoch": 2.808539026017345, "step": 8420 }, { "epoch": 2.808539026017345, "ref_ce_loss": 0.07565838098526001, "step": 8420 }, { "epoch": 2.81187458305537, "loss": 0.5974, "step": 8430 }, { "epoch": 2.81187458305537, "grad_norm": 6.5593581199646, "step": 8430 }, { "epoch": 2.81187458305537, "learning_rate": 0.00022578218847469253, "step": 8430 }, { "epoch": 2.81187458305537, "loss": 0.38125699758529663, "step": 8430 }, { "ce_loss": 0.10962362587451935, "epoch": 2.81187458305537, "step": 8430 }, { "distill_loss": 0.09133778512477875, "epoch": 2.81187458305537, "step": 8430 }, { "epoch": 2.81187458305537, "ref_ce_loss": 0.08129521459341049, "step": 8430 }, { "epoch": 2.81187458305537, "loss": 0.5570087432861328, "step": 8430 }, { "ce_loss": 0.2145906537771225, "epoch": 2.81187458305537, "step": 8430 }, { "distill_loss": 0.14080560207366943, "epoch": 2.81187458305537, "step": 8430 }, { "epoch": 2.81187458305537, "ref_ce_loss": 0.10355406254529953, "step": 8430 }, { "epoch": 2.81187458305537, "loss": 0.3802940547466278, "step": 8430 }, { "ce_loss": 0.11159483343362808, "epoch": 2.81187458305537, "step": 8430 }, { "distill_loss": 0.07547979056835175, "epoch": 2.81187458305537, "step": 8430 }, { "epoch": 2.81187458305537, "ref_ce_loss": 0.08214235305786133, "step": 8430 }, { "epoch": 2.81187458305537, "loss": 0.29137495160102844, "step": 8430 }, { "ce_loss": 0.11853925883769989, "epoch": 2.81187458305537, "step": 8430 }, { "distill_loss": 0.10666890442371368, "epoch": 2.81187458305537, "step": 8430 }, { "epoch": 2.81187458305537, "ref_ce_loss": 0.06610703468322754, "step": 8430 }, { "epoch": 2.8152101400933955, "loss": 0.4931, "step": 8440 }, { "epoch": 2.8152101400933955, "grad_norm": 2.802873134613037, "step": 8440 }, { "epoch": 2.8152101400933955, "learning_rate": 0.00022560731018538222, "step": 8440 }, { "epoch": 2.8152101400933955, "loss": 0.25064149498939514, "step": 8440 }, { "ce_loss": 0.08421101421117783, "epoch": 2.8152101400933955, "step": 8440 }, { "distill_loss": 0.09479227662086487, "epoch": 2.8152101400933955, "step": 8440 }, { "epoch": 2.8152101400933955, "ref_ce_loss": 0.05247608572244644, "step": 8440 }, { "epoch": 2.8152101400933955, "loss": 0.532885730266571, "step": 8440 }, { "ce_loss": 0.2541835904121399, "epoch": 2.8152101400933955, "step": 8440 }, { "distill_loss": 0.16259463131427765, "epoch": 2.8152101400933955, "step": 8440 }, { "epoch": 2.8152101400933955, "ref_ce_loss": 0.09999735653400421, "step": 8440 }, { "epoch": 2.8152101400933955, "loss": 0.40857672691345215, "step": 8440 }, { "ce_loss": 0.15303872525691986, "epoch": 2.8152101400933955, "step": 8440 }, { "distill_loss": 0.1471904218196869, "epoch": 2.8152101400933955, "step": 8440 }, { "epoch": 2.8152101400933955, "ref_ce_loss": 0.1081681102514267, "step": 8440 }, { "epoch": 2.8152101400933955, "loss": 0.6053186058998108, "step": 8440 }, { "ce_loss": 0.23108021914958954, "epoch": 2.8152101400933955, "step": 8440 }, { "distill_loss": 0.21530982851982117, "epoch": 2.8152101400933955, "step": 8440 }, { "epoch": 2.8152101400933955, "ref_ce_loss": 0.11973507702350616, "step": 8440 }, { "epoch": 2.818545697131421, "loss": 0.5707, "step": 8450 }, { "epoch": 2.818545697131421, "grad_norm": 5.992776870727539, "step": 8450 }, { "epoch": 2.818545697131421, "learning_rate": 0.00022543229401816275, "step": 8450 }, { "epoch": 2.818545697131421, "loss": 0.5136671662330627, "step": 8450 }, { "ce_loss": 0.1235622763633728, "epoch": 2.818545697131421, "step": 8450 }, { "distill_loss": 0.12064239382743835, "epoch": 2.818545697131421, "step": 8450 }, { "epoch": 2.818545697131421, "ref_ce_loss": 0.09468799829483032, "step": 8450 }, { "epoch": 2.818545697131421, "loss": 0.8985491991043091, "step": 8450 }, { "ce_loss": 0.1961958259344101, "epoch": 2.818545697131421, "step": 8450 }, { "distill_loss": 0.18243150413036346, "epoch": 2.818545697131421, "step": 8450 }, { "epoch": 2.818545697131421, "ref_ce_loss": 0.10728978365659714, "step": 8450 }, { "epoch": 2.818545697131421, "loss": 0.5325456857681274, "step": 8450 }, { "ce_loss": 0.11725273728370667, "epoch": 2.818545697131421, "step": 8450 }, { "distill_loss": 0.12861433625221252, "epoch": 2.818545697131421, "step": 8450 }, { "epoch": 2.818545697131421, "ref_ce_loss": 0.11166153848171234, "step": 8450 }, { "epoch": 2.818545697131421, "loss": 0.5713732838630676, "step": 8450 }, { "ce_loss": 0.22279508411884308, "epoch": 2.818545697131421, "step": 8450 }, { "distill_loss": 0.176978200674057, "epoch": 2.818545697131421, "step": 8450 }, { "epoch": 2.818545697131421, "ref_ce_loss": 0.11572866886854172, "step": 8450 }, { "epoch": 2.8218812541694462, "loss": 0.5003, "step": 8460 }, { "epoch": 2.8218812541694462, "grad_norm": 1.9595544338226318, "step": 8460 }, { "epoch": 2.8218812541694462, "learning_rate": 0.00022525714029219453, "step": 8460 }, { "epoch": 2.8218812541694462, "loss": 0.421317994594574, "step": 8460 }, { "ce_loss": 0.1695428192615509, "epoch": 2.8218812541694462, "step": 8460 }, { "distill_loss": 0.11726689338684082, "epoch": 2.8218812541694462, "step": 8460 }, { "epoch": 2.8218812541694462, "ref_ce_loss": 0.13403357565402985, "step": 8460 }, { "epoch": 2.8218812541694462, "loss": 0.5824626684188843, "step": 8460 }, { "ce_loss": 0.2222665250301361, "epoch": 2.8218812541694462, "step": 8460 }, { "distill_loss": 0.10591816902160645, "epoch": 2.8218812541694462, "step": 8460 }, { "epoch": 2.8218812541694462, "ref_ce_loss": 0.11349005252122879, "step": 8460 }, { "epoch": 2.8218812541694462, "loss": 0.5425556302070618, "step": 8460 }, { "ce_loss": 0.18343253433704376, "epoch": 2.8218812541694462, "step": 8460 }, { "distill_loss": 0.14774461090564728, "epoch": 2.8218812541694462, "step": 8460 }, { "epoch": 2.8218812541694462, "ref_ce_loss": 0.11161774396896362, "step": 8460 }, { "epoch": 2.8218812541694462, "loss": 0.5546457767486572, "step": 8460 }, { "ce_loss": 0.1968740075826645, "epoch": 2.8218812541694462, "step": 8460 }, { "distill_loss": 0.15189194679260254, "epoch": 2.8218812541694462, "step": 8460 }, { "epoch": 2.8218812541694462, "ref_ce_loss": 0.14472328126430511, "step": 8460 }, { "epoch": 2.8252168112074716, "loss": 0.5254, "step": 8470 }, { "epoch": 2.8252168112074716, "grad_norm": 2.391209602355957, "step": 8470 }, { "epoch": 2.8252168112074716, "learning_rate": 0.00022508184932688903, "step": 8470 }, { "epoch": 2.8252168112074716, "loss": 0.9489675760269165, "step": 8470 }, { "ce_loss": 0.13908937573432922, "epoch": 2.8252168112074716, "step": 8470 }, { "distill_loss": 0.1329590082168579, "epoch": 2.8252168112074716, "step": 8470 }, { "epoch": 2.8252168112074716, "ref_ce_loss": 0.1170150637626648, "step": 8470 }, { "epoch": 2.8252168112074716, "loss": 0.6241788864135742, "step": 8470 }, { "ce_loss": 0.19099032878875732, "epoch": 2.8252168112074716, "step": 8470 }, { "distill_loss": 0.10044612735509872, "epoch": 2.8252168112074716, "step": 8470 }, { "epoch": 2.8252168112074716, "ref_ce_loss": 0.17403073608875275, "step": 8470 }, { "epoch": 2.8252168112074716, "loss": 0.4744298458099365, "step": 8470 }, { "ce_loss": 0.1622345745563507, "epoch": 2.8252168112074716, "step": 8470 }, { "distill_loss": 0.11944203078746796, "epoch": 2.8252168112074716, "step": 8470 }, { "epoch": 2.8252168112074716, "ref_ce_loss": 0.14652585983276367, "step": 8470 }, { "epoch": 2.8252168112074716, "loss": 0.38154834508895874, "step": 8470 }, { "ce_loss": 0.13473011553287506, "epoch": 2.8252168112074716, "step": 8470 }, { "distill_loss": 0.11211343854665756, "epoch": 2.8252168112074716, "step": 8470 }, { "epoch": 2.8252168112074716, "ref_ce_loss": 0.10368572175502777, "step": 8470 }, { "epoch": 2.828552368245497, "loss": 0.5224, "step": 8480 }, { "epoch": 2.828552368245497, "grad_norm": 3.044171094894409, "step": 8480 }, { "epoch": 2.828552368245497, "learning_rate": 0.00022490642144190774, "step": 8480 }, { "epoch": 2.828552368245497, "loss": 0.5300574898719788, "step": 8480 }, { "ce_loss": 0.18093335628509521, "epoch": 2.828552368245497, "step": 8480 }, { "distill_loss": 0.14212843775749207, "epoch": 2.828552368245497, "step": 8480 }, { "epoch": 2.828552368245497, "ref_ce_loss": 0.09574255347251892, "step": 8480 }, { "epoch": 2.828552368245497, "loss": 0.9047300815582275, "step": 8480 }, { "ce_loss": 0.23037096858024597, "epoch": 2.828552368245497, "step": 8480 }, { "distill_loss": 0.1440839022397995, "epoch": 2.828552368245497, "step": 8480 }, { "epoch": 2.828552368245497, "ref_ce_loss": 0.13643290102481842, "step": 8480 }, { "epoch": 2.828552368245497, "loss": 0.5250870585441589, "step": 8480 }, { "ce_loss": 0.10960116982460022, "epoch": 2.828552368245497, "step": 8480 }, { "distill_loss": 0.17659179866313934, "epoch": 2.828552368245497, "step": 8480 }, { "epoch": 2.828552368245497, "ref_ce_loss": 0.18103836476802826, "step": 8480 }, { "epoch": 2.828552368245497, "loss": 0.376802921295166, "step": 8480 }, { "ce_loss": 0.15248258411884308, "epoch": 2.828552368245497, "step": 8480 }, { "distill_loss": 0.10913614183664322, "epoch": 2.828552368245497, "step": 8480 }, { "epoch": 2.828552368245497, "ref_ce_loss": 0.11468908935785294, "step": 8480 }, { "epoch": 2.8318879252835223, "loss": 0.5157, "step": 8490 }, { "epoch": 2.8318879252835223, "grad_norm": 2.604387044906616, "step": 8490 }, { "epoch": 2.8318879252835223, "learning_rate": 0.00022473085695716183, "step": 8490 }, { "epoch": 2.8318879252835223, "loss": 0.3220893144607544, "step": 8490 }, { "ce_loss": 0.09508085250854492, "epoch": 2.8318879252835223, "step": 8490 }, { "distill_loss": 0.1276741623878479, "epoch": 2.8318879252835223, "step": 8490 }, { "epoch": 2.8318879252835223, "ref_ce_loss": 0.09848184138536453, "step": 8490 }, { "epoch": 2.8318879252835223, "loss": 0.732323408126831, "step": 8490 }, { "ce_loss": 0.1797773838043213, "epoch": 2.8318879252835223, "step": 8490 }, { "distill_loss": 0.12821955978870392, "epoch": 2.8318879252835223, "step": 8490 }, { "epoch": 2.8318879252835223, "ref_ce_loss": 0.1704844832420349, "step": 8490 }, { "epoch": 2.8318879252835223, "loss": 0.641586422920227, "step": 8490 }, { "ce_loss": 0.15142354369163513, "epoch": 2.8318879252835223, "step": 8490 }, { "distill_loss": 0.14353042840957642, "epoch": 2.8318879252835223, "step": 8490 }, { "epoch": 2.8318879252835223, "ref_ce_loss": 0.12409044802188873, "step": 8490 }, { "epoch": 2.8318879252835223, "loss": 0.5020027756690979, "step": 8490 }, { "ce_loss": 0.1881706863641739, "epoch": 2.8318879252835223, "step": 8490 }, { "distill_loss": 0.2028343379497528, "epoch": 2.8318879252835223, "step": 8490 }, { "epoch": 2.8318879252835223, "ref_ce_loss": 0.11063137650489807, "step": 8490 }, { "epoch": 2.8352234823215476, "loss": 0.5539, "step": 8500 }, { "epoch": 2.8352234823215476, "grad_norm": 3.075287342071533, "step": 8500 }, { "epoch": 2.8352234823215476, "learning_rate": 0.0002245551561928118, "step": 8500 }, { "epoch": 2.8352234823215476, "loss": 0.5149841904640198, "step": 8500 }, { "ce_loss": 0.18708448112010956, "epoch": 2.8352234823215476, "step": 8500 }, { "distill_loss": 0.18625451624393463, "epoch": 2.8352234823215476, "step": 8500 }, { "epoch": 2.8352234823215476, "ref_ce_loss": 0.10302000492811203, "step": 8500 }, { "epoch": 2.8352234823215476, "loss": 0.5088472962379456, "step": 8500 }, { "ce_loss": 0.18305392563343048, "epoch": 2.8352234823215476, "step": 8500 }, { "distill_loss": 0.12284497916698456, "epoch": 2.8352234823215476, "step": 8500 }, { "epoch": 2.8352234823215476, "ref_ce_loss": 0.14784277975559235, "step": 8500 }, { "epoch": 2.8352234823215476, "loss": 0.6096242070198059, "step": 8500 }, { "ce_loss": 0.18820351362228394, "epoch": 2.8352234823215476, "step": 8500 }, { "distill_loss": 0.1708454191684723, "epoch": 2.8352234823215476, "step": 8500 }, { "epoch": 2.8352234823215476, "ref_ce_loss": 0.11443863064050674, "step": 8500 }, { "epoch": 2.8352234823215476, "loss": 0.5093251466751099, "step": 8500 }, { "ce_loss": 0.23517775535583496, "epoch": 2.8352234823215476, "step": 8500 }, { "distill_loss": 0.1514825075864792, "epoch": 2.8352234823215476, "step": 8500 }, { "epoch": 2.8352234823215476, "ref_ce_loss": 0.12234192341566086, "step": 8500 }, { "epoch": 2.838559039359573, "loss": 0.556, "step": 8510 }, { "epoch": 2.838559039359573, "grad_norm": 2.9502902030944824, "step": 8510 }, { "epoch": 2.838559039359573, "learning_rate": 0.00022437931946926647, "step": 8510 }, { "epoch": 2.838559039359573, "loss": 0.49666211009025574, "step": 8510 }, { "ce_loss": 0.17453722655773163, "epoch": 2.838559039359573, "step": 8510 }, { "distill_loss": 0.18285943567752838, "epoch": 2.838559039359573, "step": 8510 }, { "epoch": 2.838559039359573, "ref_ce_loss": 0.13843190670013428, "step": 8510 }, { "epoch": 2.838559039359573, "loss": 0.6453526616096497, "step": 8510 }, { "ce_loss": 0.23797675967216492, "epoch": 2.838559039359573, "step": 8510 }, { "distill_loss": 0.1395404189825058, "epoch": 2.838559039359573, "step": 8510 }, { "epoch": 2.838559039359573, "ref_ce_loss": 0.1344192624092102, "step": 8510 }, { "epoch": 2.838559039359573, "loss": 0.5082492232322693, "step": 8510 }, { "ce_loss": 0.1746096909046173, "epoch": 2.838559039359573, "step": 8510 }, { "distill_loss": 0.17531262338161469, "epoch": 2.838559039359573, "step": 8510 }, { "epoch": 2.838559039359573, "ref_ce_loss": 0.13140884041786194, "step": 8510 }, { "epoch": 2.838559039359573, "loss": 0.758216381072998, "step": 8510 }, { "ce_loss": 0.20714110136032104, "epoch": 2.838559039359573, "step": 8510 }, { "distill_loss": 0.17534372210502625, "epoch": 2.838559039359573, "step": 8510 }, { "epoch": 2.838559039359573, "ref_ce_loss": 0.16164039075374603, "step": 8510 }, { "epoch": 2.8418945963975983, "loss": 0.558, "step": 8520 }, { "epoch": 2.8418945963975983, "grad_norm": 2.721283435821533, "step": 8520 }, { "epoch": 2.8418945963975983, "learning_rate": 0.00022420334710718267, "step": 8520 }, { "epoch": 2.8418945963975983, "loss": 0.607196569442749, "step": 8520 }, { "ce_loss": 0.21133388578891754, "epoch": 2.8418945963975983, "step": 8520 }, { "distill_loss": 0.12246505171060562, "epoch": 2.8418945963975983, "step": 8520 }, { "epoch": 2.8418945963975983, "ref_ce_loss": 0.16589756309986115, "step": 8520 }, { "epoch": 2.8418945963975983, "loss": 0.4256008565425873, "step": 8520 }, { "ce_loss": 0.12766288220882416, "epoch": 2.8418945963975983, "step": 8520 }, { "distill_loss": 0.16494876146316528, "epoch": 2.8418945963975983, "step": 8520 }, { "epoch": 2.8418945963975983, "ref_ce_loss": 0.09781324863433838, "step": 8520 }, { "epoch": 2.8418945963975983, "loss": 0.6566895246505737, "step": 8520 }, { "ce_loss": 0.15638019144535065, "epoch": 2.8418945963975983, "step": 8520 }, { "distill_loss": 0.16774064302444458, "epoch": 2.8418945963975983, "step": 8520 }, { "epoch": 2.8418945963975983, "ref_ce_loss": 0.11066441237926483, "step": 8520 }, { "epoch": 2.8418945963975983, "loss": 0.48018428683280945, "step": 8520 }, { "ce_loss": 0.19729620218276978, "epoch": 2.8418945963975983, "step": 8520 }, { "distill_loss": 0.16175724565982819, "epoch": 2.8418945963975983, "step": 8520 }, { "epoch": 2.8418945963975983, "ref_ce_loss": 0.11984910815954208, "step": 8520 }, { "epoch": 2.8452301534356237, "loss": 0.5809, "step": 8530 }, { "epoch": 2.8452301534356237, "grad_norm": 2.210233688354492, "step": 8530 }, { "epoch": 2.8452301534356237, "learning_rate": 0.00022402723942746466, "step": 8530 }, { "epoch": 2.8452301534356237, "loss": 0.5266987085342407, "step": 8530 }, { "ce_loss": 0.12296690046787262, "epoch": 2.8452301534356237, "step": 8530 }, { "distill_loss": 0.15192703902721405, "epoch": 2.8452301534356237, "step": 8530 }, { "epoch": 2.8452301534356237, "ref_ce_loss": 0.14122067391872406, "step": 8530 }, { "epoch": 2.8452301534356237, "loss": 0.7991921901702881, "step": 8530 }, { "ce_loss": 0.13369446992874146, "epoch": 2.8452301534356237, "step": 8530 }, { "distill_loss": 0.17547550797462463, "epoch": 2.8452301534356237, "step": 8530 }, { "epoch": 2.8452301534356237, "ref_ce_loss": 0.15770867466926575, "step": 8530 }, { "epoch": 2.8452301534356237, "loss": 0.5874632596969604, "step": 8530 }, { "ce_loss": 0.21728350222110748, "epoch": 2.8452301534356237, "step": 8530 }, { "distill_loss": 0.1586538851261139, "epoch": 2.8452301534356237, "step": 8530 }, { "epoch": 2.8452301534356237, "ref_ce_loss": 0.153802290558815, "step": 8530 }, { "epoch": 2.8452301534356237, "loss": 0.47910526394844055, "step": 8530 }, { "ce_loss": 0.15307433903217316, "epoch": 2.8452301534356237, "step": 8530 }, { "distill_loss": 0.1651879996061325, "epoch": 2.8452301534356237, "step": 8530 }, { "epoch": 2.8452301534356237, "ref_ce_loss": 0.11388645321130753, "step": 8530 }, { "epoch": 2.848565710473649, "loss": 0.5447, "step": 8540 }, { "epoch": 2.848565710473649, "grad_norm": 2.7099955081939697, "step": 8540 }, { "epoch": 2.848565710473649, "learning_rate": 0.0002238509967512632, "step": 8540 }, { "epoch": 2.848565710473649, "loss": 0.5581467151641846, "step": 8540 }, { "ce_loss": 0.18522784113883972, "epoch": 2.848565710473649, "step": 8540 }, { "distill_loss": 0.16879862546920776, "epoch": 2.848565710473649, "step": 8540 }, { "epoch": 2.848565710473649, "ref_ce_loss": 0.14357973635196686, "step": 8540 }, { "epoch": 2.848565710473649, "loss": 0.6665406227111816, "step": 8540 }, { "ce_loss": 0.25393146276474, "epoch": 2.848565710473649, "step": 8540 }, { "distill_loss": 0.14920659363269806, "epoch": 2.848565710473649, "step": 8540 }, { "epoch": 2.848565710473649, "ref_ce_loss": 0.18525826930999756, "step": 8540 }, { "epoch": 2.848565710473649, "loss": 0.5111413598060608, "step": 8540 }, { "ce_loss": 0.1705455332994461, "epoch": 2.848565710473649, "step": 8540 }, { "distill_loss": 0.14063940942287445, "epoch": 2.848565710473649, "step": 8540 }, { "epoch": 2.848565710473649, "ref_ce_loss": 0.09902207553386688, "step": 8540 }, { "epoch": 2.848565710473649, "loss": 0.350668728351593, "step": 8540 }, { "ce_loss": 0.14593736827373505, "epoch": 2.848565710473649, "step": 8540 }, { "distill_loss": 0.09514002501964569, "epoch": 2.848565710473649, "step": 8540 }, { "epoch": 2.848565710473649, "ref_ce_loss": 0.10897145420312881, "step": 8540 }, { "epoch": 2.8519012675116744, "loss": 0.5035, "step": 8550 }, { "epoch": 2.8519012675116744, "grad_norm": 2.573655128479004, "step": 8550 }, { "epoch": 2.8519012675116744, "learning_rate": 0.00022367461939997552, "step": 8550 }, { "epoch": 2.8519012675116744, "loss": 0.5167698264122009, "step": 8550 }, { "ce_loss": 0.13213999569416046, "epoch": 2.8519012675116744, "step": 8550 }, { "distill_loss": 0.14889588952064514, "epoch": 2.8519012675116744, "step": 8550 }, { "epoch": 2.8519012675116744, "ref_ce_loss": 0.12083609402179718, "step": 8550 }, { "epoch": 2.8519012675116744, "loss": 0.5313799977302551, "step": 8550 }, { "ce_loss": 0.19686628878116608, "epoch": 2.8519012675116744, "step": 8550 }, { "distill_loss": 0.14351768791675568, "epoch": 2.8519012675116744, "step": 8550 }, { "epoch": 2.8519012675116744, "ref_ce_loss": 0.14347802102565765, "step": 8550 }, { "epoch": 2.8519012675116744, "loss": 0.9321268796920776, "step": 8550 }, { "ce_loss": 0.16296499967575073, "epoch": 2.8519012675116744, "step": 8550 }, { "distill_loss": 0.14142106473445892, "epoch": 2.8519012675116744, "step": 8550 }, { "epoch": 2.8519012675116744, "ref_ce_loss": 0.10340377688407898, "step": 8550 }, { "epoch": 2.8519012675116744, "loss": 0.4632156491279602, "step": 8550 }, { "ce_loss": 0.18383440375328064, "epoch": 2.8519012675116744, "step": 8550 }, { "distill_loss": 0.1084541380405426, "epoch": 2.8519012675116744, "step": 8550 }, { "epoch": 2.8519012675116744, "ref_ce_loss": 0.10253027826547623, "step": 8550 }, { "epoch": 2.8552368245496997, "loss": 0.5808, "step": 8560 }, { "epoch": 2.8552368245496997, "grad_norm": 3.327956199645996, "step": 8560 }, { "epoch": 2.8552368245496997, "learning_rate": 0.00022349810769524436, "step": 8560 }, { "epoch": 2.8552368245496997, "loss": 0.4661470949649811, "step": 8560 }, { "ce_loss": 0.1542460322380066, "epoch": 2.8552368245496997, "step": 8560 }, { "distill_loss": 0.18235129117965698, "epoch": 2.8552368245496997, "step": 8560 }, { "epoch": 2.8552368245496997, "ref_ce_loss": 0.08700333535671234, "step": 8560 }, { "epoch": 2.8552368245496997, "loss": 0.573223352432251, "step": 8560 }, { "ce_loss": 0.1726389080286026, "epoch": 2.8552368245496997, "step": 8560 }, { "distill_loss": 0.16355568170547485, "epoch": 2.8552368245496997, "step": 8560 }, { "epoch": 2.8552368245496997, "ref_ce_loss": 0.13505931198596954, "step": 8560 }, { "epoch": 2.8552368245496997, "loss": 0.4333297610282898, "step": 8560 }, { "ce_loss": 0.1419287770986557, "epoch": 2.8552368245496997, "step": 8560 }, { "distill_loss": 0.19388234615325928, "epoch": 2.8552368245496997, "step": 8560 }, { "epoch": 2.8552368245496997, "ref_ce_loss": 0.09745925664901733, "step": 8560 }, { "epoch": 2.8552368245496997, "loss": 0.5763939023017883, "step": 8560 }, { "ce_loss": 0.10823322832584381, "epoch": 2.8552368245496997, "step": 8560 }, { "distill_loss": 0.14647899568080902, "epoch": 2.8552368245496997, "step": 8560 }, { "epoch": 2.8552368245496997, "ref_ce_loss": 0.15130794048309326, "step": 8560 }, { "epoch": 2.858572381587725, "loss": 0.5607, "step": 8570 }, { "epoch": 2.858572381587725, "grad_norm": 2.5268304347991943, "step": 8570 }, { "epoch": 2.858572381587725, "learning_rate": 0.00022332146195895735, "step": 8570 }, { "epoch": 2.858572381587725, "loss": 0.4295397996902466, "step": 8570 }, { "ce_loss": 0.1499362289905548, "epoch": 2.858572381587725, "step": 8570 }, { "distill_loss": 0.1379919946193695, "epoch": 2.858572381587725, "step": 8570 }, { "epoch": 2.858572381587725, "ref_ce_loss": 0.10437241941690445, "step": 8570 }, { "epoch": 2.858572381587725, "loss": 0.34164315462112427, "step": 8570 }, { "ce_loss": 0.08813903480768204, "epoch": 2.858572381587725, "step": 8570 }, { "distill_loss": 0.11740319430828094, "epoch": 2.858572381587725, "step": 8570 }, { "epoch": 2.858572381587725, "ref_ce_loss": 0.128663569688797, "step": 8570 }, { "epoch": 2.858572381587725, "loss": 0.4116790294647217, "step": 8570 }, { "ce_loss": 0.1062823235988617, "epoch": 2.858572381587725, "step": 8570 }, { "distill_loss": 0.11568897217512131, "epoch": 2.858572381587725, "step": 8570 }, { "epoch": 2.858572381587725, "ref_ce_loss": 0.14106427133083344, "step": 8570 }, { "epoch": 2.858572381587725, "loss": 0.44529253244400024, "step": 8570 }, { "ce_loss": 0.14703154563903809, "epoch": 2.858572381587725, "step": 8570 }, { "distill_loss": 0.12601909041404724, "epoch": 2.858572381587725, "step": 8570 }, { "epoch": 2.858572381587725, "ref_ce_loss": 0.1380028873682022, "step": 8570 }, { "epoch": 2.8619079386257504, "loss": 0.5535, "step": 8580 }, { "epoch": 2.8619079386257504, "grad_norm": 3.367523193359375, "step": 8580 }, { "epoch": 2.8619079386257504, "learning_rate": 0.00022314468251324673, "step": 8580 }, { "epoch": 2.8619079386257504, "loss": 0.3660839796066284, "step": 8580 }, { "ce_loss": 0.09687898308038712, "epoch": 2.8619079386257504, "step": 8580 }, { "distill_loss": 0.1386238932609558, "epoch": 2.8619079386257504, "step": 8580 }, { "epoch": 2.8619079386257504, "ref_ce_loss": 0.09564144909381866, "step": 8580 }, { "epoch": 2.8619079386257504, "loss": 0.5628844499588013, "step": 8580 }, { "ce_loss": 0.20551376044750214, "epoch": 2.8619079386257504, "step": 8580 }, { "distill_loss": 0.13932254910469055, "epoch": 2.8619079386257504, "step": 8580 }, { "epoch": 2.8619079386257504, "ref_ce_loss": 0.1882753074169159, "step": 8580 }, { "epoch": 2.8619079386257504, "loss": 0.3394158184528351, "step": 8580 }, { "ce_loss": 0.0823008194565773, "epoch": 2.8619079386257504, "step": 8580 }, { "distill_loss": 0.0964013859629631, "epoch": 2.8619079386257504, "step": 8580 }, { "epoch": 2.8619079386257504, "ref_ce_loss": 0.09652000665664673, "step": 8580 }, { "epoch": 2.8619079386257504, "loss": 0.2966267466545105, "step": 8580 }, { "ce_loss": 0.10649916529655457, "epoch": 2.8619079386257504, "step": 8580 }, { "distill_loss": 0.08755770325660706, "epoch": 2.8619079386257504, "step": 8580 }, { "epoch": 2.8619079386257504, "ref_ce_loss": 0.10237345099449158, "step": 8580 }, { "epoch": 2.865243495663776, "loss": 0.5546, "step": 8590 }, { "epoch": 2.865243495663776, "grad_norm": 4.27254581451416, "step": 8590 }, { "epoch": 2.865243495663776, "learning_rate": 0.0002229677696804884, "step": 8590 }, { "epoch": 2.865243495663776, "loss": 0.5530468821525574, "step": 8590 }, { "ce_loss": 0.14216595888137817, "epoch": 2.865243495663776, "step": 8590 }, { "distill_loss": 0.15951929986476898, "epoch": 2.865243495663776, "step": 8590 }, { "epoch": 2.865243495663776, "ref_ce_loss": 0.12193866074085236, "step": 8590 }, { "epoch": 2.865243495663776, "loss": 0.5057868957519531, "step": 8590 }, { "ce_loss": 0.14068260788917542, "epoch": 2.865243495663776, "step": 8590 }, { "distill_loss": 0.15862120687961578, "epoch": 2.865243495663776, "step": 8590 }, { "epoch": 2.865243495663776, "ref_ce_loss": 0.12405318766832352, "step": 8590 }, { "epoch": 2.865243495663776, "loss": 0.5077252388000488, "step": 8590 }, { "ce_loss": 0.06671689450740814, "epoch": 2.865243495663776, "step": 8590 }, { "distill_loss": 0.11470813304185867, "epoch": 2.865243495663776, "step": 8590 }, { "epoch": 2.865243495663776, "ref_ce_loss": 0.08538176864385605, "step": 8590 }, { "epoch": 2.865243495663776, "loss": 0.49542105197906494, "step": 8590 }, { "ce_loss": 0.1653415858745575, "epoch": 2.865243495663776, "step": 8590 }, { "distill_loss": 0.14084473252296448, "epoch": 2.865243495663776, "step": 8590 }, { "epoch": 2.865243495663776, "ref_ce_loss": 0.11440587788820267, "step": 8590 }, { "epoch": 2.868579052701801, "loss": 0.5217, "step": 8600 }, { "epoch": 2.868579052701801, "grad_norm": 2.1853246688842773, "step": 8600 }, { "epoch": 2.868579052701801, "learning_rate": 0.00022279072378330163, "step": 8600 }, { "epoch": 2.868579052701801, "loss": 0.5349823832511902, "step": 8600 }, { "ce_loss": 0.21476206183433533, "epoch": 2.868579052701801, "step": 8600 }, { "distill_loss": 0.12671056389808655, "epoch": 2.868579052701801, "step": 8600 }, { "epoch": 2.868579052701801, "ref_ce_loss": 0.16497483849525452, "step": 8600 }, { "epoch": 2.868579052701801, "loss": 0.5629706978797913, "step": 8600 }, { "ce_loss": 0.1847279965877533, "epoch": 2.868579052701801, "step": 8600 }, { "distill_loss": 0.13843335211277008, "epoch": 2.868579052701801, "step": 8600 }, { "epoch": 2.868579052701801, "ref_ce_loss": 0.14642126858234406, "step": 8600 }, { "epoch": 2.868579052701801, "loss": 0.6087683439254761, "step": 8600 }, { "ce_loss": 0.15599875152111053, "epoch": 2.868579052701801, "step": 8600 }, { "distill_loss": 0.12127721309661865, "epoch": 2.868579052701801, "step": 8600 }, { "epoch": 2.868579052701801, "ref_ce_loss": 0.11146479099988937, "step": 8600 }, { "epoch": 2.868579052701801, "loss": 0.48046231269836426, "step": 8600 }, { "ce_loss": 0.10944525897502899, "epoch": 2.868579052701801, "step": 8600 }, { "distill_loss": 0.1440810263156891, "epoch": 2.868579052701801, "step": 8600 }, { "epoch": 2.868579052701801, "ref_ce_loss": 0.1533537060022354, "step": 8600 }, { "epoch": 2.8719146097398265, "loss": 0.5264, "step": 8610 }, { "epoch": 2.8719146097398265, "grad_norm": 2.3612961769104004, "step": 8610 }, { "epoch": 2.8719146097398265, "learning_rate": 0.00022261354514454827, "step": 8610 }, { "epoch": 2.8719146097398265, "loss": 0.30287694931030273, "step": 8610 }, { "ce_loss": 0.05707375332713127, "epoch": 2.8719146097398265, "step": 8610 }, { "distill_loss": 0.08226659148931503, "epoch": 2.8719146097398265, "step": 8610 }, { "epoch": 2.8719146097398265, "ref_ce_loss": 0.07613756507635117, "step": 8610 }, { "epoch": 2.8719146097398265, "loss": 0.6359444856643677, "step": 8610 }, { "ce_loss": 0.25475236773490906, "epoch": 2.8719146097398265, "step": 8610 }, { "distill_loss": 0.16380392014980316, "epoch": 2.8719146097398265, "step": 8610 }, { "epoch": 2.8719146097398265, "ref_ce_loss": 0.18352565169334412, "step": 8610 }, { "epoch": 2.8719146097398265, "loss": 0.6667486429214478, "step": 8610 }, { "ce_loss": 0.26272544264793396, "epoch": 2.8719146097398265, "step": 8610 }, { "distill_loss": 0.08766535669565201, "epoch": 2.8719146097398265, "step": 8610 }, { "epoch": 2.8719146097398265, "ref_ce_loss": 0.1847449392080307, "step": 8610 }, { "epoch": 2.8719146097398265, "loss": 0.5750166177749634, "step": 8610 }, { "ce_loss": 0.23870283365249634, "epoch": 2.8719146097398265, "step": 8610 }, { "distill_loss": 0.09349337220191956, "epoch": 2.8719146097398265, "step": 8610 }, { "epoch": 2.8719146097398265, "ref_ce_loss": 0.12696757912635803, "step": 8610 }, { "epoch": 2.875250166777852, "loss": 0.5242, "step": 8620 }, { "epoch": 2.875250166777852, "grad_norm": 2.3082547187805176, "step": 8620 }, { "epoch": 2.875250166777852, "learning_rate": 0.0002224362340873323, "step": 8620 }, { "epoch": 2.875250166777852, "loss": 0.40311920642852783, "step": 8620 }, { "ce_loss": 0.15748664736747742, "epoch": 2.875250166777852, "step": 8620 }, { "distill_loss": 0.105941042304039, "epoch": 2.875250166777852, "step": 8620 }, { "epoch": 2.875250166777852, "ref_ce_loss": 0.09454337507486343, "step": 8620 }, { "epoch": 2.875250166777852, "loss": 0.6777150630950928, "step": 8620 }, { "ce_loss": 0.1892915666103363, "epoch": 2.875250166777852, "step": 8620 }, { "distill_loss": 0.09386925399303436, "epoch": 2.875250166777852, "step": 8620 }, { "epoch": 2.875250166777852, "ref_ce_loss": 0.13299092650413513, "step": 8620 }, { "epoch": 2.875250166777852, "loss": 0.4655272960662842, "step": 8620 }, { "ce_loss": 0.13446469604969025, "epoch": 2.875250166777852, "step": 8620 }, { "distill_loss": 0.09398121386766434, "epoch": 2.875250166777852, "step": 8620 }, { "epoch": 2.875250166777852, "ref_ce_loss": 0.09203039854764938, "step": 8620 }, { "epoch": 2.875250166777852, "loss": 0.27186840772628784, "step": 8620 }, { "ce_loss": 0.0731753557920456, "epoch": 2.875250166777852, "step": 8620 }, { "distill_loss": 0.09868510812520981, "epoch": 2.875250166777852, "step": 8620 }, { "epoch": 2.875250166777852, "ref_ce_loss": 0.06487337499856949, "step": 8620 }, { "epoch": 2.878585723815877, "loss": 0.5088, "step": 8630 }, { "epoch": 2.878585723815877, "grad_norm": 2.4868619441986084, "step": 8630 }, { "epoch": 2.878585723815877, "learning_rate": 0.000222258790934999, "step": 8630 }, { "epoch": 2.878585723815877, "loss": 0.5052057504653931, "step": 8630 }, { "ce_loss": 0.12210213392972946, "epoch": 2.878585723815877, "step": 8630 }, { "distill_loss": 0.09557357430458069, "epoch": 2.878585723815877, "step": 8630 }, { "epoch": 2.878585723815877, "ref_ce_loss": 0.19164274632930756, "step": 8630 }, { "epoch": 2.878585723815877, "loss": 0.8311383724212646, "step": 8630 }, { "ce_loss": 0.3233627378940582, "epoch": 2.878585723815877, "step": 8630 }, { "distill_loss": 0.16602776944637299, "epoch": 2.878585723815877, "step": 8630 }, { "epoch": 2.878585723815877, "ref_ce_loss": 0.16990318894386292, "step": 8630 }, { "epoch": 2.878585723815877, "loss": 0.3017067611217499, "step": 8630 }, { "ce_loss": 0.06859374791383743, "epoch": 2.878585723815877, "step": 8630 }, { "distill_loss": 0.11418160051107407, "epoch": 2.878585723815877, "step": 8630 }, { "epoch": 2.878585723815877, "ref_ce_loss": 0.0983542650938034, "step": 8630 }, { "epoch": 2.878585723815877, "loss": 0.45237842202186584, "step": 8630 }, { "ce_loss": 0.17673709988594055, "epoch": 2.878585723815877, "step": 8630 }, { "distill_loss": 0.11338216066360474, "epoch": 2.878585723815877, "step": 8630 }, { "epoch": 2.878585723815877, "ref_ce_loss": 0.11751232296228409, "step": 8630 }, { "epoch": 2.8819212808539025, "loss": 0.5389, "step": 8640 }, { "epoch": 2.8819212808539025, "grad_norm": 2.4058783054351807, "step": 8640 }, { "epoch": 2.8819212808539025, "learning_rate": 0.00022208121601113493, "step": 8640 }, { "epoch": 2.8819212808539025, "loss": 0.49721455574035645, "step": 8640 }, { "ce_loss": 0.1393839567899704, "epoch": 2.8819212808539025, "step": 8640 }, { "distill_loss": 0.12871664762496948, "epoch": 2.8819212808539025, "step": 8640 }, { "epoch": 2.8819212808539025, "ref_ce_loss": 0.11616840958595276, "step": 8640 }, { "epoch": 2.8819212808539025, "loss": 0.5112259387969971, "step": 8640 }, { "ce_loss": 0.12560014426708221, "epoch": 2.8819212808539025, "step": 8640 }, { "distill_loss": 0.1277984380722046, "epoch": 2.8819212808539025, "step": 8640 }, { "epoch": 2.8819212808539025, "ref_ce_loss": 0.11363084614276886, "step": 8640 }, { "epoch": 2.8819212808539025, "loss": 0.5464163422584534, "step": 8640 }, { "ce_loss": 0.20722423493862152, "epoch": 2.8819212808539025, "step": 8640 }, { "distill_loss": 0.1460934579372406, "epoch": 2.8819212808539025, "step": 8640 }, { "epoch": 2.8819212808539025, "ref_ce_loss": 0.14361216127872467, "step": 8640 }, { "epoch": 2.8819212808539025, "loss": 0.42684420943260193, "step": 8640 }, { "ce_loss": 0.1484360545873642, "epoch": 2.8819212808539025, "step": 8640 }, { "distill_loss": 0.10624878108501434, "epoch": 2.8819212808539025, "step": 8640 }, { "epoch": 2.8819212808539025, "ref_ce_loss": 0.119191475212574, "step": 8640 }, { "epoch": 2.885256837891928, "loss": 0.5271, "step": 8650 }, { "epoch": 2.885256837891928, "grad_norm": 3.429231643676758, "step": 8650 }, { "epoch": 2.885256837891928, "learning_rate": 0.00022190350963956652, "step": 8650 }, { "epoch": 2.885256837891928, "loss": 0.3691232204437256, "step": 8650 }, { "ce_loss": 0.1358044445514679, "epoch": 2.885256837891928, "step": 8650 }, { "distill_loss": 0.13041932880878448, "epoch": 2.885256837891928, "step": 8650 }, { "epoch": 2.885256837891928, "ref_ce_loss": 0.10271897912025452, "step": 8650 }, { "epoch": 2.885256837891928, "loss": 0.32777976989746094, "step": 8650 }, { "ce_loss": 0.12133575230836868, "epoch": 2.885256837891928, "step": 8650 }, { "distill_loss": 0.10511480271816254, "epoch": 2.885256837891928, "step": 8650 }, { "epoch": 2.885256837891928, "ref_ce_loss": 0.10117995738983154, "step": 8650 }, { "epoch": 2.885256837891928, "loss": 0.39452067017555237, "step": 8650 }, { "ce_loss": 0.18165063858032227, "epoch": 2.885256837891928, "step": 8650 }, { "distill_loss": 0.12682943046092987, "epoch": 2.885256837891928, "step": 8650 }, { "epoch": 2.885256837891928, "ref_ce_loss": 0.08599083125591278, "step": 8650 }, { "epoch": 2.885256837891928, "loss": 0.5932126045227051, "step": 8650 }, { "ce_loss": 0.18555493652820587, "epoch": 2.885256837891928, "step": 8650 }, { "distill_loss": 0.11078682541847229, "epoch": 2.885256837891928, "step": 8650 }, { "epoch": 2.885256837891928, "ref_ce_loss": 0.12941141426563263, "step": 8650 }, { "epoch": 2.8885923949299532, "loss": 0.5087, "step": 8660 }, { "epoch": 2.8885923949299532, "grad_norm": 3.141085386276245, "step": 8660 }, { "epoch": 2.8885923949299532, "learning_rate": 0.00022172567214436014, "step": 8660 }, { "epoch": 2.8885923949299532, "loss": 0.43682029843330383, "step": 8660 }, { "ce_loss": 0.18460293114185333, "epoch": 2.8885923949299532, "step": 8660 }, { "distill_loss": 0.09788291156291962, "epoch": 2.8885923949299532, "step": 8660 }, { "epoch": 2.8885923949299532, "ref_ce_loss": 0.11332383006811142, "step": 8660 }, { "epoch": 2.8885923949299532, "loss": 0.783757209777832, "step": 8660 }, { "ce_loss": 0.22142735123634338, "epoch": 2.8885923949299532, "step": 8660 }, { "distill_loss": 0.1374686062335968, "epoch": 2.8885923949299532, "step": 8660 }, { "epoch": 2.8885923949299532, "ref_ce_loss": 0.14957596361637115, "step": 8660 }, { "epoch": 2.8885923949299532, "loss": 0.4457992911338806, "step": 8660 }, { "ce_loss": 0.1515551507472992, "epoch": 2.8885923949299532, "step": 8660 }, { "distill_loss": 0.09355000406503677, "epoch": 2.8885923949299532, "step": 8660 }, { "epoch": 2.8885923949299532, "ref_ce_loss": 0.1352471560239792, "step": 8660 }, { "epoch": 2.8885923949299532, "loss": 0.430512011051178, "step": 8660 }, { "ce_loss": 0.129452183842659, "epoch": 2.8885923949299532, "step": 8660 }, { "distill_loss": 0.1036761999130249, "epoch": 2.8885923949299532, "step": 8660 }, { "epoch": 2.8885923949299532, "ref_ce_loss": 0.10788404196500778, "step": 8660 }, { "epoch": 2.8919279519679786, "loss": 0.5535, "step": 8670 }, { "epoch": 2.8919279519679786, "grad_norm": 3.5380215644836426, "step": 8670 }, { "epoch": 2.8919279519679786, "learning_rate": 0.0002215477038498213, "step": 8670 }, { "epoch": 2.8919279519679786, "loss": 0.6049115061759949, "step": 8670 }, { "ce_loss": 0.10785181075334549, "epoch": 2.8919279519679786, "step": 8670 }, { "distill_loss": 0.10674380511045456, "epoch": 2.8919279519679786, "step": 8670 }, { "epoch": 2.8919279519679786, "ref_ce_loss": 0.13331040740013123, "step": 8670 }, { "epoch": 2.8919279519679786, "loss": 0.8881338238716125, "step": 8670 }, { "ce_loss": 0.23795866966247559, "epoch": 2.8919279519679786, "step": 8670 }, { "distill_loss": 0.13352984189987183, "epoch": 2.8919279519679786, "step": 8670 }, { "epoch": 2.8919279519679786, "ref_ce_loss": 0.14402584731578827, "step": 8670 }, { "epoch": 2.8919279519679786, "loss": 0.4040985107421875, "step": 8670 }, { "ce_loss": 0.103199303150177, "epoch": 2.8919279519679786, "step": 8670 }, { "distill_loss": 0.09984700381755829, "epoch": 2.8919279519679786, "step": 8670 }, { "epoch": 2.8919279519679786, "ref_ce_loss": 0.14553777873516083, "step": 8670 }, { "epoch": 2.8919279519679786, "loss": 0.34721651673316956, "step": 8670 }, { "ce_loss": 0.15223266184329987, "epoch": 2.8919279519679786, "step": 8670 }, { "distill_loss": 0.11905837804079056, "epoch": 2.8919279519679786, "step": 8670 }, { "epoch": 2.8919279519679786, "ref_ce_loss": 0.07565715163946152, "step": 8670 }, { "epoch": 2.895263509006004, "loss": 0.4798, "step": 8680 }, { "epoch": 2.895263509006004, "grad_norm": 4.128468036651611, "step": 8680 }, { "epoch": 2.895263509006004, "learning_rate": 0.0002213696050804938, "step": 8680 }, { "epoch": 2.895263509006004, "loss": 0.576320230960846, "step": 8680 }, { "ce_loss": 0.1257760226726532, "epoch": 2.895263509006004, "step": 8680 }, { "distill_loss": 0.0989554151892662, "epoch": 2.895263509006004, "step": 8680 }, { "epoch": 2.895263509006004, "ref_ce_loss": 0.10930144041776657, "step": 8680 }, { "epoch": 2.895263509006004, "loss": 0.3506055474281311, "step": 8680 }, { "ce_loss": 0.06738583743572235, "epoch": 2.895263509006004, "step": 8680 }, { "distill_loss": 0.12411016970872879, "epoch": 2.895263509006004, "step": 8680 }, { "epoch": 2.895263509006004, "ref_ce_loss": 0.09874910861253738, "step": 8680 }, { "epoch": 2.895263509006004, "loss": 0.5777968168258667, "step": 8680 }, { "ce_loss": 0.190630704164505, "epoch": 2.895263509006004, "step": 8680 }, { "distill_loss": 0.11528972536325455, "epoch": 2.895263509006004, "step": 8680 }, { "epoch": 2.895263509006004, "ref_ce_loss": 0.11596431583166122, "step": 8680 }, { "epoch": 2.895263509006004, "loss": 0.6189478635787964, "step": 8680 }, { "ce_loss": 0.16328558325767517, "epoch": 2.895263509006004, "step": 8680 }, { "distill_loss": 0.13485164940357208, "epoch": 2.895263509006004, "step": 8680 }, { "epoch": 2.895263509006004, "ref_ce_loss": 0.15076056122779846, "step": 8680 }, { "epoch": 2.8985990660440293, "loss": 0.5104, "step": 8690 }, { "epoch": 2.8985990660440293, "grad_norm": 3.6191985607147217, "step": 8690 }, { "epoch": 2.8985990660440293, "learning_rate": 0.00022119137616115973, "step": 8690 }, { "epoch": 2.8985990660440293, "loss": 0.31480494141578674, "step": 8690 }, { "ce_loss": 0.09158436208963394, "epoch": 2.8985990660440293, "step": 8690 }, { "distill_loss": 0.1079840138554573, "epoch": 2.8985990660440293, "step": 8690 }, { "epoch": 2.8985990660440293, "ref_ce_loss": 0.06985091418027878, "step": 8690 }, { "epoch": 2.8985990660440293, "loss": 1.2861382961273193, "step": 8690 }, { "ce_loss": 0.29323792457580566, "epoch": 2.8985990660440293, "step": 8690 }, { "distill_loss": 0.10477292537689209, "epoch": 2.8985990660440293, "step": 8690 }, { "epoch": 2.8985990660440293, "ref_ce_loss": 0.18968039751052856, "step": 8690 }, { "epoch": 2.8985990660440293, "loss": 0.48443499207496643, "step": 8690 }, { "ce_loss": 0.2007952183485031, "epoch": 2.8985990660440293, "step": 8690 }, { "distill_loss": 0.0963059738278389, "epoch": 2.8985990660440293, "step": 8690 }, { "epoch": 2.8985990660440293, "ref_ce_loss": 0.14208665490150452, "step": 8690 }, { "epoch": 2.8985990660440293, "loss": 0.3987194895744324, "step": 8690 }, { "ce_loss": 0.13310086727142334, "epoch": 2.8985990660440293, "step": 8690 }, { "distill_loss": 0.09960196912288666, "epoch": 2.8985990660440293, "step": 8690 }, { "epoch": 2.8985990660440293, "ref_ce_loss": 0.0725984126329422, "step": 8690 }, { "epoch": 2.9019346230820546, "loss": 0.5389, "step": 8700 }, { "epoch": 2.9019346230820546, "grad_norm": 4.235092639923096, "step": 8700 }, { "epoch": 2.9019346230820546, "learning_rate": 0.0002210130174168382, "step": 8700 }, { "epoch": 2.9019346230820546, "loss": 0.32756808400154114, "step": 8700 }, { "ce_loss": 0.15331578254699707, "epoch": 2.9019346230820546, "step": 8700 }, { "distill_loss": 0.08755943179130554, "epoch": 2.9019346230820546, "step": 8700 }, { "epoch": 2.9019346230820546, "ref_ce_loss": 0.08651743829250336, "step": 8700 }, { "epoch": 2.9019346230820546, "loss": 0.7419276237487793, "step": 8700 }, { "ce_loss": 0.1952676922082901, "epoch": 2.9019346230820546, "step": 8700 }, { "distill_loss": 0.08272796869277954, "epoch": 2.9019346230820546, "step": 8700 }, { "epoch": 2.9019346230820546, "ref_ce_loss": 0.11741222441196442, "step": 8700 }, { "epoch": 2.9019346230820546, "loss": 0.42398780584335327, "step": 8700 }, { "ce_loss": 0.11199457943439484, "epoch": 2.9019346230820546, "step": 8700 }, { "distill_loss": 0.1243538111448288, "epoch": 2.9019346230820546, "step": 8700 }, { "epoch": 2.9019346230820546, "ref_ce_loss": 0.11631476879119873, "step": 8700 }, { "epoch": 2.9019346230820546, "loss": 0.43274134397506714, "step": 8700 }, { "ce_loss": 0.19206500053405762, "epoch": 2.9019346230820546, "step": 8700 }, { "distill_loss": 0.1058146059513092, "epoch": 2.9019346230820546, "step": 8700 }, { "epoch": 2.9019346230820546, "ref_ce_loss": 0.09739596396684647, "step": 8700 }, { "epoch": 2.90527018012008, "loss": 0.5265, "step": 8710 }, { "epoch": 2.90527018012008, "grad_norm": 2.5261056423187256, "step": 8710 }, { "epoch": 2.90527018012008, "learning_rate": 0.00022083452917278528, "step": 8710 }, { "epoch": 2.90527018012008, "loss": 0.6119447350502014, "step": 8710 }, { "ce_loss": 0.16569578647613525, "epoch": 2.90527018012008, "step": 8710 }, { "distill_loss": 0.09654286503791809, "epoch": 2.90527018012008, "step": 8710 }, { "epoch": 2.90527018012008, "ref_ce_loss": 0.14469215273857117, "step": 8710 }, { "epoch": 2.90527018012008, "loss": 0.42332643270492554, "step": 8710 }, { "ce_loss": 0.16344575583934784, "epoch": 2.90527018012008, "step": 8710 }, { "distill_loss": 0.0723377913236618, "epoch": 2.90527018012008, "step": 8710 }, { "epoch": 2.90527018012008, "ref_ce_loss": 0.15298517048358917, "step": 8710 }, { "epoch": 2.90527018012008, "loss": 0.3651186227798462, "step": 8710 }, { "ce_loss": 0.12102488428354263, "epoch": 2.90527018012008, "step": 8710 }, { "distill_loss": 0.08429036289453506, "epoch": 2.90527018012008, "step": 8710 }, { "epoch": 2.90527018012008, "ref_ce_loss": 0.09180473536252975, "step": 8710 }, { "epoch": 2.90527018012008, "loss": 0.5617263317108154, "step": 8710 }, { "ce_loss": 0.2145652323961258, "epoch": 2.90527018012008, "step": 8710 }, { "distill_loss": 0.11610198765993118, "epoch": 2.90527018012008, "step": 8710 }, { "epoch": 2.90527018012008, "ref_ce_loss": 0.17532667517662048, "step": 8710 }, { "epoch": 2.9086057371581053, "loss": 0.4692, "step": 8720 }, { "epoch": 2.9086057371581053, "grad_norm": 2.6236250400543213, "step": 8720 }, { "epoch": 2.9086057371581053, "learning_rate": 0.00022065591175449305, "step": 8720 }, { "epoch": 2.9086057371581053, "loss": 0.6628434658050537, "step": 8720 }, { "ce_loss": 0.23668566346168518, "epoch": 2.9086057371581053, "step": 8720 }, { "distill_loss": 0.13840903341770172, "epoch": 2.9086057371581053, "step": 8720 }, { "epoch": 2.9086057371581053, "ref_ce_loss": 0.1786912977695465, "step": 8720 }, { "epoch": 2.9086057371581053, "loss": 0.35061073303222656, "step": 8720 }, { "ce_loss": 0.11093682795763016, "epoch": 2.9086057371581053, "step": 8720 }, { "distill_loss": 0.10949292778968811, "epoch": 2.9086057371581053, "step": 8720 }, { "epoch": 2.9086057371581053, "ref_ce_loss": 0.0809154137969017, "step": 8720 }, { "epoch": 2.9086057371581053, "loss": 0.3698574900627136, "step": 8720 }, { "ce_loss": 0.12995347380638123, "epoch": 2.9086057371581053, "step": 8720 }, { "distill_loss": 0.11510443687438965, "epoch": 2.9086057371581053, "step": 8720 }, { "epoch": 2.9086057371581053, "ref_ce_loss": 0.12470285594463348, "step": 8720 }, { "epoch": 2.9086057371581053, "loss": 0.332610160112381, "step": 8720 }, { "ce_loss": 0.13286007940769196, "epoch": 2.9086057371581053, "step": 8720 }, { "distill_loss": 0.09024094045162201, "epoch": 2.9086057371581053, "step": 8720 }, { "epoch": 2.9086057371581053, "ref_ce_loss": 0.10945945233106613, "step": 8720 }, { "epoch": 2.9119412941961307, "loss": 0.4866, "step": 8730 }, { "epoch": 2.9119412941961307, "grad_norm": 2.74001145362854, "step": 8730 }, { "epoch": 2.9119412941961307, "learning_rate": 0.00022047716548768934, "step": 8730 }, { "epoch": 2.9119412941961307, "loss": 0.5908645391464233, "step": 8730 }, { "ce_loss": 0.15341097116470337, "epoch": 2.9119412941961307, "step": 8730 }, { "distill_loss": 0.09469150006771088, "epoch": 2.9119412941961307, "step": 8730 }, { "epoch": 2.9119412941961307, "ref_ce_loss": 0.0913006067276001, "step": 8730 }, { "epoch": 2.9119412941961307, "loss": 0.3112438917160034, "step": 8730 }, { "ce_loss": 0.11778094619512558, "epoch": 2.9119412941961307, "step": 8730 }, { "distill_loss": 0.09843090176582336, "epoch": 2.9119412941961307, "step": 8730 }, { "epoch": 2.9119412941961307, "ref_ce_loss": 0.09495732933282852, "step": 8730 }, { "epoch": 2.9119412941961307, "loss": 0.6640909910202026, "step": 8730 }, { "ce_loss": 0.12501554191112518, "epoch": 2.9119412941961307, "step": 8730 }, { "distill_loss": 0.1084146797657013, "epoch": 2.9119412941961307, "step": 8730 }, { "epoch": 2.9119412941961307, "ref_ce_loss": 0.16660518944263458, "step": 8730 }, { "epoch": 2.9119412941961307, "loss": 0.3979160487651825, "step": 8730 }, { "ce_loss": 0.0966411754488945, "epoch": 2.9119412941961307, "step": 8730 }, { "distill_loss": 0.10279390215873718, "epoch": 2.9119412941961307, "step": 8730 }, { "epoch": 2.9119412941961307, "ref_ce_loss": 0.0864546075463295, "step": 8730 }, { "epoch": 2.915276851234156, "loss": 0.5306, "step": 8740 }, { "epoch": 2.915276851234156, "grad_norm": 4.032469272613525, "step": 8740 }, { "epoch": 2.915276851234156, "learning_rate": 0.0002202982906983367, "step": 8740 }, { "epoch": 2.915276851234156, "loss": 0.35388892889022827, "step": 8740 }, { "ce_loss": 0.11793344467878342, "epoch": 2.915276851234156, "step": 8740 }, { "distill_loss": 0.09678888320922852, "epoch": 2.915276851234156, "step": 8740 }, { "epoch": 2.915276851234156, "ref_ce_loss": 0.10981198400259018, "step": 8740 }, { "epoch": 2.915276851234156, "loss": 0.2746281921863556, "step": 8740 }, { "ce_loss": 0.07362538576126099, "epoch": 2.915276851234156, "step": 8740 }, { "distill_loss": 0.10906033217906952, "epoch": 2.915276851234156, "step": 8740 }, { "epoch": 2.915276851234156, "ref_ce_loss": 0.09183456748723984, "step": 8740 }, { "epoch": 2.915276851234156, "loss": 0.4105778932571411, "step": 8740 }, { "ce_loss": 0.12222662568092346, "epoch": 2.915276851234156, "step": 8740 }, { "distill_loss": 0.11339154839515686, "epoch": 2.915276851234156, "step": 8740 }, { "epoch": 2.915276851234156, "ref_ce_loss": 0.12201227992773056, "step": 8740 }, { "epoch": 2.915276851234156, "loss": 0.5338358879089355, "step": 8740 }, { "ce_loss": 0.07666455209255219, "epoch": 2.915276851234156, "step": 8740 }, { "distill_loss": 0.09451141953468323, "epoch": 2.915276851234156, "step": 8740 }, { "epoch": 2.915276851234156, "ref_ce_loss": 0.09166429936885834, "step": 8740 }, { "epoch": 2.9186124082721814, "loss": 0.4786, "step": 8750 }, { "epoch": 2.9186124082721814, "grad_norm": 4.57519006729126, "step": 8750 }, { "epoch": 2.9186124082721814, "learning_rate": 0.00022011928771263227, "step": 8750 }, { "epoch": 2.9186124082721814, "loss": 0.4377441704273224, "step": 8750 }, { "ce_loss": 0.09397535771131516, "epoch": 2.9186124082721814, "step": 8750 }, { "distill_loss": 0.08135636150836945, "epoch": 2.9186124082721814, "step": 8750 }, { "epoch": 2.9186124082721814, "ref_ce_loss": 0.0735674798488617, "step": 8750 }, { "epoch": 2.9186124082721814, "loss": 0.5145371556282043, "step": 8750 }, { "ce_loss": 0.18467354774475098, "epoch": 2.9186124082721814, "step": 8750 }, { "distill_loss": 0.1418350636959076, "epoch": 2.9186124082721814, "step": 8750 }, { "epoch": 2.9186124082721814, "ref_ce_loss": 0.13718147575855255, "step": 8750 }, { "epoch": 2.9186124082721814, "loss": 0.43943333625793457, "step": 8750 }, { "ce_loss": 0.1606380194425583, "epoch": 2.9186124082721814, "step": 8750 }, { "distill_loss": 0.12608714401721954, "epoch": 2.9186124082721814, "step": 8750 }, { "epoch": 2.9186124082721814, "ref_ce_loss": 0.11991623789072037, "step": 8750 }, { "epoch": 2.9186124082721814, "loss": 0.34229615330696106, "step": 8750 }, { "ce_loss": 0.13352441787719727, "epoch": 2.9186124082721814, "step": 8750 }, { "distill_loss": 0.11652510613203049, "epoch": 2.9186124082721814, "step": 8750 }, { "epoch": 2.9186124082721814, "ref_ce_loss": 0.09219865500926971, "step": 8750 }, { "epoch": 2.9219479653102067, "loss": 0.5343, "step": 8760 }, { "epoch": 2.9219479653102067, "grad_norm": 2.5483882427215576, "step": 8760 }, { "epoch": 2.9219479653102067, "learning_rate": 0.00021994015685700686, "step": 8760 }, { "epoch": 2.9219479653102067, "loss": 0.45735830068588257, "step": 8760 }, { "ce_loss": 0.2106165885925293, "epoch": 2.9219479653102067, "step": 8760 }, { "distill_loss": 0.11669822037220001, "epoch": 2.9219479653102067, "step": 8760 }, { "epoch": 2.9219479653102067, "ref_ce_loss": 0.08740370720624924, "step": 8760 }, { "epoch": 2.9219479653102067, "loss": 0.5426672101020813, "step": 8760 }, { "ce_loss": 0.14788727462291718, "epoch": 2.9219479653102067, "step": 8760 }, { "distill_loss": 0.08242390304803848, "epoch": 2.9219479653102067, "step": 8760 }, { "epoch": 2.9219479653102067, "ref_ce_loss": 0.09694897383451462, "step": 8760 }, { "epoch": 2.9219479653102067, "loss": 0.5055618286132812, "step": 8760 }, { "ce_loss": 0.12635177373886108, "epoch": 2.9219479653102067, "step": 8760 }, { "distill_loss": 0.12514209747314453, "epoch": 2.9219479653102067, "step": 8760 }, { "epoch": 2.9219479653102067, "ref_ce_loss": 0.18664951622486115, "step": 8760 }, { "epoch": 2.9219479653102067, "loss": 0.40916872024536133, "step": 8760 }, { "ce_loss": 0.17162492871284485, "epoch": 2.9219479653102067, "step": 8760 }, { "distill_loss": 0.11223581433296204, "epoch": 2.9219479653102067, "step": 8760 }, { "epoch": 2.9219479653102067, "ref_ce_loss": 0.08056915551424026, "step": 8760 }, { "epoch": 2.925283522348232, "loss": 0.4792, "step": 8770 }, { "epoch": 2.925283522348232, "grad_norm": 3.125194787979126, "step": 8770 }, { "epoch": 2.925283522348232, "learning_rate": 0.00021976089845812438, "step": 8770 }, { "epoch": 2.925283522348232, "loss": 0.8893252611160278, "step": 8770 }, { "ce_loss": 0.2755357027053833, "epoch": 2.925283522348232, "step": 8770 }, { "distill_loss": 0.15055133402347565, "epoch": 2.925283522348232, "step": 8770 }, { "epoch": 2.925283522348232, "ref_ce_loss": 0.17915786802768707, "step": 8770 }, { "epoch": 2.925283522348232, "loss": 0.5901782512664795, "step": 8770 }, { "ce_loss": 0.08927714824676514, "epoch": 2.925283522348232, "step": 8770 }, { "distill_loss": 0.09461453557014465, "epoch": 2.925283522348232, "step": 8770 }, { "epoch": 2.925283522348232, "ref_ce_loss": 0.10167305171489716, "step": 8770 }, { "epoch": 2.925283522348232, "loss": 0.495090514421463, "step": 8770 }, { "ce_loss": 0.1624784767627716, "epoch": 2.925283522348232, "step": 8770 }, { "distill_loss": 0.138736754655838, "epoch": 2.925283522348232, "step": 8770 }, { "epoch": 2.925283522348232, "ref_ce_loss": 0.14611607789993286, "step": 8770 }, { "epoch": 2.925283522348232, "loss": 0.6977149248123169, "step": 8770 }, { "ce_loss": 0.1322113424539566, "epoch": 2.925283522348232, "step": 8770 }, { "distill_loss": 0.11863041669130325, "epoch": 2.925283522348232, "step": 8770 }, { "epoch": 2.925283522348232, "ref_ce_loss": 0.11613546311855316, "step": 8770 }, { "epoch": 2.9286190793862574, "loss": 0.5235, "step": 8780 }, { "epoch": 2.9286190793862574, "grad_norm": 3.383786201477051, "step": 8780 }, { "epoch": 2.9286190793862574, "learning_rate": 0.00021958151284288166, "step": 8780 }, { "epoch": 2.9286190793862574, "loss": 0.31101635098457336, "step": 8780 }, { "ce_loss": 0.12315681576728821, "epoch": 2.9286190793862574, "step": 8780 }, { "distill_loss": 0.10906527936458588, "epoch": 2.9286190793862574, "step": 8780 }, { "epoch": 2.9286190793862574, "ref_ce_loss": 0.07862290740013123, "step": 8780 }, { "epoch": 2.9286190793862574, "loss": 0.5271515846252441, "step": 8780 }, { "ce_loss": 0.1860402375459671, "epoch": 2.9286190793862574, "step": 8780 }, { "distill_loss": 0.16446442902088165, "epoch": 2.9286190793862574, "step": 8780 }, { "epoch": 2.9286190793862574, "ref_ce_loss": 0.1760958433151245, "step": 8780 }, { "epoch": 2.9286190793862574, "loss": 0.4236556887626648, "step": 8780 }, { "ce_loss": 0.17246517539024353, "epoch": 2.9286190793862574, "step": 8780 }, { "distill_loss": 0.09926985204219818, "epoch": 2.9286190793862574, "step": 8780 }, { "epoch": 2.9286190793862574, "ref_ce_loss": 0.09923750162124634, "step": 8780 }, { "epoch": 2.9286190793862574, "loss": 0.4667871594429016, "step": 8780 }, { "ce_loss": 0.20223020017147064, "epoch": 2.9286190793862574, "step": 8780 }, { "distill_loss": 0.13251884281635284, "epoch": 2.9286190793862574, "step": 8780 }, { "epoch": 2.9286190793862574, "ref_ce_loss": 0.1317136138677597, "step": 8780 }, { "epoch": 2.931954636424283, "loss": 0.4888, "step": 8790 }, { "epoch": 2.931954636424283, "grad_norm": 2.1957993507385254, "step": 8790 }, { "epoch": 2.931954636424283, "learning_rate": 0.00021940200033840714, "step": 8790 }, { "epoch": 2.931954636424283, "loss": 0.4426005482673645, "step": 8790 }, { "ce_loss": 0.1768053025007248, "epoch": 2.931954636424283, "step": 8790 }, { "distill_loss": 0.11535248160362244, "epoch": 2.931954636424283, "step": 8790 }, { "epoch": 2.931954636424283, "ref_ce_loss": 0.10447856783866882, "step": 8790 }, { "epoch": 2.931954636424283, "loss": 0.7331949472427368, "step": 8790 }, { "ce_loss": 0.2538256347179413, "epoch": 2.931954636424283, "step": 8790 }, { "distill_loss": 0.1367458701133728, "epoch": 2.931954636424283, "step": 8790 }, { "epoch": 2.931954636424283, "ref_ce_loss": 0.15786539018154144, "step": 8790 }, { "epoch": 2.931954636424283, "loss": 0.2962862551212311, "step": 8790 }, { "ce_loss": 0.04074576124548912, "epoch": 2.931954636424283, "step": 8790 }, { "distill_loss": 0.09793291985988617, "epoch": 2.931954636424283, "step": 8790 }, { "epoch": 2.931954636424283, "ref_ce_loss": 0.10114602744579315, "step": 8790 }, { "epoch": 2.931954636424283, "loss": 0.9041996002197266, "step": 8790 }, { "ce_loss": 0.18689587712287903, "epoch": 2.931954636424283, "step": 8790 }, { "distill_loss": 0.1299264132976532, "epoch": 2.931954636424283, "step": 8790 }, { "epoch": 2.931954636424283, "ref_ce_loss": 0.17926687002182007, "step": 8790 }, { "epoch": 2.935290193462308, "loss": 0.5363, "step": 8800 }, { "epoch": 2.935290193462308, "grad_norm": 2.918760299682617, "step": 8800 }, { "epoch": 2.935290193462308, "learning_rate": 0.00021922236127206083, "step": 8800 }, { "epoch": 2.935290193462308, "loss": 0.4478914737701416, "step": 8800 }, { "ce_loss": 0.11208988726139069, "epoch": 2.935290193462308, "step": 8800 }, { "distill_loss": 0.1034972220659256, "epoch": 2.935290193462308, "step": 8800 }, { "epoch": 2.935290193462308, "ref_ce_loss": 0.14677636325359344, "step": 8800 }, { "epoch": 2.935290193462308, "loss": 0.35105398297309875, "step": 8800 }, { "ce_loss": 0.09494055062532425, "epoch": 2.935290193462308, "step": 8800 }, { "distill_loss": 0.11517714709043503, "epoch": 2.935290193462308, "step": 8800 }, { "epoch": 2.935290193462308, "ref_ce_loss": 0.11402986943721771, "step": 8800 }, { "epoch": 2.935290193462308, "loss": 0.523328423500061, "step": 8800 }, { "ce_loss": 0.20816966891288757, "epoch": 2.935290193462308, "step": 8800 }, { "distill_loss": 0.15322241187095642, "epoch": 2.935290193462308, "step": 8800 }, { "epoch": 2.935290193462308, "ref_ce_loss": 0.12490464001893997, "step": 8800 }, { "epoch": 2.935290193462308, "loss": 0.474077433347702, "step": 8800 }, { "ce_loss": 0.19899730384349823, "epoch": 2.935290193462308, "step": 8800 }, { "distill_loss": 0.10921809822320938, "epoch": 2.935290193462308, "step": 8800 }, { "epoch": 2.935290193462308, "ref_ce_loss": 0.12667261064052582, "step": 8800 }, { "epoch": 2.9386257505003335, "loss": 0.5603, "step": 8810 }, { "epoch": 2.9386257505003335, "grad_norm": 3.7591745853424072, "step": 8810 }, { "epoch": 2.9386257505003335, "learning_rate": 0.00021904259597143357, "step": 8810 }, { "epoch": 2.9386257505003335, "loss": 0.28778815269470215, "step": 8810 }, { "ce_loss": 0.07531074434518814, "epoch": 2.9386257505003335, "step": 8810 }, { "distill_loss": 0.07752246409654617, "epoch": 2.9386257505003335, "step": 8810 }, { "epoch": 2.9386257505003335, "ref_ce_loss": 0.13453662395477295, "step": 8810 }, { "epoch": 2.9386257505003335, "loss": 0.38257116079330444, "step": 8810 }, { "ce_loss": 0.1476069539785385, "epoch": 2.9386257505003335, "step": 8810 }, { "distill_loss": 0.11568806320428848, "epoch": 2.9386257505003335, "step": 8810 }, { "epoch": 2.9386257505003335, "ref_ce_loss": 0.08604934066534042, "step": 8810 }, { "epoch": 2.9386257505003335, "loss": 0.5415005683898926, "step": 8810 }, { "ce_loss": 0.09510982036590576, "epoch": 2.9386257505003335, "step": 8810 }, { "distill_loss": 0.10478687286376953, "epoch": 2.9386257505003335, "step": 8810 }, { "epoch": 2.9386257505003335, "ref_ce_loss": 0.10170865058898926, "step": 8810 }, { "epoch": 2.9386257505003335, "loss": 0.3948809504508972, "step": 8810 }, { "ce_loss": 0.0706266388297081, "epoch": 2.9386257505003335, "step": 8810 }, { "distill_loss": 0.09294119477272034, "epoch": 2.9386257505003335, "step": 8810 }, { "epoch": 2.9386257505003335, "ref_ce_loss": 0.0870620384812355, "step": 8810 }, { "epoch": 2.941961307538359, "loss": 0.4867, "step": 8820 }, { "epoch": 2.941961307538359, "grad_norm": 2.9408466815948486, "step": 8820 }, { "epoch": 2.941961307538359, "learning_rate": 0.0002188627047643464, "step": 8820 }, { "epoch": 2.941961307538359, "loss": 0.5317444205284119, "step": 8820 }, { "ce_loss": 0.2094917893409729, "epoch": 2.941961307538359, "step": 8820 }, { "distill_loss": 0.10389654338359833, "epoch": 2.941961307538359, "step": 8820 }, { "epoch": 2.941961307538359, "ref_ce_loss": 0.11777611076831818, "step": 8820 }, { "epoch": 2.941961307538359, "loss": 0.3371651768684387, "step": 8820 }, { "ce_loss": 0.08623742312192917, "epoch": 2.941961307538359, "step": 8820 }, { "distill_loss": 0.1031375601887703, "epoch": 2.941961307538359, "step": 8820 }, { "epoch": 2.941961307538359, "ref_ce_loss": 0.1100323423743248, "step": 8820 }, { "epoch": 2.941961307538359, "loss": 0.20720899105072021, "step": 8820 }, { "ce_loss": 0.043221935629844666, "epoch": 2.941961307538359, "step": 8820 }, { "distill_loss": 0.0784808024764061, "epoch": 2.941961307538359, "step": 8820 }, { "epoch": 2.941961307538359, "ref_ce_loss": 0.0510651059448719, "step": 8820 }, { "epoch": 2.941961307538359, "loss": 0.2733457684516907, "step": 8820 }, { "ce_loss": 0.09705958515405655, "epoch": 2.941961307538359, "step": 8820 }, { "distill_loss": 0.0906776711344719, "epoch": 2.941961307538359, "step": 8820 }, { "epoch": 2.941961307538359, "ref_ce_loss": 0.08549680560827255, "step": 8820 }, { "epoch": 2.945296864576384, "loss": 0.5421, "step": 8830 }, { "epoch": 2.945296864576384, "grad_norm": 2.729051113128662, "step": 8830 }, { "epoch": 2.945296864576384, "learning_rate": 0.00021868268797884977, "step": 8830 }, { "epoch": 2.945296864576384, "loss": 0.6006139516830444, "step": 8830 }, { "ce_loss": 0.16521619260311127, "epoch": 2.945296864576384, "step": 8830 }, { "distill_loss": 0.11474260687828064, "epoch": 2.945296864576384, "step": 8830 }, { "epoch": 2.945296864576384, "ref_ce_loss": 0.11602941900491714, "step": 8830 }, { "epoch": 2.945296864576384, "loss": 0.3933120667934418, "step": 8830 }, { "ce_loss": 0.1256079524755478, "epoch": 2.945296864576384, "step": 8830 }, { "distill_loss": 0.11178599298000336, "epoch": 2.945296864576384, "step": 8830 }, { "epoch": 2.945296864576384, "ref_ce_loss": 0.11281760036945343, "step": 8830 }, { "epoch": 2.945296864576384, "loss": 0.5145058035850525, "step": 8830 }, { "ce_loss": 0.2089313119649887, "epoch": 2.945296864576384, "step": 8830 }, { "distill_loss": 0.12459740787744522, "epoch": 2.945296864576384, "step": 8830 }, { "epoch": 2.945296864576384, "ref_ce_loss": 0.1330752670764923, "step": 8830 }, { "epoch": 2.945296864576384, "loss": 0.4225755035877228, "step": 8830 }, { "ce_loss": 0.15203280746936798, "epoch": 2.945296864576384, "step": 8830 }, { "distill_loss": 0.09559550881385803, "epoch": 2.945296864576384, "step": 8830 }, { "epoch": 2.945296864576384, "ref_ce_loss": 0.1744471788406372, "step": 8830 }, { "epoch": 2.9486324216144095, "loss": 0.4838, "step": 8840 }, { "epoch": 2.9486324216144095, "grad_norm": 3.221513509750366, "step": 8840 }, { "epoch": 2.9486324216144095, "learning_rate": 0.00021850254594322344, "step": 8840 }, { "epoch": 2.9486324216144095, "loss": 0.5516033172607422, "step": 8840 }, { "ce_loss": 0.1504536122083664, "epoch": 2.9486324216144095, "step": 8840 }, { "distill_loss": 0.13644813001155853, "epoch": 2.9486324216144095, "step": 8840 }, { "epoch": 2.9486324216144095, "ref_ce_loss": 0.08675995469093323, "step": 8840 }, { "epoch": 2.9486324216144095, "loss": 0.5576673150062561, "step": 8840 }, { "ce_loss": 0.16298459470272064, "epoch": 2.9486324216144095, "step": 8840 }, { "distill_loss": 0.13553835451602936, "epoch": 2.9486324216144095, "step": 8840 }, { "epoch": 2.9486324216144095, "ref_ce_loss": 0.12538892030715942, "step": 8840 }, { "epoch": 2.9486324216144095, "loss": 0.8474454879760742, "step": 8840 }, { "ce_loss": 0.1506018489599228, "epoch": 2.9486324216144095, "step": 8840 }, { "distill_loss": 0.13082599639892578, "epoch": 2.9486324216144095, "step": 8840 }, { "epoch": 2.9486324216144095, "ref_ce_loss": 0.14312021434307098, "step": 8840 }, { "epoch": 2.9486324216144095, "loss": 0.549639880657196, "step": 8840 }, { "ce_loss": 0.22064092755317688, "epoch": 2.9486324216144095, "step": 8840 }, { "distill_loss": 0.14355742931365967, "epoch": 2.9486324216144095, "step": 8840 }, { "epoch": 2.9486324216144095, "ref_ce_loss": 0.1570969969034195, "step": 8840 }, { "epoch": 2.951967978652435, "loss": 0.5521, "step": 8850 }, { "epoch": 2.951967978652435, "grad_norm": 2.628145217895508, "step": 8850 }, { "epoch": 2.951967978652435, "learning_rate": 0.00021832227898597531, "step": 8850 }, { "epoch": 2.951967978652435, "loss": 0.8192481994628906, "step": 8850 }, { "ce_loss": 0.12512989342212677, "epoch": 2.951967978652435, "step": 8850 }, { "distill_loss": 0.12027587741613388, "epoch": 2.951967978652435, "step": 8850 }, { "epoch": 2.951967978652435, "ref_ce_loss": 0.14873076975345612, "step": 8850 }, { "epoch": 2.951967978652435, "loss": 0.4840548038482666, "step": 8850 }, { "ce_loss": 0.10911354422569275, "epoch": 2.951967978652435, "step": 8850 }, { "distill_loss": 0.1427203267812729, "epoch": 2.951967978652435, "step": 8850 }, { "epoch": 2.951967978652435, "ref_ce_loss": 0.14467257261276245, "step": 8850 }, { "epoch": 2.951967978652435, "loss": 0.45594289898872375, "step": 8850 }, { "ce_loss": 0.12329255044460297, "epoch": 2.951967978652435, "step": 8850 }, { "distill_loss": 0.11281190812587738, "epoch": 2.951967978652435, "step": 8850 }, { "epoch": 2.951967978652435, "ref_ce_loss": 0.14958961308002472, "step": 8850 }, { "epoch": 2.951967978652435, "loss": 0.34237849712371826, "step": 8850 }, { "ce_loss": 0.12874117493629456, "epoch": 2.951967978652435, "step": 8850 }, { "distill_loss": 0.11954338848590851, "epoch": 2.951967978652435, "step": 8850 }, { "epoch": 2.951967978652435, "ref_ce_loss": 0.09373276680707932, "step": 8850 }, { "epoch": 2.9553035356904602, "loss": 0.5088, "step": 8860 }, { "epoch": 2.9553035356904602, "grad_norm": 2.186516523361206, "step": 8860 }, { "epoch": 2.9553035356904602, "learning_rate": 0.00021814188743584127, "step": 8860 }, { "epoch": 2.9553035356904602, "loss": 0.3231074810028076, "step": 8860 }, { "ce_loss": 0.10812241584062576, "epoch": 2.9553035356904602, "step": 8860 }, { "distill_loss": 0.1252068430185318, "epoch": 2.9553035356904602, "step": 8860 }, { "epoch": 2.9553035356904602, "ref_ce_loss": 0.08961108326911926, "step": 8860 }, { "epoch": 2.9553035356904602, "loss": 0.6042312383651733, "step": 8860 }, { "ce_loss": 0.22129030525684357, "epoch": 2.9553035356904602, "step": 8860 }, { "distill_loss": 0.11306517571210861, "epoch": 2.9553035356904602, "step": 8860 }, { "epoch": 2.9553035356904602, "ref_ce_loss": 0.1952359974384308, "step": 8860 }, { "epoch": 2.9553035356904602, "loss": 0.35771751403808594, "step": 8860 }, { "ce_loss": 0.14291593432426453, "epoch": 2.9553035356904602, "step": 8860 }, { "distill_loss": 0.121894471347332, "epoch": 2.9553035356904602, "step": 8860 }, { "epoch": 2.9553035356904602, "ref_ce_loss": 0.09281952679157257, "step": 8860 }, { "epoch": 2.9553035356904602, "loss": 0.5629582405090332, "step": 8860 }, { "ce_loss": 0.13195520639419556, "epoch": 2.9553035356904602, "step": 8860 }, { "distill_loss": 0.1128406897187233, "epoch": 2.9553035356904602, "step": 8860 }, { "epoch": 2.9553035356904602, "ref_ce_loss": 0.11142636090517044, "step": 8860 }, { "epoch": 2.9586390927284856, "loss": 0.5349, "step": 8870 }, { "epoch": 2.9586390927284856, "grad_norm": 3.5918776988983154, "step": 8870 }, { "epoch": 2.9586390927284856, "learning_rate": 0.00021796137162178434, "step": 8870 }, { "epoch": 2.9586390927284856, "loss": 0.9226340055465698, "step": 8870 }, { "ce_loss": 0.1620820164680481, "epoch": 2.9586390927284856, "step": 8870 }, { "distill_loss": 0.11741302907466888, "epoch": 2.9586390927284856, "step": 8870 }, { "epoch": 2.9586390927284856, "ref_ce_loss": 0.13054203987121582, "step": 8870 }, { "epoch": 2.9586390927284856, "loss": 0.5688709020614624, "step": 8870 }, { "ce_loss": 0.17032210528850555, "epoch": 2.9586390927284856, "step": 8870 }, { "distill_loss": 0.09201649576425552, "epoch": 2.9586390927284856, "step": 8870 }, { "epoch": 2.9586390927284856, "ref_ce_loss": 0.1125497967004776, "step": 8870 }, { "epoch": 2.9586390927284856, "loss": 0.833220362663269, "step": 8870 }, { "ce_loss": 0.1887754648923874, "epoch": 2.9586390927284856, "step": 8870 }, { "distill_loss": 0.12600192427635193, "epoch": 2.9586390927284856, "step": 8870 }, { "epoch": 2.9586390927284856, "ref_ce_loss": 0.11542544513940811, "step": 8870 }, { "epoch": 2.9586390927284856, "loss": 0.4784468114376068, "step": 8870 }, { "ce_loss": 0.18221917748451233, "epoch": 2.9586390927284856, "step": 8870 }, { "distill_loss": 0.12532924115657806, "epoch": 2.9586390927284856, "step": 8870 }, { "epoch": 2.9586390927284856, "ref_ce_loss": 0.12261589616537094, "step": 8870 }, { "epoch": 2.961974649766511, "loss": 0.5122, "step": 8880 }, { "epoch": 2.961974649766511, "grad_norm": 4.213609218597412, "step": 8880 }, { "epoch": 2.961974649766511, "learning_rate": 0.0002177807318729941, "step": 8880 }, { "epoch": 2.961974649766511, "loss": 0.5542953014373779, "step": 8880 }, { "ce_loss": 0.17619939148426056, "epoch": 2.961974649766511, "step": 8880 }, { "distill_loss": 0.1372058242559433, "epoch": 2.961974649766511, "step": 8880 }, { "epoch": 2.961974649766511, "ref_ce_loss": 0.12143444269895554, "step": 8880 }, { "epoch": 2.961974649766511, "loss": 0.3381030857563019, "step": 8880 }, { "ce_loss": 0.08107311278581619, "epoch": 2.961974649766511, "step": 8880 }, { "distill_loss": 0.08807455003261566, "epoch": 2.961974649766511, "step": 8880 }, { "epoch": 2.961974649766511, "ref_ce_loss": 0.1255367249250412, "step": 8880 }, { "epoch": 2.961974649766511, "loss": 0.5628779530525208, "step": 8880 }, { "ce_loss": 0.11000987887382507, "epoch": 2.961974649766511, "step": 8880 }, { "distill_loss": 0.1430390328168869, "epoch": 2.961974649766511, "step": 8880 }, { "epoch": 2.961974649766511, "ref_ce_loss": 0.12550552189350128, "step": 8880 }, { "epoch": 2.961974649766511, "loss": 0.5493335723876953, "step": 8880 }, { "ce_loss": 0.16359969973564148, "epoch": 2.961974649766511, "step": 8880 }, { "distill_loss": 0.10959623754024506, "epoch": 2.961974649766511, "step": 8880 }, { "epoch": 2.961974649766511, "ref_ce_loss": 0.1296955943107605, "step": 8880 }, { "epoch": 2.9653102068045363, "loss": 0.5355, "step": 8890 }, { "epoch": 2.9653102068045363, "grad_norm": 4.092010021209717, "step": 8890 }, { "epoch": 2.9653102068045363, "learning_rate": 0.0002175999685188863, "step": 8890 }, { "epoch": 2.9653102068045363, "loss": 0.5732136368751526, "step": 8890 }, { "ce_loss": 0.14554201066493988, "epoch": 2.9653102068045363, "step": 8890 }, { "distill_loss": 0.1008024662733078, "epoch": 2.9653102068045363, "step": 8890 }, { "epoch": 2.9653102068045363, "ref_ce_loss": 0.08674776554107666, "step": 8890 }, { "epoch": 2.9653102068045363, "loss": 0.8132926225662231, "step": 8890 }, { "ce_loss": 0.1978740245103836, "epoch": 2.9653102068045363, "step": 8890 }, { "distill_loss": 0.11106804758310318, "epoch": 2.9653102068045363, "step": 8890 }, { "epoch": 2.9653102068045363, "ref_ce_loss": 0.15045933425426483, "step": 8890 }, { "epoch": 2.9653102068045363, "loss": 0.6142012476921082, "step": 8890 }, { "ce_loss": 0.1520072966814041, "epoch": 2.9653102068045363, "step": 8890 }, { "distill_loss": 0.11832404136657715, "epoch": 2.9653102068045363, "step": 8890 }, { "epoch": 2.9653102068045363, "ref_ce_loss": 0.06626222282648087, "step": 8890 }, { "epoch": 2.9653102068045363, "loss": 0.4120936691761017, "step": 8890 }, { "ce_loss": 0.16986437141895294, "epoch": 2.9653102068045363, "step": 8890 }, { "distill_loss": 0.11584517359733582, "epoch": 2.9653102068045363, "step": 8890 }, { "epoch": 2.9653102068045363, "ref_ce_loss": 0.09480530768632889, "step": 8890 }, { "epoch": 2.9686457638425616, "loss": 0.5003, "step": 8900 }, { "epoch": 2.9686457638425616, "grad_norm": 2.9350826740264893, "step": 8900 }, { "epoch": 2.9686457638425616, "learning_rate": 0.00021741908188910192, "step": 8900 }, { "epoch": 2.9686457638425616, "loss": 0.6007994413375854, "step": 8900 }, { "ce_loss": 0.1910446435213089, "epoch": 2.9686457638425616, "step": 8900 }, { "distill_loss": 0.13157133758068085, "epoch": 2.9686457638425616, "step": 8900 }, { "epoch": 2.9686457638425616, "ref_ce_loss": 0.10696756839752197, "step": 8900 }, { "epoch": 2.9686457638425616, "loss": 0.591404139995575, "step": 8900 }, { "ce_loss": 0.1629049926996231, "epoch": 2.9686457638425616, "step": 8900 }, { "distill_loss": 0.14794319868087769, "epoch": 2.9686457638425616, "step": 8900 }, { "epoch": 2.9686457638425616, "ref_ce_loss": 0.1411333680152893, "step": 8900 }, { "epoch": 2.9686457638425616, "loss": 0.4213687479496002, "step": 8900 }, { "ce_loss": 0.1346403956413269, "epoch": 2.9686457638425616, "step": 8900 }, { "distill_loss": 0.11285700649023056, "epoch": 2.9686457638425616, "step": 8900 }, { "epoch": 2.9686457638425616, "ref_ce_loss": 0.1737578809261322, "step": 8900 }, { "epoch": 2.9686457638425616, "loss": 0.5505105257034302, "step": 8900 }, { "ce_loss": 0.15355099737644196, "epoch": 2.9686457638425616, "step": 8900 }, { "distill_loss": 0.096382737159729, "epoch": 2.9686457638425616, "step": 8900 }, { "epoch": 2.9686457638425616, "ref_ce_loss": 0.12684959173202515, "step": 8900 }, { "epoch": 2.971981320880587, "loss": 0.5263, "step": 8910 }, { "epoch": 2.971981320880587, "grad_norm": 3.6547932624816895, "step": 8910 }, { "epoch": 2.971981320880587, "learning_rate": 0.00021723807231350685, "step": 8910 }, { "epoch": 2.971981320880587, "loss": 0.5092654824256897, "step": 8910 }, { "ce_loss": 0.1908111721277237, "epoch": 2.971981320880587, "step": 8910 }, { "distill_loss": 0.16151322424411774, "epoch": 2.971981320880587, "step": 8910 }, { "epoch": 2.971981320880587, "ref_ce_loss": 0.11491622030735016, "step": 8910 }, { "epoch": 2.971981320880587, "loss": 0.4845525026321411, "step": 8910 }, { "ce_loss": 0.17053207755088806, "epoch": 2.971981320880587, "step": 8910 }, { "distill_loss": 0.11018142849206924, "epoch": 2.971981320880587, "step": 8910 }, { "epoch": 2.971981320880587, "ref_ce_loss": 0.11159297078847885, "step": 8910 }, { "epoch": 2.971981320880587, "loss": 0.4210514426231384, "step": 8910 }, { "ce_loss": 0.09162740409374237, "epoch": 2.971981320880587, "step": 8910 }, { "distill_loss": 0.07755187153816223, "epoch": 2.971981320880587, "step": 8910 }, { "epoch": 2.971981320880587, "ref_ce_loss": 0.07986513525247574, "step": 8910 }, { "epoch": 2.971981320880587, "loss": 0.6294732689857483, "step": 8910 }, { "ce_loss": 0.18695926666259766, "epoch": 2.971981320880587, "step": 8910 }, { "distill_loss": 0.10205936431884766, "epoch": 2.971981320880587, "step": 8910 }, { "epoch": 2.971981320880587, "ref_ce_loss": 0.14748182892799377, "step": 8910 }, { "epoch": 2.9753168779186123, "loss": 0.4895, "step": 8920 }, { "epoch": 2.9753168779186123, "grad_norm": 3.938183069229126, "step": 8920 }, { "epoch": 2.9753168779186123, "learning_rate": 0.00021705694012219106, "step": 8920 }, { "epoch": 2.9753168779186123, "loss": 0.31843438744544983, "step": 8920 }, { "ce_loss": 0.1231832280755043, "epoch": 2.9753168779186123, "step": 8920 }, { "distill_loss": 0.10385991632938385, "epoch": 2.9753168779186123, "step": 8920 }, { "epoch": 2.9753168779186123, "ref_ce_loss": 0.09056884795427322, "step": 8920 }, { "epoch": 2.9753168779186123, "loss": 0.38025349378585815, "step": 8920 }, { "ce_loss": 0.1042127013206482, "epoch": 2.9753168779186123, "step": 8920 }, { "distill_loss": 0.09548382461071014, "epoch": 2.9753168779186123, "step": 8920 }, { "epoch": 2.9753168779186123, "ref_ce_loss": 0.073456771671772, "step": 8920 }, { "epoch": 2.9753168779186123, "loss": 0.5087049007415771, "step": 8920 }, { "ce_loss": 0.21662679314613342, "epoch": 2.9753168779186123, "step": 8920 }, { "distill_loss": 0.1301024705171585, "epoch": 2.9753168779186123, "step": 8920 }, { "epoch": 2.9753168779186123, "ref_ce_loss": 0.12792813777923584, "step": 8920 }, { "epoch": 2.9753168779186123, "loss": 0.4105304777622223, "step": 8920 }, { "ce_loss": 0.08565964549779892, "epoch": 2.9753168779186123, "step": 8920 }, { "distill_loss": 0.11993283033370972, "epoch": 2.9753168779186123, "step": 8920 }, { "epoch": 2.9753168779186123, "ref_ce_loss": 0.13341321051120758, "step": 8920 }, { "epoch": 2.9786524349566377, "loss": 0.559, "step": 8930 }, { "epoch": 2.9786524349566377, "grad_norm": 3.1146602630615234, "step": 8930 }, { "epoch": 2.9786524349566377, "learning_rate": 0.00021687568564546838, "step": 8930 }, { "epoch": 2.9786524349566377, "loss": 0.3035496771335602, "step": 8930 }, { "ce_loss": 0.09508823603391647, "epoch": 2.9786524349566377, "step": 8930 }, { "distill_loss": 0.09790746867656708, "epoch": 2.9786524349566377, "step": 8930 }, { "epoch": 2.9786524349566377, "ref_ce_loss": 0.08749634027481079, "step": 8930 }, { "epoch": 2.9786524349566377, "loss": 0.44061118364334106, "step": 8930 }, { "ce_loss": 0.13150790333747864, "epoch": 2.9786524349566377, "step": 8930 }, { "distill_loss": 0.14861956238746643, "epoch": 2.9786524349566377, "step": 8930 }, { "epoch": 2.9786524349566377, "ref_ce_loss": 0.12680856883525848, "step": 8930 }, { "epoch": 2.9786524349566377, "loss": 0.6023802757263184, "step": 8930 }, { "ce_loss": 0.18394355475902557, "epoch": 2.9786524349566377, "step": 8930 }, { "distill_loss": 0.12831705808639526, "epoch": 2.9786524349566377, "step": 8930 }, { "epoch": 2.9786524349566377, "ref_ce_loss": 0.11957930773496628, "step": 8930 }, { "epoch": 2.9786524349566377, "loss": 0.5828354954719543, "step": 8930 }, { "ce_loss": 0.21842427551746368, "epoch": 2.9786524349566377, "step": 8930 }, { "distill_loss": 0.1230367124080658, "epoch": 2.9786524349566377, "step": 8930 }, { "epoch": 2.9786524349566377, "ref_ce_loss": 0.1871989518404007, "step": 8930 }, { "epoch": 2.981987991994663, "loss": 0.4852, "step": 8940 }, { "epoch": 2.981987991994663, "grad_norm": 2.2136104106903076, "step": 8940 }, { "epoch": 2.981987991994663, "learning_rate": 0.00021669430921387534, "step": 8940 }, { "epoch": 2.981987991994663, "loss": 0.493175208568573, "step": 8940 }, { "ce_loss": 0.2051461786031723, "epoch": 2.981987991994663, "step": 8940 }, { "distill_loss": 0.13799598813056946, "epoch": 2.981987991994663, "step": 8940 }, { "epoch": 2.981987991994663, "ref_ce_loss": 0.14984993636608124, "step": 8940 }, { "epoch": 2.981987991994663, "loss": 0.5004106760025024, "step": 8940 }, { "ce_loss": 0.1794445663690567, "epoch": 2.981987991994663, "step": 8940 }, { "distill_loss": 0.118271604180336, "epoch": 2.981987991994663, "step": 8940 }, { "epoch": 2.981987991994663, "ref_ce_loss": 0.1278836578130722, "step": 8940 }, { "epoch": 2.981987991994663, "loss": 0.2564985454082489, "step": 8940 }, { "ce_loss": 0.10331545770168304, "epoch": 2.981987991994663, "step": 8940 }, { "distill_loss": 0.09063772112131119, "epoch": 2.981987991994663, "step": 8940 }, { "epoch": 2.981987991994663, "ref_ce_loss": 0.045049406588077545, "step": 8940 }, { "epoch": 2.981987991994663, "loss": 0.5444272756576538, "step": 8940 }, { "ce_loss": 0.13070593774318695, "epoch": 2.981987991994663, "step": 8940 }, { "distill_loss": 0.1038922667503357, "epoch": 2.981987991994663, "step": 8940 }, { "epoch": 2.981987991994663, "ref_ce_loss": 0.07918254286050797, "step": 8940 }, { "epoch": 2.9853235490326884, "loss": 0.516, "step": 8950 }, { "epoch": 2.9853235490326884, "grad_norm": 2.4029927253723145, "step": 8950 }, { "epoch": 2.9853235490326884, "learning_rate": 0.00021651281115817102, "step": 8950 }, { "epoch": 2.9853235490326884, "loss": 0.5528447031974792, "step": 8950 }, { "ce_loss": 0.24434742331504822, "epoch": 2.9853235490326884, "step": 8950 }, { "distill_loss": 0.13547056913375854, "epoch": 2.9853235490326884, "step": 8950 }, { "epoch": 2.9853235490326884, "ref_ce_loss": 0.11006741225719452, "step": 8950 }, { "epoch": 2.9853235490326884, "loss": 0.3580153286457062, "step": 8950 }, { "ce_loss": 0.11471273005008698, "epoch": 2.9853235490326884, "step": 8950 }, { "distill_loss": 0.11067156493663788, "epoch": 2.9853235490326884, "step": 8950 }, { "epoch": 2.9853235490326884, "ref_ce_loss": 0.1322018951177597, "step": 8950 }, { "epoch": 2.9853235490326884, "loss": 0.2604046165943146, "step": 8950 }, { "ce_loss": 0.10595227032899857, "epoch": 2.9853235490326884, "step": 8950 }, { "distill_loss": 0.09654046595096588, "epoch": 2.9853235490326884, "step": 8950 }, { "epoch": 2.9853235490326884, "ref_ce_loss": 0.05750226601958275, "step": 8950 }, { "epoch": 2.9853235490326884, "loss": 0.8750475645065308, "step": 8950 }, { "ce_loss": 0.13068419694900513, "epoch": 2.9853235490326884, "step": 8950 }, { "distill_loss": 0.11134978383779526, "epoch": 2.9853235490326884, "step": 8950 }, { "epoch": 2.9853235490326884, "ref_ce_loss": 0.10884664952754974, "step": 8950 }, { "epoch": 2.9886591060707137, "loss": 0.5772, "step": 8960 }, { "epoch": 2.9886591060707137, "grad_norm": 2.57700514793396, "step": 8960 }, { "epoch": 2.9886591060707137, "learning_rate": 0.00021633119180933634, "step": 8960 }, { "epoch": 2.9886591060707137, "loss": 0.354244202375412, "step": 8960 }, { "ce_loss": 0.09961654990911484, "epoch": 2.9886591060707137, "step": 8960 }, { "distill_loss": 0.11927730590105057, "epoch": 2.9886591060707137, "step": 8960 }, { "epoch": 2.9886591060707137, "ref_ce_loss": 0.09512662887573242, "step": 8960 }, { "epoch": 2.9886591060707137, "loss": 0.4154999852180481, "step": 8960 }, { "ce_loss": 0.1644178330898285, "epoch": 2.9886591060707137, "step": 8960 }, { "distill_loss": 0.12879574298858643, "epoch": 2.9886591060707137, "step": 8960 }, { "epoch": 2.9886591060707137, "ref_ce_loss": 0.12126053869724274, "step": 8960 }, { "epoch": 2.9886591060707137, "loss": 0.733989417552948, "step": 8960 }, { "ce_loss": 0.19184887409210205, "epoch": 2.9886591060707137, "step": 8960 }, { "distill_loss": 0.12150511890649796, "epoch": 2.9886591060707137, "step": 8960 }, { "epoch": 2.9886591060707137, "ref_ce_loss": 0.17909909784793854, "step": 8960 }, { "epoch": 2.9886591060707137, "loss": 0.7652954459190369, "step": 8960 }, { "ce_loss": 0.1274799257516861, "epoch": 2.9886591060707137, "step": 8960 }, { "distill_loss": 0.10142847150564194, "epoch": 2.9886591060707137, "step": 8960 }, { "epoch": 2.9886591060707137, "ref_ce_loss": 0.16671831905841827, "step": 8960 }, { "epoch": 2.991994663108739, "loss": 0.4935, "step": 8970 }, { "epoch": 2.991994663108739, "grad_norm": 2.5278472900390625, "step": 8970 }, { "epoch": 2.991994663108739, "learning_rate": 0.00021614945149857334, "step": 8970 }, { "epoch": 2.991994663108739, "loss": 0.3677584230899811, "step": 8970 }, { "ce_loss": 0.14266683161258698, "epoch": 2.991994663108739, "step": 8970 }, { "distill_loss": 0.14370639622211456, "epoch": 2.991994663108739, "step": 8970 }, { "epoch": 2.991994663108739, "ref_ce_loss": 0.08122113347053528, "step": 8970 }, { "epoch": 2.991994663108739, "loss": 0.45084571838378906, "step": 8970 }, { "ce_loss": 0.0990084707736969, "epoch": 2.991994663108739, "step": 8970 }, { "distill_loss": 0.11535260081291199, "epoch": 2.991994663108739, "step": 8970 }, { "epoch": 2.991994663108739, "ref_ce_loss": 0.10417357832193375, "step": 8970 }, { "epoch": 2.991994663108739, "loss": 0.38456830382347107, "step": 8970 }, { "ce_loss": 0.1252870112657547, "epoch": 2.991994663108739, "step": 8970 }, { "distill_loss": 0.11813102662563324, "epoch": 2.991994663108739, "step": 8970 }, { "epoch": 2.991994663108739, "ref_ce_loss": 0.1055547371506691, "step": 8970 }, { "epoch": 2.991994663108739, "loss": 0.47269147634506226, "step": 8970 }, { "ce_loss": 0.22794851660728455, "epoch": 2.991994663108739, "step": 8970 }, { "distill_loss": 0.11512558162212372, "epoch": 2.991994663108739, "step": 8970 }, { "epoch": 2.991994663108739, "ref_ce_loss": 0.12925688922405243, "step": 8970 }, { "epoch": 2.9953302201467644, "loss": 0.5518, "step": 8980 }, { "epoch": 2.9953302201467644, "grad_norm": 3.004040002822876, "step": 8980 }, { "epoch": 2.9953302201467644, "learning_rate": 0.00021596759055730465, "step": 8980 }, { "epoch": 2.9953302201467644, "loss": 0.45926421880722046, "step": 8980 }, { "ce_loss": 0.20763225853443146, "epoch": 2.9953302201467644, "step": 8980 }, { "distill_loss": 0.15198659896850586, "epoch": 2.9953302201467644, "step": 8980 }, { "epoch": 2.9953302201467644, "ref_ce_loss": 0.09958194941282272, "step": 8980 }, { "epoch": 2.9953302201467644, "loss": 0.3914671242237091, "step": 8980 }, { "ce_loss": 0.1506868302822113, "epoch": 2.9953302201467644, "step": 8980 }, { "distill_loss": 0.11587530374526978, "epoch": 2.9953302201467644, "step": 8980 }, { "epoch": 2.9953302201467644, "ref_ce_loss": 0.07094470411539078, "step": 8980 }, { "epoch": 2.9953302201467644, "loss": 0.4542044699192047, "step": 8980 }, { "ce_loss": 0.16601303219795227, "epoch": 2.9953302201467644, "step": 8980 }, { "distill_loss": 0.1144561842083931, "epoch": 2.9953302201467644, "step": 8980 }, { "epoch": 2.9953302201467644, "ref_ce_loss": 0.09616707265377045, "step": 8980 }, { "epoch": 2.9953302201467644, "loss": 0.8602047562599182, "step": 8980 }, { "ce_loss": 0.18182548880577087, "epoch": 2.9953302201467644, "step": 8980 }, { "distill_loss": 0.13099931180477142, "epoch": 2.9953302201467644, "step": 8980 }, { "epoch": 2.9953302201467644, "ref_ce_loss": 0.15818721055984497, "step": 8980 }, { "epoch": 2.9986657771847898, "loss": 0.5197, "step": 8990 }, { "epoch": 2.9986657771847898, "grad_norm": 3.4914965629577637, "step": 8990 }, { "epoch": 2.9986657771847898, "learning_rate": 0.0002157856093171728, "step": 8990 }, { "epoch": 2.9986657771847898, "loss": 0.8473386764526367, "step": 8990 }, { "ce_loss": 0.1329667866230011, "epoch": 2.9986657771847898, "step": 8990 }, { "distill_loss": 0.10136765241622925, "epoch": 2.9986657771847898, "step": 8990 }, { "epoch": 2.9986657771847898, "ref_ce_loss": 0.12561270594596863, "step": 8990 }, { "epoch": 2.9986657771847898, "loss": 0.3459990918636322, "step": 8990 }, { "ce_loss": 0.09148216247558594, "epoch": 2.9986657771847898, "step": 8990 }, { "distill_loss": 0.08927734196186066, "epoch": 2.9986657771847898, "step": 8990 }, { "epoch": 2.9986657771847898, "ref_ce_loss": 0.16515298187732697, "step": 8990 }, { "epoch": 2.9986657771847898, "loss": 0.2844806909561157, "step": 8990 }, { "ce_loss": 0.10032972693443298, "epoch": 2.9986657771847898, "step": 8990 }, { "distill_loss": 0.12092792987823486, "epoch": 2.9986657771847898, "step": 8990 }, { "epoch": 2.9986657771847898, "ref_ce_loss": 0.063165083527565, "step": 8990 }, { "epoch": 2.9986657771847898, "loss": 0.5512446761131287, "step": 8990 }, { "ce_loss": 0.17064183950424194, "epoch": 2.9986657771847898, "step": 8990 }, { "distill_loss": 0.10145299881696701, "epoch": 2.9986657771847898, "step": 8990 }, { "epoch": 2.9986657771847898, "ref_ce_loss": 0.0986415445804596, "step": 8990 }, { "epoch": 3.002001334222815, "loss": 0.4782, "step": 9000 }, { "epoch": 3.002001334222815, "grad_norm": 2.815939426422119, "step": 9000 }, { "epoch": 3.002001334222815, "learning_rate": 0.0002156035081100399, "step": 9000 }, { "epoch": 3.002001334222815, "loss": 0.5625020861625671, "step": 9000 }, { "ce_loss": 0.15781192481517792, "epoch": 3.002001334222815, "step": 9000 }, { "distill_loss": 0.15901319682598114, "epoch": 3.002001334222815, "step": 9000 }, { "epoch": 3.002001334222815, "ref_ce_loss": 0.14345714449882507, "step": 9000 }, { "epoch": 3.002001334222815, "loss": 0.30363473296165466, "step": 9000 }, { "ce_loss": 0.12738262116909027, "epoch": 3.002001334222815, "step": 9000 }, { "distill_loss": 0.11215686798095703, "epoch": 3.002001334222815, "step": 9000 }, { "epoch": 3.002001334222815, "ref_ce_loss": 0.06402139365673065, "step": 9000 }, { "epoch": 3.002001334222815, "loss": 0.4059531092643738, "step": 9000 }, { "ce_loss": 0.11209169775247574, "epoch": 3.002001334222815, "step": 9000 }, { "distill_loss": 0.13779860734939575, "epoch": 3.002001334222815, "step": 9000 }, { "epoch": 3.002001334222815, "ref_ce_loss": 0.11790633201599121, "step": 9000 }, { "epoch": 3.002001334222815, "loss": 0.5732775926589966, "step": 9000 }, { "ce_loss": 0.2084646373987198, "epoch": 3.002001334222815, "step": 9000 }, { "distill_loss": 0.1004604697227478, "epoch": 3.002001334222815, "step": 9000 }, { "epoch": 3.002001334222815, "ref_ce_loss": 0.11193086206912994, "step": 9000 }, { "epoch": 3.0053368912608405, "loss": 0.4867, "step": 9010 }, { "epoch": 3.0053368912608405, "grad_norm": 2.182608127593994, "step": 9010 }, { "epoch": 3.0053368912608405, "learning_rate": 0.0002154212872679867, "step": 9010 }, { "epoch": 3.0053368912608405, "loss": 0.7315200567245483, "step": 9010 }, { "ce_loss": 0.2323620617389679, "epoch": 3.0053368912608405, "step": 9010 }, { "distill_loss": 0.1421075165271759, "epoch": 3.0053368912608405, "step": 9010 }, { "epoch": 3.0053368912608405, "ref_ce_loss": 0.12837104499340057, "step": 9010 }, { "epoch": 3.0053368912608405, "loss": 0.410982608795166, "step": 9010 }, { "ce_loss": 0.19520390033721924, "epoch": 3.0053368912608405, "step": 9010 }, { "distill_loss": 0.12332181632518768, "epoch": 3.0053368912608405, "step": 9010 }, { "epoch": 3.0053368912608405, "ref_ce_loss": 0.09218749403953552, "step": 9010 }, { "epoch": 3.0053368912608405, "loss": 0.4925827383995056, "step": 9010 }, { "ce_loss": 0.2317257672548294, "epoch": 3.0053368912608405, "step": 9010 }, { "distill_loss": 0.15137000381946564, "epoch": 3.0053368912608405, "step": 9010 }, { "epoch": 3.0053368912608405, "ref_ce_loss": 0.10924065113067627, "step": 9010 }, { "epoch": 3.0053368912608405, "loss": 0.4396410584449768, "step": 9010 }, { "ce_loss": 0.16249550879001617, "epoch": 3.0053368912608405, "step": 9010 }, { "distill_loss": 0.14607492089271545, "epoch": 3.0053368912608405, "step": 9010 }, { "epoch": 3.0053368912608405, "ref_ce_loss": 0.09223807603120804, "step": 9010 }, { "epoch": 3.008672448298866, "loss": 0.5155, "step": 9020 }, { "epoch": 3.008672448298866, "grad_norm": 1.9353896379470825, "step": 9020 }, { "epoch": 3.008672448298866, "learning_rate": 0.00021523894712331215, "step": 9020 }, { "epoch": 3.008672448298866, "loss": 0.3911009430885315, "step": 9020 }, { "ce_loss": 0.0835895836353302, "epoch": 3.008672448298866, "step": 9020 }, { "distill_loss": 0.16284173727035522, "epoch": 3.008672448298866, "step": 9020 }, { "epoch": 3.008672448298866, "ref_ce_loss": 0.11544522643089294, "step": 9020 }, { "epoch": 3.008672448298866, "loss": 0.4121594727039337, "step": 9020 }, { "ce_loss": 0.10628797113895416, "epoch": 3.008672448298866, "step": 9020 }, { "distill_loss": 0.0858006551861763, "epoch": 3.008672448298866, "step": 9020 }, { "epoch": 3.008672448298866, "ref_ce_loss": 0.14815564453601837, "step": 9020 }, { "epoch": 3.008672448298866, "loss": 0.4245898723602295, "step": 9020 }, { "ce_loss": 0.10541052371263504, "epoch": 3.008672448298866, "step": 9020 }, { "distill_loss": 0.15007489919662476, "epoch": 3.008672448298866, "step": 9020 }, { "epoch": 3.008672448298866, "ref_ce_loss": 0.1281569004058838, "step": 9020 }, { "epoch": 3.008672448298866, "loss": 0.394951194524765, "step": 9020 }, { "ce_loss": 0.09292899072170258, "epoch": 3.008672448298866, "step": 9020 }, { "distill_loss": 0.1514538824558258, "epoch": 3.008672448298866, "step": 9020 }, { "epoch": 3.008672448298866, "ref_ce_loss": 0.11690583825111389, "step": 9020 }, { "epoch": 3.012008005336891, "loss": 0.4817, "step": 9030 }, { "epoch": 3.012008005336891, "grad_norm": 3.1654815673828125, "step": 9030 }, { "epoch": 3.012008005336891, "learning_rate": 0.00021505648800853263, "step": 9030 }, { "epoch": 3.012008005336891, "loss": 0.32135191559791565, "step": 9030 }, { "ce_loss": 0.10678337514400482, "epoch": 3.012008005336891, "step": 9030 }, { "distill_loss": 0.0853194147348404, "epoch": 3.012008005336891, "step": 9030 }, { "epoch": 3.012008005336891, "ref_ce_loss": 0.066754050552845, "step": 9030 }, { "epoch": 3.012008005336891, "loss": 0.4086533188819885, "step": 9030 }, { "ce_loss": 0.08803881704807281, "epoch": 3.012008005336891, "step": 9030 }, { "distill_loss": 0.1329783946275711, "epoch": 3.012008005336891, "step": 9030 }, { "epoch": 3.012008005336891, "ref_ce_loss": 0.12893329560756683, "step": 9030 }, { "epoch": 3.012008005336891, "loss": 0.6685104370117188, "step": 9030 }, { "ce_loss": 0.15491558611392975, "epoch": 3.012008005336891, "step": 9030 }, { "distill_loss": 0.28164738416671753, "epoch": 3.012008005336891, "step": 9030 }, { "epoch": 3.012008005336891, "ref_ce_loss": 0.11756544560194016, "step": 9030 }, { "epoch": 3.012008005336891, "loss": 0.555343508720398, "step": 9030 }, { "ce_loss": 0.16929669678211212, "epoch": 3.012008005336891, "step": 9030 }, { "distill_loss": 0.13908150792121887, "epoch": 3.012008005336891, "step": 9030 }, { "epoch": 3.012008005336891, "ref_ce_loss": 0.1195400282740593, "step": 9030 }, { "epoch": 3.0153435623749165, "loss": 0.5041, "step": 9040 }, { "epoch": 3.0153435623749165, "grad_norm": 1.995711088180542, "step": 9040 }, { "epoch": 3.0153435623749165, "learning_rate": 0.00021487391025638172, "step": 9040 }, { "epoch": 3.0153435623749165, "loss": 0.3834221661090851, "step": 9040 }, { "ce_loss": 0.17786496877670288, "epoch": 3.0153435623749165, "step": 9040 }, { "distill_loss": 0.11993467062711716, "epoch": 3.0153435623749165, "step": 9040 }, { "epoch": 3.0153435623749165, "ref_ce_loss": 0.08478295803070068, "step": 9040 }, { "epoch": 3.0153435623749165, "loss": 0.4656774699687958, "step": 9040 }, { "ce_loss": 0.1686105728149414, "epoch": 3.0153435623749165, "step": 9040 }, { "distill_loss": 0.15664944052696228, "epoch": 3.0153435623749165, "step": 9040 }, { "epoch": 3.0153435623749165, "ref_ce_loss": 0.09069463610649109, "step": 9040 }, { "epoch": 3.0153435623749165, "loss": 0.28203755617141724, "step": 9040 }, { "ce_loss": 0.054809946566820145, "epoch": 3.0153435623749165, "step": 9040 }, { "distill_loss": 0.1106935366988182, "epoch": 3.0153435623749165, "step": 9040 }, { "epoch": 3.0153435623749165, "ref_ce_loss": 0.07380043715238571, "step": 9040 }, { "epoch": 3.0153435623749165, "loss": 0.6601328253746033, "step": 9040 }, { "ce_loss": 0.18228961527347565, "epoch": 3.0153435623749165, "step": 9040 }, { "distill_loss": 0.14953771233558655, "epoch": 3.0153435623749165, "step": 9040 }, { "epoch": 3.0153435623749165, "ref_ce_loss": 0.11164674162864685, "step": 9040 }, { "epoch": 3.018679119412942, "loss": 0.4816, "step": 9050 }, { "epoch": 3.018679119412942, "grad_norm": 3.625382423400879, "step": 9050 }, { "epoch": 3.018679119412942, "learning_rate": 0.00021469121419980916, "step": 9050 }, { "epoch": 3.018679119412942, "loss": 0.4787139296531677, "step": 9050 }, { "ce_loss": 0.20587719976902008, "epoch": 3.018679119412942, "step": 9050 }, { "distill_loss": 0.16615983843803406, "epoch": 3.018679119412942, "step": 9050 }, { "epoch": 3.018679119412942, "ref_ce_loss": 0.10627803951501846, "step": 9050 }, { "epoch": 3.018679119412942, "loss": 0.5185806751251221, "step": 9050 }, { "ce_loss": 0.11246607452630997, "epoch": 3.018679119412942, "step": 9050 }, { "distill_loss": 0.1460881382226944, "epoch": 3.018679119412942, "step": 9050 }, { "epoch": 3.018679119412942, "ref_ce_loss": 0.08953309059143066, "step": 9050 }, { "epoch": 3.018679119412942, "loss": 0.986879289150238, "step": 9050 }, { "ce_loss": 0.14453230798244476, "epoch": 3.018679119412942, "step": 9050 }, { "distill_loss": 0.1707886904478073, "epoch": 3.018679119412942, "step": 9050 }, { "epoch": 3.018679119412942, "ref_ce_loss": 0.12012699991464615, "step": 9050 }, { "epoch": 3.018679119412942, "loss": 0.46279260516166687, "step": 9050 }, { "ce_loss": 0.14909015595912933, "epoch": 3.018679119412942, "step": 9050 }, { "distill_loss": 0.17477688193321228, "epoch": 3.018679119412942, "step": 9050 }, { "epoch": 3.018679119412942, "ref_ce_loss": 0.10666271299123764, "step": 9050 }, { "epoch": 3.022014676450967, "loss": 0.5947, "step": 9060 }, { "epoch": 3.022014676450967, "grad_norm": 3.0756499767303467, "step": 9060 }, { "epoch": 3.022014676450967, "learning_rate": 0.00021450840017198049, "step": 9060 }, { "epoch": 3.022014676450967, "loss": 0.5729109644889832, "step": 9060 }, { "ce_loss": 0.23215997219085693, "epoch": 3.022014676450967, "step": 9060 }, { "distill_loss": 0.18815375864505768, "epoch": 3.022014676450967, "step": 9060 }, { "epoch": 3.022014676450967, "ref_ce_loss": 0.11127826571464539, "step": 9060 }, { "epoch": 3.022014676450967, "loss": 0.7342166900634766, "step": 9060 }, { "ce_loss": 0.12690752744674683, "epoch": 3.022014676450967, "step": 9060 }, { "distill_loss": 0.18578952550888062, "epoch": 3.022014676450967, "step": 9060 }, { "epoch": 3.022014676450967, "ref_ce_loss": 0.07483983039855957, "step": 9060 }, { "epoch": 3.022014676450967, "loss": 0.7186385989189148, "step": 9060 }, { "ce_loss": 0.19234232604503632, "epoch": 3.022014676450967, "step": 9060 }, { "distill_loss": 0.2232697308063507, "epoch": 3.022014676450967, "step": 9060 }, { "epoch": 3.022014676450967, "ref_ce_loss": 0.11886679381132126, "step": 9060 }, { "epoch": 3.022014676450967, "loss": 0.5786302089691162, "step": 9060 }, { "ce_loss": 0.1808573454618454, "epoch": 3.022014676450967, "step": 9060 }, { "distill_loss": 0.14951898157596588, "epoch": 3.022014676450967, "step": 9060 }, { "epoch": 3.022014676450967, "ref_ce_loss": 0.14975999295711517, "step": 9060 }, { "epoch": 3.0253502334889926, "loss": 0.5207, "step": 9070 }, { "epoch": 3.0253502334889926, "grad_norm": 2.4221537113189697, "step": 9070 }, { "epoch": 3.0253502334889926, "learning_rate": 0.0002143254685062764, "step": 9070 }, { "epoch": 3.0253502334889926, "loss": 0.462746798992157, "step": 9070 }, { "ce_loss": 0.08939208835363388, "epoch": 3.0253502334889926, "step": 9070 }, { "distill_loss": 0.10813222080469131, "epoch": 3.0253502334889926, "step": 9070 }, { "epoch": 3.0253502334889926, "ref_ce_loss": 0.1629054695367813, "step": 9070 }, { "epoch": 3.0253502334889926, "loss": 0.36405521631240845, "step": 9070 }, { "ce_loss": 0.11861259490251541, "epoch": 3.0253502334889926, "step": 9070 }, { "distill_loss": 0.08977380394935608, "epoch": 3.0253502334889926, "step": 9070 }, { "epoch": 3.0253502334889926, "ref_ce_loss": 0.10912643373012543, "step": 9070 }, { "epoch": 3.0253502334889926, "loss": 0.4217601716518402, "step": 9070 }, { "ce_loss": 0.1625213623046875, "epoch": 3.0253502334889926, "step": 9070 }, { "distill_loss": 0.1497325301170349, "epoch": 3.0253502334889926, "step": 9070 }, { "epoch": 3.0253502334889926, "ref_ce_loss": 0.10908317565917969, "step": 9070 }, { "epoch": 3.0253502334889926, "loss": 0.36443212628364563, "step": 9070 }, { "ce_loss": 0.10747730731964111, "epoch": 3.0253502334889926, "step": 9070 }, { "distill_loss": 0.10043259710073471, "epoch": 3.0253502334889926, "step": 9070 }, { "epoch": 3.0253502334889926, "ref_ce_loss": 0.1053435280919075, "step": 9070 }, { "epoch": 3.028685790527018, "loss": 0.4974, "step": 9080 }, { "epoch": 3.028685790527018, "grad_norm": 2.5194292068481445, "step": 9080 }, { "epoch": 3.028685790527018, "learning_rate": 0.0002141424195362921, "step": 9080 }, { "epoch": 3.028685790527018, "loss": 0.38930782675743103, "step": 9080 }, { "ce_loss": 0.11574207991361618, "epoch": 3.028685790527018, "step": 9080 }, { "distill_loss": 0.14219531416893005, "epoch": 3.028685790527018, "step": 9080 }, { "epoch": 3.028685790527018, "ref_ce_loss": 0.09605050086975098, "step": 9080 }, { "epoch": 3.028685790527018, "loss": 0.45022615790367126, "step": 9080 }, { "ce_loss": 0.09811569005250931, "epoch": 3.028685790527018, "step": 9080 }, { "distill_loss": 0.1672498881816864, "epoch": 3.028685790527018, "step": 9080 }, { "epoch": 3.028685790527018, "ref_ce_loss": 0.13437940180301666, "step": 9080 }, { "epoch": 3.028685790527018, "loss": 0.5666844844818115, "step": 9080 }, { "ce_loss": 0.1607423722743988, "epoch": 3.028685790527018, "step": 9080 }, { "distill_loss": 0.1925409585237503, "epoch": 3.028685790527018, "step": 9080 }, { "epoch": 3.028685790527018, "ref_ce_loss": 0.15472768247127533, "step": 9080 }, { "epoch": 3.028685790527018, "loss": 0.4131091833114624, "step": 9080 }, { "ce_loss": 0.11376647651195526, "epoch": 3.028685790527018, "step": 9080 }, { "distill_loss": 0.1661110818386078, "epoch": 3.028685790527018, "step": 9080 }, { "epoch": 3.028685790527018, "ref_ce_loss": 0.06726560741662979, "step": 9080 }, { "epoch": 3.0320213475650433, "loss": 0.5143, "step": 9090 }, { "epoch": 3.0320213475650433, "grad_norm": 2.647369146347046, "step": 9090 }, { "epoch": 3.0320213475650433, "learning_rate": 0.00021395925359583666, "step": 9090 }, { "epoch": 3.0320213475650433, "loss": 0.6554050445556641, "step": 9090 }, { "ce_loss": 0.07042604684829712, "epoch": 3.0320213475650433, "step": 9090 }, { "distill_loss": 0.08944550156593323, "epoch": 3.0320213475650433, "step": 9090 }, { "epoch": 3.0320213475650433, "ref_ce_loss": 0.08455150574445724, "step": 9090 }, { "epoch": 3.0320213475650433, "loss": 0.40479806065559387, "step": 9090 }, { "ce_loss": 0.1453285664319992, "epoch": 3.0320213475650433, "step": 9090 }, { "distill_loss": 0.11391732096672058, "epoch": 3.0320213475650433, "step": 9090 }, { "epoch": 3.0320213475650433, "ref_ce_loss": 0.11537247151136398, "step": 9090 }, { "epoch": 3.0320213475650433, "loss": 0.42904889583587646, "step": 9090 }, { "ce_loss": 0.09523611515760422, "epoch": 3.0320213475650433, "step": 9090 }, { "distill_loss": 0.080131895840168, "epoch": 3.0320213475650433, "step": 9090 }, { "epoch": 3.0320213475650433, "ref_ce_loss": 0.07932931184768677, "step": 9090 }, { "epoch": 3.0320213475650433, "loss": 0.6467583179473877, "step": 9090 }, { "ce_loss": 0.12051200866699219, "epoch": 3.0320213475650433, "step": 9090 }, { "distill_loss": 0.13366849720478058, "epoch": 3.0320213475650433, "step": 9090 }, { "epoch": 3.0320213475650433, "ref_ce_loss": 0.1472989320755005, "step": 9090 }, { "epoch": 3.0353569046030686, "loss": 0.4952, "step": 9100 }, { "epoch": 3.0353569046030686, "grad_norm": 2.8011693954467773, "step": 9100 }, { "epoch": 3.0353569046030686, "learning_rate": 0.00021377597101893256, "step": 9100 }, { "epoch": 3.0353569046030686, "loss": 0.42988431453704834, "step": 9100 }, { "ce_loss": 0.17967963218688965, "epoch": 3.0353569046030686, "step": 9100 }, { "distill_loss": 0.11916738003492355, "epoch": 3.0353569046030686, "step": 9100 }, { "epoch": 3.0353569046030686, "ref_ce_loss": 0.10457290709018707, "step": 9100 }, { "epoch": 3.0353569046030686, "loss": 0.47870928049087524, "step": 9100 }, { "ce_loss": 0.12809933722019196, "epoch": 3.0353569046030686, "step": 9100 }, { "distill_loss": 0.0908149853348732, "epoch": 3.0353569046030686, "step": 9100 }, { "epoch": 3.0353569046030686, "ref_ce_loss": 0.07106465846300125, "step": 9100 }, { "epoch": 3.0353569046030686, "loss": 0.49013468623161316, "step": 9100 }, { "ce_loss": 0.09937848895788193, "epoch": 3.0353569046030686, "step": 9100 }, { "distill_loss": 0.0903325080871582, "epoch": 3.0353569046030686, "step": 9100 }, { "epoch": 3.0353569046030686, "ref_ce_loss": 0.1495119333267212, "step": 9100 }, { "epoch": 3.0353569046030686, "loss": 0.49605005979537964, "step": 9100 }, { "ce_loss": 0.11459803581237793, "epoch": 3.0353569046030686, "step": 9100 }, { "distill_loss": 0.09380602091550827, "epoch": 3.0353569046030686, "step": 9100 }, { "epoch": 3.0353569046030686, "ref_ce_loss": 0.06981277465820312, "step": 9100 }, { "epoch": 3.038692461641094, "loss": 0.501, "step": 9110 }, { "epoch": 3.038692461641094, "grad_norm": 3.9146056175231934, "step": 9110 }, { "epoch": 3.038692461641094, "learning_rate": 0.00021359257213981485, "step": 9110 }, { "epoch": 3.038692461641094, "loss": 0.9335725903511047, "step": 9110 }, { "ce_loss": 0.21147924661636353, "epoch": 3.038692461641094, "step": 9110 }, { "distill_loss": 0.17345088720321655, "epoch": 3.038692461641094, "step": 9110 }, { "epoch": 3.038692461641094, "ref_ce_loss": 0.10016033053398132, "step": 9110 }, { "epoch": 3.038692461641094, "loss": 0.5776867866516113, "step": 9110 }, { "ce_loss": 0.24646618962287903, "epoch": 3.038692461641094, "step": 9110 }, { "distill_loss": 0.1633155643939972, "epoch": 3.038692461641094, "step": 9110 }, { "epoch": 3.038692461641094, "ref_ce_loss": 0.1673925817012787, "step": 9110 }, { "epoch": 3.038692461641094, "loss": 0.5697858333587646, "step": 9110 }, { "ce_loss": 0.10325392335653305, "epoch": 3.038692461641094, "step": 9110 }, { "distill_loss": 0.26549357175827026, "epoch": 3.038692461641094, "step": 9110 }, { "epoch": 3.038692461641094, "ref_ce_loss": 0.13094070553779602, "step": 9110 }, { "epoch": 3.038692461641094, "loss": 0.44562843441963196, "step": 9110 }, { "ce_loss": 0.11659137159585953, "epoch": 3.038692461641094, "step": 9110 }, { "distill_loss": 0.2210933119058609, "epoch": 3.038692461641094, "step": 9110 }, { "epoch": 3.038692461641094, "ref_ce_loss": 0.10697054117918015, "step": 9110 }, { "epoch": 3.0420280186791193, "loss": 0.5368, "step": 9120 }, { "epoch": 3.0420280186791193, "grad_norm": 3.2923338413238525, "step": 9120 }, { "epoch": 3.0420280186791193, "learning_rate": 0.00021340905729293078, "step": 9120 }, { "epoch": 3.0420280186791193, "loss": 0.7003005743026733, "step": 9120 }, { "ce_loss": 0.2035401165485382, "epoch": 3.0420280186791193, "step": 9120 }, { "distill_loss": 0.12703534960746765, "epoch": 3.0420280186791193, "step": 9120 }, { "epoch": 3.0420280186791193, "ref_ce_loss": 0.1929311752319336, "step": 9120 }, { "epoch": 3.0420280186791193, "loss": 0.32534945011138916, "step": 9120 }, { "ce_loss": 0.09824029356241226, "epoch": 3.0420280186791193, "step": 9120 }, { "distill_loss": 0.0958671048283577, "epoch": 3.0420280186791193, "step": 9120 }, { "epoch": 3.0420280186791193, "ref_ce_loss": 0.0892966240644455, "step": 9120 }, { "epoch": 3.0420280186791193, "loss": 0.38903501629829407, "step": 9120 }, { "ce_loss": 0.15053687989711761, "epoch": 3.0420280186791193, "step": 9120 }, { "distill_loss": 0.11209321022033691, "epoch": 3.0420280186791193, "step": 9120 }, { "epoch": 3.0420280186791193, "ref_ce_loss": 0.07555312663316727, "step": 9120 }, { "epoch": 3.0420280186791193, "loss": 0.3890794515609741, "step": 9120 }, { "ce_loss": 0.07411421835422516, "epoch": 3.0420280186791193, "step": 9120 }, { "distill_loss": 0.11061069369316101, "epoch": 3.0420280186791193, "step": 9120 }, { "epoch": 3.0420280186791193, "ref_ce_loss": 0.09935502707958221, "step": 9120 }, { "epoch": 3.0453635757171447, "loss": 0.4967, "step": 9130 }, { "epoch": 3.0453635757171447, "grad_norm": 2.6918272972106934, "step": 9130 }, { "epoch": 3.0453635757171447, "learning_rate": 0.00021322542681293904, "step": 9130 }, { "epoch": 3.0453635757171447, "loss": 0.7495424747467041, "step": 9130 }, { "ce_loss": 0.14578987658023834, "epoch": 3.0453635757171447, "step": 9130 }, { "distill_loss": 0.09604738652706146, "epoch": 3.0453635757171447, "step": 9130 }, { "epoch": 3.0453635757171447, "ref_ce_loss": 0.07308385521173477, "step": 9130 }, { "epoch": 3.0453635757171447, "loss": 0.7880163788795471, "step": 9130 }, { "ce_loss": 0.12381213158369064, "epoch": 3.0453635757171447, "step": 9130 }, { "distill_loss": 0.1534123718738556, "epoch": 3.0453635757171447, "step": 9130 }, { "epoch": 3.0453635757171447, "ref_ce_loss": 0.11481275409460068, "step": 9130 }, { "epoch": 3.0453635757171447, "loss": 0.3943750262260437, "step": 9130 }, { "ce_loss": 0.12776845693588257, "epoch": 3.0453635757171447, "step": 9130 }, { "distill_loss": 0.15161047875881195, "epoch": 3.0453635757171447, "step": 9130 }, { "epoch": 3.0453635757171447, "ref_ce_loss": 0.06874366849660873, "step": 9130 }, { "epoch": 3.0453635757171447, "loss": 0.5388900637626648, "step": 9130 }, { "ce_loss": 0.22864705324172974, "epoch": 3.0453635757171447, "step": 9130 }, { "distill_loss": 0.1401529610157013, "epoch": 3.0453635757171447, "step": 9130 }, { "epoch": 3.0453635757171447, "ref_ce_loss": 0.13798050582408905, "step": 9130 }, { "epoch": 3.04869913275517, "loss": 0.4863, "step": 9140 }, { "epoch": 3.04869913275517, "grad_norm": 2.592088460922241, "step": 9140 }, { "epoch": 3.04869913275517, "learning_rate": 0.0002130416810347092, "step": 9140 }, { "epoch": 3.04869913275517, "loss": 0.5827805995941162, "step": 9140 }, { "ce_loss": 0.1532483845949173, "epoch": 3.04869913275517, "step": 9140 }, { "distill_loss": 0.1079009547829628, "epoch": 3.04869913275517, "step": 9140 }, { "epoch": 3.04869913275517, "ref_ce_loss": 0.09535907208919525, "step": 9140 }, { "epoch": 3.04869913275517, "loss": 0.45961111783981323, "step": 9140 }, { "ce_loss": 0.153269961476326, "epoch": 3.04869913275517, "step": 9140 }, { "distill_loss": 0.1302245557308197, "epoch": 3.04869913275517, "step": 9140 }, { "epoch": 3.04869913275517, "ref_ce_loss": 0.10670872777700424, "step": 9140 }, { "epoch": 3.04869913275517, "loss": 0.5426515340805054, "step": 9140 }, { "ce_loss": 0.15486785769462585, "epoch": 3.04869913275517, "step": 9140 }, { "distill_loss": 0.11330495774745941, "epoch": 3.04869913275517, "step": 9140 }, { "epoch": 3.04869913275517, "ref_ce_loss": 0.08581661432981491, "step": 9140 }, { "epoch": 3.04869913275517, "loss": 0.3043895959854126, "step": 9140 }, { "ce_loss": 0.0612226277589798, "epoch": 3.04869913275517, "step": 9140 }, { "distill_loss": 0.11302270740270615, "epoch": 3.04869913275517, "step": 9140 }, { "epoch": 3.04869913275517, "ref_ce_loss": 0.1041092649102211, "step": 9140 }, { "epoch": 3.0520346897931954, "loss": 0.4493, "step": 9150 }, { "epoch": 3.0520346897931954, "grad_norm": 2.173112630844116, "step": 9150 }, { "epoch": 3.0520346897931954, "learning_rate": 0.00021285782029332111, "step": 9150 }, { "epoch": 3.0520346897931954, "loss": 0.3466378450393677, "step": 9150 }, { "ce_loss": 0.1040438711643219, "epoch": 3.0520346897931954, "step": 9150 }, { "distill_loss": 0.10754573345184326, "epoch": 3.0520346897931954, "step": 9150 }, { "epoch": 3.0520346897931954, "ref_ce_loss": 0.10663019120693207, "step": 9150 }, { "epoch": 3.0520346897931954, "loss": 0.7064090967178345, "step": 9150 }, { "ce_loss": 0.140250563621521, "epoch": 3.0520346897931954, "step": 9150 }, { "distill_loss": 0.10913405567407608, "epoch": 3.0520346897931954, "step": 9150 }, { "epoch": 3.0520346897931954, "ref_ce_loss": 0.1309662163257599, "step": 9150 }, { "epoch": 3.0520346897931954, "loss": 0.5606474280357361, "step": 9150 }, { "ce_loss": 0.18404537439346313, "epoch": 3.0520346897931954, "step": 9150 }, { "distill_loss": 0.1043291687965393, "epoch": 3.0520346897931954, "step": 9150 }, { "epoch": 3.0520346897931954, "ref_ce_loss": 0.11552339047193527, "step": 9150 }, { "epoch": 3.0520346897931954, "loss": 0.23970846831798553, "step": 9150 }, { "ce_loss": 0.03243853524327278, "epoch": 3.0520346897931954, "step": 9150 }, { "distill_loss": 0.10297881066799164, "epoch": 3.0520346897931954, "step": 9150 }, { "epoch": 3.0520346897931954, "ref_ce_loss": 0.060380127280950546, "step": 9150 }, { "epoch": 3.0553702468312207, "loss": 0.5151, "step": 9160 }, { "epoch": 3.0553702468312207, "grad_norm": 2.2912018299102783, "step": 9160 }, { "epoch": 3.0553702468312207, "learning_rate": 0.00021267384492406415, "step": 9160 }, { "epoch": 3.0553702468312207, "loss": 0.3046942949295044, "step": 9160 }, { "ce_loss": 0.04315432533621788, "epoch": 3.0553702468312207, "step": 9160 }, { "distill_loss": 0.0770706906914711, "epoch": 3.0553702468312207, "step": 9160 }, { "epoch": 3.0553702468312207, "ref_ce_loss": 0.07259001582860947, "step": 9160 }, { "epoch": 3.0553702468312207, "loss": 0.37191644310951233, "step": 9160 }, { "ce_loss": 0.14015623927116394, "epoch": 3.0553702468312207, "step": 9160 }, { "distill_loss": 0.10944913327693939, "epoch": 3.0553702468312207, "step": 9160 }, { "epoch": 3.0553702468312207, "ref_ce_loss": 0.09285426884889603, "step": 9160 }, { "epoch": 3.0553702468312207, "loss": 0.8361793756484985, "step": 9160 }, { "ce_loss": 0.15691381692886353, "epoch": 3.0553702468312207, "step": 9160 }, { "distill_loss": 0.12314335256814957, "epoch": 3.0553702468312207, "step": 9160 }, { "epoch": 3.0553702468312207, "ref_ce_loss": 0.09802524745464325, "step": 9160 }, { "epoch": 3.0553702468312207, "loss": 0.5051029324531555, "step": 9160 }, { "ce_loss": 0.16032618284225464, "epoch": 3.0553702468312207, "step": 9160 }, { "distill_loss": 0.1252148151397705, "epoch": 3.0553702468312207, "step": 9160 }, { "epoch": 3.0553702468312207, "ref_ce_loss": 0.09847740828990936, "step": 9160 }, { "epoch": 3.058705803869246, "loss": 0.4546, "step": 9170 }, { "epoch": 3.058705803869246, "grad_norm": 2.534843921661377, "step": 9170 }, { "epoch": 3.058705803869246, "learning_rate": 0.00021248975526243682, "step": 9170 }, { "epoch": 3.058705803869246, "loss": 0.29830026626586914, "step": 9170 }, { "ce_loss": 0.0924723893404007, "epoch": 3.058705803869246, "step": 9170 }, { "distill_loss": 0.08625347167253494, "epoch": 3.058705803869246, "step": 9170 }, { "epoch": 3.058705803869246, "ref_ce_loss": 0.09278205037117004, "step": 9170 }, { "epoch": 3.058705803869246, "loss": 0.3502795398235321, "step": 9170 }, { "ce_loss": 0.06436974555253983, "epoch": 3.058705803869246, "step": 9170 }, { "distill_loss": 0.09440024197101593, "epoch": 3.058705803869246, "step": 9170 }, { "epoch": 3.058705803869246, "ref_ce_loss": 0.0780462846159935, "step": 9170 }, { "epoch": 3.058705803869246, "loss": 0.29896295070648193, "step": 9170 }, { "ce_loss": 0.0671202540397644, "epoch": 3.058705803869246, "step": 9170 }, { "distill_loss": 0.0891413614153862, "epoch": 3.058705803869246, "step": 9170 }, { "epoch": 3.058705803869246, "ref_ce_loss": 0.09291604161262512, "step": 9170 }, { "epoch": 3.058705803869246, "loss": 0.5191394090652466, "step": 9170 }, { "ce_loss": 0.15048867464065552, "epoch": 3.058705803869246, "step": 9170 }, { "distill_loss": 0.10521610081195831, "epoch": 3.058705803869246, "step": 9170 }, { "epoch": 3.058705803869246, "ref_ce_loss": 0.09934370219707489, "step": 9170 }, { "epoch": 3.0620413609072714, "loss": 0.4654, "step": 9180 }, { "epoch": 3.0620413609072714, "grad_norm": 2.8238677978515625, "step": 9180 }, { "epoch": 3.0620413609072714, "learning_rate": 0.00021230555164414614, "step": 9180 }, { "epoch": 3.0620413609072714, "loss": 0.5475714206695557, "step": 9180 }, { "ce_loss": 0.1436816155910492, "epoch": 3.0620413609072714, "step": 9180 }, { "distill_loss": 0.11422336101531982, "epoch": 3.0620413609072714, "step": 9180 }, { "epoch": 3.0620413609072714, "ref_ce_loss": 0.1417398303747177, "step": 9180 }, { "epoch": 3.0620413609072714, "loss": 0.528938889503479, "step": 9180 }, { "ce_loss": 0.12694130837917328, "epoch": 3.0620413609072714, "step": 9180 }, { "distill_loss": 0.13421253859996796, "epoch": 3.0620413609072714, "step": 9180 }, { "epoch": 3.0620413609072714, "ref_ce_loss": 0.11657682061195374, "step": 9180 }, { "epoch": 3.0620413609072714, "loss": 0.7762036323547363, "step": 9180 }, { "ce_loss": 0.1628248542547226, "epoch": 3.0620413609072714, "step": 9180 }, { "distill_loss": 0.10600240528583527, "epoch": 3.0620413609072714, "step": 9180 }, { "epoch": 3.0620413609072714, "ref_ce_loss": 0.1570037305355072, "step": 9180 }, { "epoch": 3.0620413609072714, "loss": 0.35722458362579346, "step": 9180 }, { "ce_loss": 0.09392691403627396, "epoch": 3.0620413609072714, "step": 9180 }, { "distill_loss": 0.08912178874015808, "epoch": 3.0620413609072714, "step": 9180 }, { "epoch": 3.0620413609072714, "ref_ce_loss": 0.10203284025192261, "step": 9180 }, { "epoch": 3.0653769179452968, "loss": 0.4791, "step": 9190 }, { "epoch": 3.0653769179452968, "grad_norm": 2.509390354156494, "step": 9190 }, { "epoch": 3.0653769179452968, "learning_rate": 0.00021212123440510683, "step": 9190 }, { "epoch": 3.0653769179452968, "loss": 0.4448990225791931, "step": 9190 }, { "ce_loss": 0.15895746648311615, "epoch": 3.0653769179452968, "step": 9190 }, { "distill_loss": 0.13571859896183014, "epoch": 3.0653769179452968, "step": 9190 }, { "epoch": 3.0653769179452968, "ref_ce_loss": 0.10972491651773453, "step": 9190 }, { "epoch": 3.0653769179452968, "loss": 0.4973427653312683, "step": 9190 }, { "ce_loss": 0.09390971809625626, "epoch": 3.0653769179452968, "step": 9190 }, { "distill_loss": 0.1523730307817459, "epoch": 3.0653769179452968, "step": 9190 }, { "epoch": 3.0653769179452968, "ref_ce_loss": 0.09280847012996674, "step": 9190 }, { "epoch": 3.0653769179452968, "loss": 0.30266839265823364, "step": 9190 }, { "ce_loss": 0.08505178987979889, "epoch": 3.0653769179452968, "step": 9190 }, { "distill_loss": 0.09608002007007599, "epoch": 3.0653769179452968, "step": 9190 }, { "epoch": 3.0653769179452968, "ref_ce_loss": 0.0848335325717926, "step": 9190 }, { "epoch": 3.0653769179452968, "loss": 0.5422333478927612, "step": 9190 }, { "ce_loss": 0.2144017219543457, "epoch": 3.0653769179452968, "step": 9190 }, { "distill_loss": 0.16743631660938263, "epoch": 3.0653769179452968, "step": 9190 }, { "epoch": 3.0653769179452968, "ref_ce_loss": 0.07784318923950195, "step": 9190 }, { "epoch": 3.068712474983322, "loss": 0.5275, "step": 9200 }, { "epoch": 3.068712474983322, "grad_norm": 2.0295801162719727, "step": 9200 }, { "epoch": 3.068712474983322, "learning_rate": 0.00021193680388144074, "step": 9200 }, { "epoch": 3.068712474983322, "loss": 0.43971535563468933, "step": 9200 }, { "ce_loss": 0.10325153172016144, "epoch": 3.068712474983322, "step": 9200 }, { "distill_loss": 0.12678496539592743, "epoch": 3.068712474983322, "step": 9200 }, { "epoch": 3.068712474983322, "ref_ce_loss": 0.14589820802211761, "step": 9200 }, { "epoch": 3.068712474983322, "loss": 0.38762301206588745, "step": 9200 }, { "ce_loss": 0.08085848391056061, "epoch": 3.068712474983322, "step": 9200 }, { "distill_loss": 0.0924757868051529, "epoch": 3.068712474983322, "step": 9200 }, { "epoch": 3.068712474983322, "ref_ce_loss": 0.11108756065368652, "step": 9200 }, { "epoch": 3.068712474983322, "loss": 0.3827369213104248, "step": 9200 }, { "ce_loss": 0.06147749722003937, "epoch": 3.068712474983322, "step": 9200 }, { "distill_loss": 0.15130433440208435, "epoch": 3.068712474983322, "step": 9200 }, { "epoch": 3.068712474983322, "ref_ce_loss": 0.10998158156871796, "step": 9200 }, { "epoch": 3.068712474983322, "loss": 0.36490562558174133, "step": 9200 }, { "ce_loss": 0.1082155779004097, "epoch": 3.068712474983322, "step": 9200 }, { "distill_loss": 0.12540577352046967, "epoch": 3.068712474983322, "step": 9200 }, { "epoch": 3.068712474983322, "ref_ce_loss": 0.08237636089324951, "step": 9200 }, { "epoch": 3.0720480320213475, "loss": 0.4654, "step": 9210 }, { "epoch": 3.0720480320213475, "grad_norm": 3.6549220085144043, "step": 9210 }, { "epoch": 3.0720480320213475, "learning_rate": 0.00021175226040947643, "step": 9210 }, { "epoch": 3.0720480320213475, "loss": 0.3767533004283905, "step": 9210 }, { "ce_loss": 0.030972249805927277, "epoch": 3.0720480320213475, "step": 9210 }, { "distill_loss": 0.10726246237754822, "epoch": 3.0720480320213475, "step": 9210 }, { "epoch": 3.0720480320213475, "ref_ce_loss": 0.06324587017297745, "step": 9210 }, { "epoch": 3.0720480320213475, "loss": 0.35787636041641235, "step": 9210 }, { "ce_loss": 0.10263849794864655, "epoch": 3.0720480320213475, "step": 9210 }, { "distill_loss": 0.1269713193178177, "epoch": 3.0720480320213475, "step": 9210 }, { "epoch": 3.0720480320213475, "ref_ce_loss": 0.09760292619466782, "step": 9210 }, { "epoch": 3.0720480320213475, "loss": 0.7159144878387451, "step": 9210 }, { "ce_loss": 0.19792456924915314, "epoch": 3.0720480320213475, "step": 9210 }, { "distill_loss": 0.1999223232269287, "epoch": 3.0720480320213475, "step": 9210 }, { "epoch": 3.0720480320213475, "ref_ce_loss": 0.09326591342687607, "step": 9210 }, { "epoch": 3.0720480320213475, "loss": 0.45586901903152466, "step": 9210 }, { "ce_loss": 0.11067557334899902, "epoch": 3.0720480320213475, "step": 9210 }, { "distill_loss": 0.1134071871638298, "epoch": 3.0720480320213475, "step": 9210 }, { "epoch": 3.0720480320213475, "ref_ce_loss": 0.10379556566476822, "step": 9210 }, { "epoch": 3.075383589059373, "loss": 0.4981, "step": 9220 }, { "epoch": 3.075383589059373, "grad_norm": 3.2063794136047363, "step": 9220 }, { "epoch": 3.075383589059373, "learning_rate": 0.00021156760432574845, "step": 9220 }, { "epoch": 3.075383589059373, "loss": 0.6135134696960449, "step": 9220 }, { "ce_loss": 0.18632878363132477, "epoch": 3.075383589059373, "step": 9220 }, { "distill_loss": 0.1343146711587906, "epoch": 3.075383589059373, "step": 9220 }, { "epoch": 3.075383589059373, "ref_ce_loss": 0.12228333950042725, "step": 9220 }, { "epoch": 3.075383589059373, "loss": 0.7215794324874878, "step": 9220 }, { "ce_loss": 0.17847248911857605, "epoch": 3.075383589059373, "step": 9220 }, { "distill_loss": 0.14979347586631775, "epoch": 3.075383589059373, "step": 9220 }, { "epoch": 3.075383589059373, "ref_ce_loss": 0.13431112468242645, "step": 9220 }, { "epoch": 3.075383589059373, "loss": 0.5225598812103271, "step": 9220 }, { "ce_loss": 0.14933809638023376, "epoch": 3.075383589059373, "step": 9220 }, { "distill_loss": 0.14387843012809753, "epoch": 3.075383589059373, "step": 9220 }, { "epoch": 3.075383589059373, "ref_ce_loss": 0.09537240117788315, "step": 9220 }, { "epoch": 3.075383589059373, "loss": 0.648240327835083, "step": 9220 }, { "ce_loss": 0.0898180678486824, "epoch": 3.075383589059373, "step": 9220 }, { "distill_loss": 0.12337978929281235, "epoch": 3.075383589059373, "step": 9220 }, { "epoch": 3.075383589059373, "ref_ce_loss": 0.11949852854013443, "step": 9220 }, { "epoch": 3.078719146097398, "loss": 0.5397, "step": 9230 }, { "epoch": 3.078719146097398, "grad_norm": 4.953123569488525, "step": 9230 }, { "epoch": 3.078719146097398, "learning_rate": 0.00021138283596699658, "step": 9230 }, { "epoch": 3.078719146097398, "loss": 1.1860730648040771, "step": 9230 }, { "ce_loss": 0.2039407640695572, "epoch": 3.078719146097398, "step": 9230 }, { "distill_loss": 0.11151131242513657, "epoch": 3.078719146097398, "step": 9230 }, { "epoch": 3.078719146097398, "ref_ce_loss": 0.17183974385261536, "step": 9230 }, { "epoch": 3.078719146097398, "loss": 0.3743239939212799, "step": 9230 }, { "ce_loss": 0.08552516996860504, "epoch": 3.078719146097398, "step": 9230 }, { "distill_loss": 0.09078429639339447, "epoch": 3.078719146097398, "step": 9230 }, { "epoch": 3.078719146097398, "ref_ce_loss": 0.0932949110865593, "step": 9230 }, { "epoch": 3.078719146097398, "loss": 0.3618348240852356, "step": 9230 }, { "ce_loss": 0.09893414378166199, "epoch": 3.078719146097398, "step": 9230 }, { "distill_loss": 0.101051926612854, "epoch": 3.078719146097398, "step": 9230 }, { "epoch": 3.078719146097398, "ref_ce_loss": 0.1281239241361618, "step": 9230 }, { "epoch": 3.078719146097398, "loss": 0.45348069071769714, "step": 9230 }, { "ce_loss": 0.15692560374736786, "epoch": 3.078719146097398, "step": 9230 }, { "distill_loss": 0.1507617086172104, "epoch": 3.078719146097398, "step": 9230 }, { "epoch": 3.078719146097398, "ref_ce_loss": 0.0963934138417244, "step": 9230 }, { "epoch": 3.0820547031354235, "loss": 0.5099, "step": 9240 }, { "epoch": 3.0820547031354235, "grad_norm": 6.172658920288086, "step": 9240 }, { "epoch": 3.0820547031354235, "learning_rate": 0.00021119795567016553, "step": 9240 }, { "epoch": 3.0820547031354235, "loss": 0.77073734998703, "step": 9240 }, { "ce_loss": 0.19863741099834442, "epoch": 3.0820547031354235, "step": 9240 }, { "distill_loss": 0.13241790235042572, "epoch": 3.0820547031354235, "step": 9240 }, { "epoch": 3.0820547031354235, "ref_ce_loss": 0.10125939548015594, "step": 9240 }, { "epoch": 3.0820547031354235, "loss": 0.6127821207046509, "step": 9240 }, { "ce_loss": 0.19738160073757172, "epoch": 3.0820547031354235, "step": 9240 }, { "distill_loss": 0.13014522194862366, "epoch": 3.0820547031354235, "step": 9240 }, { "epoch": 3.0820547031354235, "ref_ce_loss": 0.15752890706062317, "step": 9240 }, { "epoch": 3.0820547031354235, "loss": 0.5437523722648621, "step": 9240 }, { "ce_loss": 0.17684681713581085, "epoch": 3.0820547031354235, "step": 9240 }, { "distill_loss": 0.09984688460826874, "epoch": 3.0820547031354235, "step": 9240 }, { "epoch": 3.0820547031354235, "ref_ce_loss": 0.08175192773342133, "step": 9240 }, { "epoch": 3.0820547031354235, "loss": 0.7698802351951599, "step": 9240 }, { "ce_loss": 0.10929053276777267, "epoch": 3.0820547031354235, "step": 9240 }, { "distill_loss": 0.0947088897228241, "epoch": 3.0820547031354235, "step": 9240 }, { "epoch": 3.0820547031354235, "ref_ce_loss": 0.10278258472681046, "step": 9240 }, { "epoch": 3.085390260173449, "loss": 0.5153, "step": 9250 }, { "epoch": 3.085390260173449, "grad_norm": 4.075594902038574, "step": 9250 }, { "epoch": 3.085390260173449, "learning_rate": 0.00021101296377240388, "step": 9250 }, { "epoch": 3.085390260173449, "loss": 0.3964616656303406, "step": 9250 }, { "ce_loss": 0.1403404325246811, "epoch": 3.085390260173449, "step": 9250 }, { "distill_loss": 0.10999090224504471, "epoch": 3.085390260173449, "step": 9250 }, { "epoch": 3.085390260173449, "ref_ce_loss": 0.09092675149440765, "step": 9250 }, { "epoch": 3.085390260173449, "loss": 0.31418895721435547, "step": 9250 }, { "ce_loss": 0.12451020628213882, "epoch": 3.085390260173449, "step": 9250 }, { "distill_loss": 0.104770727455616, "epoch": 3.085390260173449, "step": 9250 }, { "epoch": 3.085390260173449, "ref_ce_loss": 0.06058081239461899, "step": 9250 }, { "epoch": 3.085390260173449, "loss": 0.612764835357666, "step": 9250 }, { "ce_loss": 0.16425681114196777, "epoch": 3.085390260173449, "step": 9250 }, { "distill_loss": 0.14076608419418335, "epoch": 3.085390260173449, "step": 9250 }, { "epoch": 3.085390260173449, "ref_ce_loss": 0.14075103402137756, "step": 9250 }, { "epoch": 3.085390260173449, "loss": 0.45888441801071167, "step": 9250 }, { "ce_loss": 0.16245479881763458, "epoch": 3.085390260173449, "step": 9250 }, { "distill_loss": 0.13823184370994568, "epoch": 3.085390260173449, "step": 9250 }, { "epoch": 3.085390260173449, "ref_ce_loss": 0.11153128743171692, "step": 9250 }, { "epoch": 3.088725817211474, "loss": 0.5267, "step": 9260 }, { "epoch": 3.088725817211474, "grad_norm": 2.336992025375366, "step": 9260 }, { "epoch": 3.088725817211474, "learning_rate": 0.00021082786061106401, "step": 9260 }, { "epoch": 3.088725817211474, "loss": 0.24876004457473755, "step": 9260 }, { "ce_loss": 0.08521168678998947, "epoch": 3.088725817211474, "step": 9260 }, { "distill_loss": 0.07871997356414795, "epoch": 3.088725817211474, "step": 9260 }, { "epoch": 3.088725817211474, "ref_ce_loss": 0.08447657525539398, "step": 9260 }, { "epoch": 3.088725817211474, "loss": 1.1503713130950928, "step": 9260 }, { "ce_loss": 0.17891433835029602, "epoch": 3.088725817211474, "step": 9260 }, { "distill_loss": 0.12399543821811676, "epoch": 3.088725817211474, "step": 9260 }, { "epoch": 3.088725817211474, "ref_ce_loss": 0.11212249100208282, "step": 9260 }, { "epoch": 3.088725817211474, "loss": 0.5678211450576782, "step": 9260 }, { "ce_loss": 0.164375901222229, "epoch": 3.088725817211474, "step": 9260 }, { "distill_loss": 0.12175918370485306, "epoch": 3.088725817211474, "step": 9260 }, { "epoch": 3.088725817211474, "ref_ce_loss": 0.1290905773639679, "step": 9260 }, { "epoch": 3.088725817211474, "loss": 0.5536719560623169, "step": 9260 }, { "ce_loss": 0.13819049298763275, "epoch": 3.088725817211474, "step": 9260 }, { "distill_loss": 0.0980941578745842, "epoch": 3.088725817211474, "step": 9260 }, { "epoch": 3.088725817211474, "ref_ce_loss": 0.09547659754753113, "step": 9260 }, { "epoch": 3.0920613742494996, "loss": 0.5244, "step": 9270 }, { "epoch": 3.0920613742494996, "grad_norm": 1.5601093769073486, "step": 9270 }, { "epoch": 3.0920613742494996, "learning_rate": 0.000210642646523701, "step": 9270 }, { "epoch": 3.0920613742494996, "loss": 0.4849652647972107, "step": 9270 }, { "ce_loss": 0.09189176559448242, "epoch": 3.0920613742494996, "step": 9270 }, { "distill_loss": 0.11284545809030533, "epoch": 3.0920613742494996, "step": 9270 }, { "epoch": 3.0920613742494996, "ref_ce_loss": 0.12299380451440811, "step": 9270 }, { "epoch": 3.0920613742494996, "loss": 0.44110724329948425, "step": 9270 }, { "ce_loss": 0.15729226171970367, "epoch": 3.0920613742494996, "step": 9270 }, { "distill_loss": 0.11161144822835922, "epoch": 3.0920613742494996, "step": 9270 }, { "epoch": 3.0920613742494996, "ref_ce_loss": 0.1129552498459816, "step": 9270 }, { "epoch": 3.0920613742494996, "loss": 0.38249096274375916, "step": 9270 }, { "ce_loss": 0.08966681361198425, "epoch": 3.0920613742494996, "step": 9270 }, { "distill_loss": 0.11741229146718979, "epoch": 3.0920613742494996, "step": 9270 }, { "epoch": 3.0920613742494996, "ref_ce_loss": 0.07501877099275589, "step": 9270 }, { "epoch": 3.0920613742494996, "loss": 0.6309377551078796, "step": 9270 }, { "ce_loss": 0.1539163440465927, "epoch": 3.0920613742494996, "step": 9270 }, { "distill_loss": 0.1314060539007187, "epoch": 3.0920613742494996, "step": 9270 }, { "epoch": 3.0920613742494996, "ref_ce_loss": 0.12064765393733978, "step": 9270 }, { "epoch": 3.095396931287525, "loss": 0.5212, "step": 9280 }, { "epoch": 3.095396931287525, "grad_norm": 2.5884647369384766, "step": 9280 }, { "epoch": 3.095396931287525, "learning_rate": 0.0002104573218480723, "step": 9280 }, { "epoch": 3.095396931287525, "loss": 0.47474294900894165, "step": 9280 }, { "ce_loss": 0.15236707031726837, "epoch": 3.095396931287525, "step": 9280 }, { "distill_loss": 0.07646772265434265, "epoch": 3.095396931287525, "step": 9280 }, { "epoch": 3.095396931287525, "ref_ce_loss": 0.09920775145292282, "step": 9280 }, { "epoch": 3.095396931287525, "loss": 0.4098622500896454, "step": 9280 }, { "ce_loss": 0.17156140506267548, "epoch": 3.095396931287525, "step": 9280 }, { "distill_loss": 0.1252693384885788, "epoch": 3.095396931287525, "step": 9280 }, { "epoch": 3.095396931287525, "ref_ce_loss": 0.11265676468610764, "step": 9280 }, { "epoch": 3.095396931287525, "loss": 0.27114593982696533, "step": 9280 }, { "ce_loss": 0.10317492485046387, "epoch": 3.095396931287525, "step": 9280 }, { "distill_loss": 0.08390937000513077, "epoch": 3.095396931287525, "step": 9280 }, { "epoch": 3.095396931287525, "ref_ce_loss": 0.08384941518306732, "step": 9280 }, { "epoch": 3.095396931287525, "loss": 0.4030131995677948, "step": 9280 }, { "ce_loss": 0.1477421671152115, "epoch": 3.095396931287525, "step": 9280 }, { "distill_loss": 0.11889496445655823, "epoch": 3.095396931287525, "step": 9280 }, { "epoch": 3.095396931287525, "ref_ce_loss": 0.08689527958631516, "step": 9280 }, { "epoch": 3.0987324883255503, "loss": 0.4677, "step": 9290 }, { "epoch": 3.0987324883255503, "grad_norm": 2.5747299194335938, "step": 9290 }, { "epoch": 3.0987324883255503, "learning_rate": 0.00021027188692213702, "step": 9290 }, { "epoch": 3.0987324883255503, "loss": 0.44657784700393677, "step": 9290 }, { "ce_loss": 0.06276731193065643, "epoch": 3.0987324883255503, "step": 9290 }, { "distill_loss": 0.10876451432704926, "epoch": 3.0987324883255503, "step": 9290 }, { "epoch": 3.0987324883255503, "ref_ce_loss": 0.12008004635572433, "step": 9290 }, { "epoch": 3.0987324883255503, "loss": 0.4799278676509857, "step": 9290 }, { "ce_loss": 0.12068340927362442, "epoch": 3.0987324883255503, "step": 9290 }, { "distill_loss": 0.13862359523773193, "epoch": 3.0987324883255503, "step": 9290 }, { "epoch": 3.0987324883255503, "ref_ce_loss": 0.09833888709545135, "step": 9290 }, { "epoch": 3.0987324883255503, "loss": 0.4970934987068176, "step": 9290 }, { "ce_loss": 0.20263740420341492, "epoch": 3.0987324883255503, "step": 9290 }, { "distill_loss": 0.14431583881378174, "epoch": 3.0987324883255503, "step": 9290 }, { "epoch": 3.0987324883255503, "ref_ce_loss": 0.11357313394546509, "step": 9290 }, { "epoch": 3.0987324883255503, "loss": 0.5054774284362793, "step": 9290 }, { "ce_loss": 0.18168766796588898, "epoch": 3.0987324883255503, "step": 9290 }, { "distill_loss": 0.1163872703909874, "epoch": 3.0987324883255503, "step": 9290 }, { "epoch": 3.0987324883255503, "ref_ce_loss": 0.12051656097173691, "step": 9290 }, { "epoch": 3.1020680453635756, "loss": 0.4978, "step": 9300 }, { "epoch": 3.1020680453635756, "grad_norm": 3.1117746829986572, "step": 9300 }, { "epoch": 3.1020680453635756, "learning_rate": 0.00021008634208405532, "step": 9300 }, { "epoch": 3.1020680453635756, "loss": 0.6433578133583069, "step": 9300 }, { "ce_loss": 0.17003703117370605, "epoch": 3.1020680453635756, "step": 9300 }, { "distill_loss": 0.18623663485050201, "epoch": 3.1020680453635756, "step": 9300 }, { "epoch": 3.1020680453635756, "ref_ce_loss": 0.13352155685424805, "step": 9300 }, { "epoch": 3.1020680453635756, "loss": 0.7382215261459351, "step": 9300 }, { "ce_loss": 0.12075452506542206, "epoch": 3.1020680453635756, "step": 9300 }, { "distill_loss": 0.11009524762630463, "epoch": 3.1020680453635756, "step": 9300 }, { "epoch": 3.1020680453635756, "ref_ce_loss": 0.08462850004434586, "step": 9300 }, { "epoch": 3.1020680453635756, "loss": 1.0106620788574219, "step": 9300 }, { "ce_loss": 0.19730034470558167, "epoch": 3.1020680453635756, "step": 9300 }, { "distill_loss": 0.19440525770187378, "epoch": 3.1020680453635756, "step": 9300 }, { "epoch": 3.1020680453635756, "ref_ce_loss": 0.14143586158752441, "step": 9300 }, { "epoch": 3.1020680453635756, "loss": 0.2682120203971863, "step": 9300 }, { "ce_loss": 0.03231796994805336, "epoch": 3.1020680453635756, "step": 9300 }, { "distill_loss": 0.09826377779245377, "epoch": 3.1020680453635756, "step": 9300 }, { "epoch": 3.1020680453635756, "ref_ce_loss": 0.0769156664609909, "step": 9300 }, { "epoch": 3.105403602401601, "loss": 0.4805, "step": 9310 }, { "epoch": 3.105403602401601, "grad_norm": 2.6101269721984863, "step": 9310 }, { "epoch": 3.105403602401601, "learning_rate": 0.00020990068767218778, "step": 9310 }, { "epoch": 3.105403602401601, "loss": 0.6195972561836243, "step": 9310 }, { "ce_loss": 0.24588622152805328, "epoch": 3.105403602401601, "step": 9310 }, { "distill_loss": 0.16623997688293457, "epoch": 3.105403602401601, "step": 9310 }, { "epoch": 3.105403602401601, "ref_ce_loss": 0.144457146525383, "step": 9310 }, { "epoch": 3.105403602401601, "loss": 0.3230529725551605, "step": 9310 }, { "ce_loss": 0.05365613102912903, "epoch": 3.105403602401601, "step": 9310 }, { "distill_loss": 0.11842749267816544, "epoch": 3.105403602401601, "step": 9310 }, { "epoch": 3.105403602401601, "ref_ce_loss": 0.09812147170305252, "step": 9310 }, { "epoch": 3.105403602401601, "loss": 0.47844380140304565, "step": 9310 }, { "ce_loss": 0.20352646708488464, "epoch": 3.105403602401601, "step": 9310 }, { "distill_loss": 0.16697512567043304, "epoch": 3.105403602401601, "step": 9310 }, { "epoch": 3.105403602401601, "ref_ce_loss": 0.0778585821390152, "step": 9310 }, { "epoch": 3.105403602401601, "loss": 0.5424474477767944, "step": 9310 }, { "ce_loss": 0.176710307598114, "epoch": 3.105403602401601, "step": 9310 }, { "distill_loss": 0.11072219163179398, "epoch": 3.105403602401601, "step": 9310 }, { "epoch": 3.105403602401601, "ref_ce_loss": 0.11033220589160919, "step": 9310 }, { "epoch": 3.1087391594396263, "loss": 0.5438, "step": 9320 }, { "epoch": 3.1087391594396263, "grad_norm": 2.295335292816162, "step": 9320 }, { "epoch": 3.1087391594396263, "learning_rate": 0.00020971492402509483, "step": 9320 }, { "epoch": 3.1087391594396263, "loss": 0.8378156423568726, "step": 9320 }, { "ce_loss": 0.16845764219760895, "epoch": 3.1087391594396263, "step": 9320 }, { "distill_loss": 0.16372105479240417, "epoch": 3.1087391594396263, "step": 9320 }, { "epoch": 3.1087391594396263, "ref_ce_loss": 0.12829598784446716, "step": 9320 }, { "epoch": 3.1087391594396263, "loss": 0.463933527469635, "step": 9320 }, { "ce_loss": 0.1278490573167801, "epoch": 3.1087391594396263, "step": 9320 }, { "distill_loss": 0.12349837273359299, "epoch": 3.1087391594396263, "step": 9320 }, { "epoch": 3.1087391594396263, "ref_ce_loss": 0.08968111127614975, "step": 9320 }, { "epoch": 3.1087391594396263, "loss": 0.4199450612068176, "step": 9320 }, { "ce_loss": 0.11417478322982788, "epoch": 3.1087391594396263, "step": 9320 }, { "distill_loss": 0.13870379328727722, "epoch": 3.1087391594396263, "step": 9320 }, { "epoch": 3.1087391594396263, "ref_ce_loss": 0.11972949653863907, "step": 9320 }, { "epoch": 3.1087391594396263, "loss": 0.5914303064346313, "step": 9320 }, { "ce_loss": 0.14238835871219635, "epoch": 3.1087391594396263, "step": 9320 }, { "distill_loss": 0.12284432351589203, "epoch": 3.1087391594396263, "step": 9320 }, { "epoch": 3.1087391594396263, "ref_ce_loss": 0.07381831109523773, "step": 9320 }, { "epoch": 3.1120747164776517, "loss": 0.547, "step": 9330 }, { "epoch": 3.1120747164776517, "grad_norm": 2.3080222606658936, "step": 9330 }, { "epoch": 3.1120747164776517, "learning_rate": 0.00020952905148153607, "step": 9330 }, { "epoch": 3.1120747164776517, "loss": 0.8455989360809326, "step": 9330 }, { "ce_loss": 0.17443938553333282, "epoch": 3.1120747164776517, "step": 9330 }, { "distill_loss": 0.10574007034301758, "epoch": 3.1120747164776517, "step": 9330 }, { "epoch": 3.1120747164776517, "ref_ce_loss": 0.07656016945838928, "step": 9330 }, { "epoch": 3.1120747164776517, "loss": 0.6197810173034668, "step": 9330 }, { "ce_loss": 0.15716618299484253, "epoch": 3.1120747164776517, "step": 9330 }, { "distill_loss": 0.17074379324913025, "epoch": 3.1120747164776517, "step": 9330 }, { "epoch": 3.1120747164776517, "ref_ce_loss": 0.16151253879070282, "step": 9330 }, { "epoch": 3.1120747164776517, "loss": 0.4124746322631836, "step": 9330 }, { "ce_loss": 0.1550775021314621, "epoch": 3.1120747164776517, "step": 9330 }, { "distill_loss": 0.12570025026798248, "epoch": 3.1120747164776517, "step": 9330 }, { "epoch": 3.1120747164776517, "ref_ce_loss": 0.07098022103309631, "step": 9330 }, { "epoch": 3.1120747164776517, "loss": 0.8346481323242188, "step": 9330 }, { "ce_loss": 0.18216295540332794, "epoch": 3.1120747164776517, "step": 9330 }, { "distill_loss": 0.10690856724977493, "epoch": 3.1120747164776517, "step": 9330 }, { "epoch": 3.1120747164776517, "ref_ce_loss": 0.13296154141426086, "step": 9330 }, { "epoch": 3.115410273515677, "loss": 0.4797, "step": 9340 }, { "epoch": 3.115410273515677, "grad_norm": 2.3242478370666504, "step": 9340 }, { "epoch": 3.115410273515677, "learning_rate": 0.00020934307038046965, "step": 9340 }, { "epoch": 3.115410273515677, "loss": 0.3541427552700043, "step": 9340 }, { "ce_loss": 0.1066516563296318, "epoch": 3.115410273515677, "step": 9340 }, { "distill_loss": 0.1119885966181755, "epoch": 3.115410273515677, "step": 9340 }, { "epoch": 3.115410273515677, "ref_ce_loss": 0.08281800895929337, "step": 9340 }, { "epoch": 3.115410273515677, "loss": 0.788700520992279, "step": 9340 }, { "ce_loss": 0.2846371829509735, "epoch": 3.115410273515677, "step": 9340 }, { "distill_loss": 0.2575102746486664, "epoch": 3.115410273515677, "step": 9340 }, { "epoch": 3.115410273515677, "ref_ce_loss": 0.1448318362236023, "step": 9340 }, { "epoch": 3.115410273515677, "loss": 0.48121488094329834, "step": 9340 }, { "ce_loss": 0.1438976377248764, "epoch": 3.115410273515677, "step": 9340 }, { "distill_loss": 0.17746815085411072, "epoch": 3.115410273515677, "step": 9340 }, { "epoch": 3.115410273515677, "ref_ce_loss": 0.10531459003686905, "step": 9340 }, { "epoch": 3.115410273515677, "loss": 0.30875542759895325, "step": 9340 }, { "ce_loss": 0.09192804992198944, "epoch": 3.115410273515677, "step": 9340 }, { "distill_loss": 0.10150475800037384, "epoch": 3.115410273515677, "step": 9340 }, { "epoch": 3.115410273515677, "ref_ce_loss": 0.08724801987409592, "step": 9340 }, { "epoch": 3.1187458305537024, "loss": 0.4828, "step": 9350 }, { "epoch": 3.1187458305537024, "grad_norm": 3.029604434967041, "step": 9350 }, { "epoch": 3.1187458305537024, "learning_rate": 0.00020915698106105187, "step": 9350 }, { "epoch": 3.1187458305537024, "loss": 0.46674320101737976, "step": 9350 }, { "ce_loss": 0.15375635027885437, "epoch": 3.1187458305537024, "step": 9350 }, { "distill_loss": 0.20978285372257233, "epoch": 3.1187458305537024, "step": 9350 }, { "epoch": 3.1187458305537024, "ref_ce_loss": 0.10224690288305283, "step": 9350 }, { "epoch": 3.1187458305537024, "loss": 0.5229172706604004, "step": 9350 }, { "ce_loss": 0.16858038306236267, "epoch": 3.1187458305537024, "step": 9350 }, { "distill_loss": 0.21174737811088562, "epoch": 3.1187458305537024, "step": 9350 }, { "epoch": 3.1187458305537024, "ref_ce_loss": 0.10061193257570267, "step": 9350 }, { "epoch": 3.1187458305537024, "loss": 0.7606104016304016, "step": 9350 }, { "ce_loss": 0.3095298707485199, "epoch": 3.1187458305537024, "step": 9350 }, { "distill_loss": 0.1703067272901535, "epoch": 3.1187458305537024, "step": 9350 }, { "epoch": 3.1187458305537024, "ref_ce_loss": 0.2030133306980133, "step": 9350 }, { "epoch": 3.1187458305537024, "loss": 0.8234333992004395, "step": 9350 }, { "ce_loss": 0.09284783899784088, "epoch": 3.1187458305537024, "step": 9350 }, { "distill_loss": 0.19337543845176697, "epoch": 3.1187458305537024, "step": 9350 }, { "epoch": 3.1187458305537024, "ref_ce_loss": 0.07284106314182281, "step": 9350 }, { "epoch": 3.1220813875917277, "loss": 0.5192, "step": 9360 }, { "epoch": 3.1220813875917277, "grad_norm": 2.5633559226989746, "step": 9360 }, { "epoch": 3.1220813875917277, "learning_rate": 0.00020897078386263615, "step": 9360 }, { "epoch": 3.1220813875917277, "loss": 0.4884679615497589, "step": 9360 }, { "ce_loss": 0.12888407707214355, "epoch": 3.1220813875917277, "step": 9360 }, { "distill_loss": 0.17425504326820374, "epoch": 3.1220813875917277, "step": 9360 }, { "epoch": 3.1220813875917277, "ref_ce_loss": 0.1417858600616455, "step": 9360 }, { "epoch": 3.1220813875917277, "loss": 0.6288591623306274, "step": 9360 }, { "ce_loss": 0.10382901132106781, "epoch": 3.1220813875917277, "step": 9360 }, { "distill_loss": 0.204414501786232, "epoch": 3.1220813875917277, "step": 9360 }, { "epoch": 3.1220813875917277, "ref_ce_loss": 0.12629245221614838, "step": 9360 }, { "epoch": 3.1220813875917277, "loss": 0.5769619941711426, "step": 9360 }, { "ce_loss": 0.2375664860010147, "epoch": 3.1220813875917277, "step": 9360 }, { "distill_loss": 0.16669347882270813, "epoch": 3.1220813875917277, "step": 9360 }, { "epoch": 3.1220813875917277, "ref_ce_loss": 0.1284281313419342, "step": 9360 }, { "epoch": 3.1220813875917277, "loss": 0.5622938871383667, "step": 9360 }, { "ce_loss": 0.16985753178596497, "epoch": 3.1220813875917277, "step": 9360 }, { "distill_loss": 0.17187856137752533, "epoch": 3.1220813875917277, "step": 9360 }, { "epoch": 3.1220813875917277, "ref_ce_loss": 0.17148420214653015, "step": 9360 }, { "epoch": 3.125416944629753, "loss": 0.4967, "step": 9370 }, { "epoch": 3.125416944629753, "grad_norm": 2.569465398788452, "step": 9370 }, { "epoch": 3.125416944629753, "learning_rate": 0.00020878447912477268, "step": 9370 }, { "epoch": 3.125416944629753, "loss": 0.4813728332519531, "step": 9370 }, { "ce_loss": 0.2246464639902115, "epoch": 3.125416944629753, "step": 9370 }, { "distill_loss": 0.167310893535614, "epoch": 3.125416944629753, "step": 9370 }, { "epoch": 3.125416944629753, "ref_ce_loss": 0.08932902663946152, "step": 9370 }, { "epoch": 3.125416944629753, "loss": 0.3730131983757019, "step": 9370 }, { "ce_loss": 0.09139455109834671, "epoch": 3.125416944629753, "step": 9370 }, { "distill_loss": 0.1308048963546753, "epoch": 3.125416944629753, "step": 9370 }, { "epoch": 3.125416944629753, "ref_ce_loss": 0.10961811244487762, "step": 9370 }, { "epoch": 3.125416944629753, "loss": 0.3340955078601837, "step": 9370 }, { "ce_loss": 0.10592299699783325, "epoch": 3.125416944629753, "step": 9370 }, { "distill_loss": 0.14246045053005219, "epoch": 3.125416944629753, "step": 9370 }, { "epoch": 3.125416944629753, "ref_ce_loss": 0.0574227012693882, "step": 9370 }, { "epoch": 3.125416944629753, "loss": 0.3670573830604553, "step": 9370 }, { "ce_loss": 0.1103769838809967, "epoch": 3.125416944629753, "step": 9370 }, { "distill_loss": 0.12523096799850464, "epoch": 3.125416944629753, "step": 9370 }, { "epoch": 3.125416944629753, "ref_ce_loss": 0.1000145673751831, "step": 9370 }, { "epoch": 3.1287525016677784, "loss": 0.4907, "step": 9380 }, { "epoch": 3.1287525016677784, "grad_norm": 2.5900824069976807, "step": 9380 }, { "epoch": 3.1287525016677784, "learning_rate": 0.00020859806718720792, "step": 9380 }, { "epoch": 3.1287525016677784, "loss": 0.6317574977874756, "step": 9380 }, { "ce_loss": 0.22041912376880646, "epoch": 3.1287525016677784, "step": 9380 }, { "distill_loss": 0.1646532118320465, "epoch": 3.1287525016677784, "step": 9380 }, { "epoch": 3.1287525016677784, "ref_ce_loss": 0.14836913347244263, "step": 9380 }, { "epoch": 3.1287525016677784, "loss": 0.3367335796356201, "step": 9380 }, { "ce_loss": 0.11072410643100739, "epoch": 3.1287525016677784, "step": 9380 }, { "distill_loss": 0.10584492981433868, "epoch": 3.1287525016677784, "step": 9380 }, { "epoch": 3.1287525016677784, "ref_ce_loss": 0.0841221809387207, "step": 9380 }, { "epoch": 3.1287525016677784, "loss": 0.5291550755500793, "step": 9380 }, { "ce_loss": 0.22186344861984253, "epoch": 3.1287525016677784, "step": 9380 }, { "distill_loss": 0.17727592587471008, "epoch": 3.1287525016677784, "step": 9380 }, { "epoch": 3.1287525016677784, "ref_ce_loss": 0.12976141273975372, "step": 9380 }, { "epoch": 3.1287525016677784, "loss": 0.5323965549468994, "step": 9380 }, { "ce_loss": 0.1890687644481659, "epoch": 3.1287525016677784, "step": 9380 }, { "distill_loss": 0.16845601797103882, "epoch": 3.1287525016677784, "step": 9380 }, { "epoch": 3.1287525016677784, "ref_ce_loss": 0.12848173081874847, "step": 9380 }, { "epoch": 3.1320880587058038, "loss": 0.4738, "step": 9390 }, { "epoch": 3.1320880587058038, "grad_norm": 2.4686079025268555, "step": 9390 }, { "epoch": 3.1320880587058038, "learning_rate": 0.00020841154838988364, "step": 9390 }, { "epoch": 3.1320880587058038, "loss": 0.6145331859588623, "step": 9390 }, { "ce_loss": 0.15522648394107819, "epoch": 3.1320880587058038, "step": 9390 }, { "distill_loss": 0.11161641031503677, "epoch": 3.1320880587058038, "step": 9390 }, { "epoch": 3.1320880587058038, "ref_ce_loss": 0.09471061080694199, "step": 9390 }, { "epoch": 3.1320880587058038, "loss": 0.35698485374450684, "step": 9390 }, { "ce_loss": 0.07764547318220139, "epoch": 3.1320880587058038, "step": 9390 }, { "distill_loss": 0.14094972610473633, "epoch": 3.1320880587058038, "step": 9390 }, { "epoch": 3.1320880587058038, "ref_ce_loss": 0.08621007204055786, "step": 9390 }, { "epoch": 3.1320880587058038, "loss": 0.289503812789917, "step": 9390 }, { "ce_loss": 0.09239063411951065, "epoch": 3.1320880587058038, "step": 9390 }, { "distill_loss": 0.08384876698255539, "epoch": 3.1320880587058038, "step": 9390 }, { "epoch": 3.1320880587058038, "ref_ce_loss": 0.1131184846162796, "step": 9390 }, { "epoch": 3.1320880587058038, "loss": 0.3908340334892273, "step": 9390 }, { "ce_loss": 0.13656210899353027, "epoch": 3.1320880587058038, "step": 9390 }, { "distill_loss": 0.11637710779905319, "epoch": 3.1320880587058038, "step": 9390 }, { "epoch": 3.1320880587058038, "ref_ce_loss": 0.10149884223937988, "step": 9390 }, { "epoch": 3.135423615743829, "loss": 0.5158, "step": 9400 }, { "epoch": 3.135423615743829, "grad_norm": 5.387189865112305, "step": 9400 }, { "epoch": 3.135423615743829, "learning_rate": 0.00020822492307293655, "step": 9400 }, { "epoch": 3.135423615743829, "loss": 0.44152170419692993, "step": 9400 }, { "ce_loss": 0.0922570526599884, "epoch": 3.135423615743829, "step": 9400 }, { "distill_loss": 0.0994657576084137, "epoch": 3.135423615743829, "step": 9400 }, { "epoch": 3.135423615743829, "ref_ce_loss": 0.11067865043878555, "step": 9400 }, { "epoch": 3.135423615743829, "loss": 0.5112382173538208, "step": 9400 }, { "ce_loss": 0.129240944981575, "epoch": 3.135423615743829, "step": 9400 }, { "distill_loss": 0.13666221499443054, "epoch": 3.135423615743829, "step": 9400 }, { "epoch": 3.135423615743829, "ref_ce_loss": 0.11640065908432007, "step": 9400 }, { "epoch": 3.135423615743829, "loss": 0.5008998513221741, "step": 9400 }, { "ce_loss": 0.17135149240493774, "epoch": 3.135423615743829, "step": 9400 }, { "distill_loss": 0.1431116908788681, "epoch": 3.135423615743829, "step": 9400 }, { "epoch": 3.135423615743829, "ref_ce_loss": 0.06502916663885117, "step": 9400 }, { "epoch": 3.135423615743829, "loss": 0.41493064165115356, "step": 9400 }, { "ce_loss": 0.11807221919298172, "epoch": 3.135423615743829, "step": 9400 }, { "distill_loss": 0.1267719715833664, "epoch": 3.135423615743829, "step": 9400 }, { "epoch": 3.135423615743829, "ref_ce_loss": 0.07536394149065018, "step": 9400 }, { "epoch": 3.1387591727818545, "loss": 0.4922, "step": 9410 }, { "epoch": 3.1387591727818545, "grad_norm": 2.6245408058166504, "step": 9410 }, { "epoch": 3.1387591727818545, "learning_rate": 0.00020803819157669766, "step": 9410 }, { "epoch": 3.1387591727818545, "loss": 0.36940526962280273, "step": 9410 }, { "ce_loss": 0.11200276762247086, "epoch": 3.1387591727818545, "step": 9410 }, { "distill_loss": 0.10597594082355499, "epoch": 3.1387591727818545, "step": 9410 }, { "epoch": 3.1387591727818545, "ref_ce_loss": 0.11852702498435974, "step": 9410 }, { "epoch": 3.1387591727818545, "loss": 0.3383985459804535, "step": 9410 }, { "ce_loss": 0.09196282923221588, "epoch": 3.1387591727818545, "step": 9410 }, { "distill_loss": 0.10407501459121704, "epoch": 3.1387591727818545, "step": 9410 }, { "epoch": 3.1387591727818545, "ref_ce_loss": 0.08363886177539825, "step": 9410 }, { "epoch": 3.1387591727818545, "loss": 0.6444665193557739, "step": 9410 }, { "ce_loss": 0.21152932941913605, "epoch": 3.1387591727818545, "step": 9410 }, { "distill_loss": 0.14611922204494476, "epoch": 3.1387591727818545, "step": 9410 }, { "epoch": 3.1387591727818545, "ref_ce_loss": 0.12397867441177368, "step": 9410 }, { "epoch": 3.1387591727818545, "loss": 0.43682152032852173, "step": 9410 }, { "ce_loss": 0.09575681388378143, "epoch": 3.1387591727818545, "step": 9410 }, { "distill_loss": 0.11555679142475128, "epoch": 3.1387591727818545, "step": 9410 }, { "epoch": 3.1387591727818545, "ref_ce_loss": 0.11327240616083145, "step": 9410 }, { "epoch": 3.14209472981988, "loss": 0.4972, "step": 9420 }, { "epoch": 3.14209472981988, "grad_norm": 3.332326650619507, "step": 9420 }, { "epoch": 3.14209472981988, "learning_rate": 0.00020785135424169156, "step": 9420 }, { "epoch": 3.14209472981988, "loss": 0.5172271132469177, "step": 9420 }, { "ce_loss": 0.18631412088871002, "epoch": 3.14209472981988, "step": 9420 }, { "distill_loss": 0.1371542513370514, "epoch": 3.14209472981988, "step": 9420 }, { "epoch": 3.14209472981988, "ref_ce_loss": 0.10870325565338135, "step": 9420 }, { "epoch": 3.14209472981988, "loss": 0.46155813336372375, "step": 9420 }, { "ce_loss": 0.1175021380186081, "epoch": 3.14209472981988, "step": 9420 }, { "distill_loss": 0.1451730579137802, "epoch": 3.14209472981988, "step": 9420 }, { "epoch": 3.14209472981988, "ref_ce_loss": 0.14567217230796814, "step": 9420 }, { "epoch": 3.14209472981988, "loss": 0.7952618598937988, "step": 9420 }, { "ce_loss": 0.2216344028711319, "epoch": 3.14209472981988, "step": 9420 }, { "distill_loss": 0.17677900195121765, "epoch": 3.14209472981988, "step": 9420 }, { "epoch": 3.14209472981988, "ref_ce_loss": 0.15591177344322205, "step": 9420 }, { "epoch": 3.14209472981988, "loss": 0.6818380355834961, "step": 9420 }, { "ce_loss": 0.21773341298103333, "epoch": 3.14209472981988, "step": 9420 }, { "distill_loss": 0.1644251048564911, "epoch": 3.14209472981988, "step": 9420 }, { "epoch": 3.14209472981988, "ref_ce_loss": 0.14234201610088348, "step": 9420 }, { "epoch": 3.145430286857905, "loss": 0.5497, "step": 9430 }, { "epoch": 3.145430286857905, "grad_norm": 2.401385545730591, "step": 9430 }, { "epoch": 3.145430286857905, "learning_rate": 0.00020766441140863577, "step": 9430 }, { "epoch": 3.145430286857905, "loss": 0.393453449010849, "step": 9430 }, { "ce_loss": 0.13155467808246613, "epoch": 3.145430286857905, "step": 9430 }, { "distill_loss": 0.10931574553251266, "epoch": 3.145430286857905, "step": 9430 }, { "epoch": 3.145430286857905, "ref_ce_loss": 0.09655128419399261, "step": 9430 }, { "epoch": 3.145430286857905, "loss": 0.5850750803947449, "step": 9430 }, { "ce_loss": 0.26182281970977783, "epoch": 3.145430286857905, "step": 9430 }, { "distill_loss": 0.11857051402330399, "epoch": 3.145430286857905, "step": 9430 }, { "epoch": 3.145430286857905, "ref_ce_loss": 0.14775000512599945, "step": 9430 }, { "epoch": 3.145430286857905, "loss": 0.3602902293205261, "step": 9430 }, { "ce_loss": 0.12058570981025696, "epoch": 3.145430286857905, "step": 9430 }, { "distill_loss": 0.10224181413650513, "epoch": 3.145430286857905, "step": 9430 }, { "epoch": 3.145430286857905, "ref_ce_loss": 0.10732877999544144, "step": 9430 }, { "epoch": 3.145430286857905, "loss": 0.34497690200805664, "step": 9430 }, { "ce_loss": 0.08291725069284439, "epoch": 3.145430286857905, "step": 9430 }, { "distill_loss": 0.11977192759513855, "epoch": 3.145430286857905, "step": 9430 }, { "epoch": 3.145430286857905, "ref_ce_loss": 0.10097347944974899, "step": 9430 }, { "epoch": 3.1487658438959305, "loss": 0.4861, "step": 9440 }, { "epoch": 3.1487658438959305, "grad_norm": 3.3197946548461914, "step": 9440 }, { "epoch": 3.1487658438959305, "learning_rate": 0.00020747736341844038, "step": 9440 }, { "epoch": 3.1487658438959305, "loss": 0.6189022660255432, "step": 9440 }, { "ce_loss": 0.26251354813575745, "epoch": 3.1487658438959305, "step": 9440 }, { "distill_loss": 0.22185054421424866, "epoch": 3.1487658438959305, "step": 9440 }, { "epoch": 3.1487658438959305, "ref_ce_loss": 0.09539277106523514, "step": 9440 }, { "epoch": 3.1487658438959305, "loss": 0.388549268245697, "step": 9440 }, { "ce_loss": 0.09720093756914139, "epoch": 3.1487658438959305, "step": 9440 }, { "distill_loss": 0.10628783702850342, "epoch": 3.1487658438959305, "step": 9440 }, { "epoch": 3.1487658438959305, "ref_ce_loss": 0.05633091554045677, "step": 9440 }, { "epoch": 3.1487658438959305, "loss": 0.26291322708129883, "step": 9440 }, { "ce_loss": 0.0834595263004303, "epoch": 3.1487658438959305, "step": 9440 }, { "distill_loss": 0.10990596562623978, "epoch": 3.1487658438959305, "step": 9440 }, { "epoch": 3.1487658438959305, "ref_ce_loss": 0.06892497092485428, "step": 9440 }, { "epoch": 3.1487658438959305, "loss": 0.2386210411787033, "step": 9440 }, { "ce_loss": 0.044991619884967804, "epoch": 3.1487658438959305, "step": 9440 }, { "distill_loss": 0.08849681168794632, "epoch": 3.1487658438959305, "step": 9440 }, { "epoch": 3.1487658438959305, "ref_ce_loss": 0.10291331261396408, "step": 9440 }, { "epoch": 3.152101400933956, "loss": 0.5312, "step": 9450 }, { "epoch": 3.152101400933956, "grad_norm": 2.457263231277466, "step": 9450 }, { "epoch": 3.152101400933956, "learning_rate": 0.000207290210612207, "step": 9450 }, { "epoch": 3.152101400933956, "loss": 0.5044496655464172, "step": 9450 }, { "ce_loss": 0.19127508997917175, "epoch": 3.152101400933956, "step": 9450 }, { "distill_loss": 0.14769670367240906, "epoch": 3.152101400933956, "step": 9450 }, { "epoch": 3.152101400933956, "ref_ce_loss": 0.11158882081508636, "step": 9450 }, { "epoch": 3.152101400933956, "loss": 0.8176451921463013, "step": 9450 }, { "ce_loss": 0.18827858567237854, "epoch": 3.152101400933956, "step": 9450 }, { "distill_loss": 0.16407611966133118, "epoch": 3.152101400933956, "step": 9450 }, { "epoch": 3.152101400933956, "ref_ce_loss": 0.14914929866790771, "step": 9450 }, { "epoch": 3.152101400933956, "loss": 0.43922194838523865, "step": 9450 }, { "ce_loss": 0.1384783834218979, "epoch": 3.152101400933956, "step": 9450 }, { "distill_loss": 0.15343838930130005, "epoch": 3.152101400933956, "step": 9450 }, { "epoch": 3.152101400933956, "ref_ce_loss": 0.11001858860254288, "step": 9450 }, { "epoch": 3.152101400933956, "loss": 0.3858475685119629, "step": 9450 }, { "ce_loss": 0.13132528960704803, "epoch": 3.152101400933956, "step": 9450 }, { "distill_loss": 0.13699130713939667, "epoch": 3.152101400933956, "step": 9450 }, { "epoch": 3.152101400933956, "ref_ce_loss": 0.08675961196422577, "step": 9450 }, { "epoch": 3.155436957971981, "loss": 0.5252, "step": 9460 }, { "epoch": 3.155436957971981, "grad_norm": 3.537496566772461, "step": 9460 }, { "epoch": 3.155436957971981, "learning_rate": 0.00020710295333122868, "step": 9460 }, { "epoch": 3.155436957971981, "loss": 0.42468106746673584, "step": 9460 }, { "ce_loss": 0.13657568395137787, "epoch": 3.155436957971981, "step": 9460 }, { "distill_loss": 0.1591964215040207, "epoch": 3.155436957971981, "step": 9460 }, { "epoch": 3.155436957971981, "ref_ce_loss": 0.09413352608680725, "step": 9460 }, { "epoch": 3.155436957971981, "loss": 0.48567768931388855, "step": 9460 }, { "ce_loss": 0.10176261514425278, "epoch": 3.155436957971981, "step": 9460 }, { "distill_loss": 0.14666540920734406, "epoch": 3.155436957971981, "step": 9460 }, { "epoch": 3.155436957971981, "ref_ce_loss": 0.12171636521816254, "step": 9460 }, { "epoch": 3.155436957971981, "loss": 0.6425243020057678, "step": 9460 }, { "ce_loss": 0.1498386412858963, "epoch": 3.155436957971981, "step": 9460 }, { "distill_loss": 0.2042589783668518, "epoch": 3.155436957971981, "step": 9460 }, { "epoch": 3.155436957971981, "ref_ce_loss": 0.1274162232875824, "step": 9460 }, { "epoch": 3.155436957971981, "loss": 0.7263787984848022, "step": 9460 }, { "ce_loss": 0.13975995779037476, "epoch": 3.155436957971981, "step": 9460 }, { "distill_loss": 0.13692700862884521, "epoch": 3.155436957971981, "step": 9460 }, { "epoch": 3.155436957971981, "ref_ce_loss": 0.1329973340034485, "step": 9460 }, { "epoch": 3.1587725150100066, "loss": 0.5612, "step": 9470 }, { "epoch": 3.1587725150100066, "grad_norm": 5.573383331298828, "step": 9470 }, { "epoch": 3.1587725150100066, "learning_rate": 0.00020691559191698876, "step": 9470 }, { "epoch": 3.1587725150100066, "loss": 0.3879528343677521, "step": 9470 }, { "ce_loss": 0.07907040417194366, "epoch": 3.1587725150100066, "step": 9470 }, { "distill_loss": 0.1571669578552246, "epoch": 3.1587725150100066, "step": 9470 }, { "epoch": 3.1587725150100066, "ref_ce_loss": 0.09325411170721054, "step": 9470 }, { "epoch": 3.1587725150100066, "loss": 0.6910441517829895, "step": 9470 }, { "ce_loss": 0.2567923069000244, "epoch": 3.1587725150100066, "step": 9470 }, { "distill_loss": 0.19595922529697418, "epoch": 3.1587725150100066, "step": 9470 }, { "epoch": 3.1587725150100066, "ref_ce_loss": 0.19764713943004608, "step": 9470 }, { "epoch": 3.1587725150100066, "loss": 0.8553934693336487, "step": 9470 }, { "ce_loss": 0.1372901350259781, "epoch": 3.1587725150100066, "step": 9470 }, { "distill_loss": 0.1819874346256256, "epoch": 3.1587725150100066, "step": 9470 }, { "epoch": 3.1587725150100066, "ref_ce_loss": 0.1226576417684555, "step": 9470 }, { "epoch": 3.1587725150100066, "loss": 0.8925281763076782, "step": 9470 }, { "ce_loss": 0.16941511631011963, "epoch": 3.1587725150100066, "step": 9470 }, { "distill_loss": 0.2515280246734619, "epoch": 3.1587725150100066, "step": 9470 }, { "epoch": 3.1587725150100066, "ref_ce_loss": 0.14782162010669708, "step": 9470 }, { "epoch": 3.162108072048032, "loss": 0.5487, "step": 9480 }, { "epoch": 3.162108072048032, "grad_norm": 4.598330497741699, "step": 9480 }, { "epoch": 3.162108072048032, "learning_rate": 0.00020672812671116052, "step": 9480 }, { "epoch": 3.162108072048032, "loss": 0.3818383812904358, "step": 9480 }, { "ce_loss": 0.08812081813812256, "epoch": 3.162108072048032, "step": 9480 }, { "distill_loss": 0.1844634860754013, "epoch": 3.162108072048032, "step": 9480 }, { "epoch": 3.162108072048032, "ref_ce_loss": 0.10911556333303452, "step": 9480 }, { "epoch": 3.162108072048032, "loss": 0.4501780569553375, "step": 9480 }, { "ce_loss": 0.11660466343164444, "epoch": 3.162108072048032, "step": 9480 }, { "distill_loss": 0.15039286017417908, "epoch": 3.162108072048032, "step": 9480 }, { "epoch": 3.162108072048032, "ref_ce_loss": 0.10421738773584366, "step": 9480 }, { "epoch": 3.162108072048032, "loss": 0.5638201832771301, "step": 9480 }, { "ce_loss": 0.1827458292245865, "epoch": 3.162108072048032, "step": 9480 }, { "distill_loss": 0.19522276520729065, "epoch": 3.162108072048032, "step": 9480 }, { "epoch": 3.162108072048032, "ref_ce_loss": 0.11500189453363419, "step": 9480 }, { "epoch": 3.162108072048032, "loss": 0.48389768600463867, "step": 9480 }, { "ce_loss": 0.1433190554380417, "epoch": 3.162108072048032, "step": 9480 }, { "distill_loss": 0.21797630190849304, "epoch": 3.162108072048032, "step": 9480 }, { "epoch": 3.162108072048032, "ref_ce_loss": 0.12215530872344971, "step": 9480 }, { "epoch": 3.1654436290860573, "loss": 0.6044, "step": 9490 }, { "epoch": 3.1654436290860573, "grad_norm": 3.7871646881103516, "step": 9490 }, { "epoch": 3.1654436290860573, "learning_rate": 0.00020654055805560662, "step": 9490 }, { "epoch": 3.1654436290860573, "loss": 0.5622644424438477, "step": 9490 }, { "ce_loss": 0.15414637327194214, "epoch": 3.1654436290860573, "step": 9490 }, { "distill_loss": 0.2756098508834839, "epoch": 3.1654436290860573, "step": 9490 }, { "epoch": 3.1654436290860573, "ref_ce_loss": 0.10112477093935013, "step": 9490 }, { "epoch": 3.1654436290860573, "loss": 0.37916234135627747, "step": 9490 }, { "ce_loss": 0.07440236955881119, "epoch": 3.1654436290860573, "step": 9490 }, { "distill_loss": 0.17885206639766693, "epoch": 3.1654436290860573, "step": 9490 }, { "epoch": 3.1654436290860573, "ref_ce_loss": 0.06276611983776093, "step": 9490 }, { "epoch": 3.1654436290860573, "loss": 1.0593150854110718, "step": 9490 }, { "ce_loss": 0.1475900262594223, "epoch": 3.1654436290860573, "step": 9490 }, { "distill_loss": 0.297240287065506, "epoch": 3.1654436290860573, "step": 9490 }, { "epoch": 3.1654436290860573, "ref_ce_loss": 0.1502453237771988, "step": 9490 }, { "epoch": 3.1654436290860573, "loss": 0.5550252199172974, "step": 9490 }, { "ce_loss": 0.14874206483364105, "epoch": 3.1654436290860573, "step": 9490 }, { "distill_loss": 0.2950950860977173, "epoch": 3.1654436290860573, "step": 9490 }, { "epoch": 3.1654436290860573, "ref_ce_loss": 0.08617687225341797, "step": 9490 }, { "epoch": 3.1687791861240826, "loss": 0.666, "step": 9500 }, { "epoch": 3.1687791861240826, "grad_norm": 6.840047836303711, "step": 9500 }, { "epoch": 3.1687791861240826, "learning_rate": 0.0002063528862923782, "step": 9500 }, { "epoch": 3.1687791861240826, "loss": 0.6700571775436401, "step": 9500 }, { "ce_loss": 0.2572788894176483, "epoch": 3.1687791861240826, "step": 9500 }, { "distill_loss": 0.2567271888256073, "epoch": 3.1687791861240826, "step": 9500 }, { "epoch": 3.1687791861240826, "ref_ce_loss": 0.15579195320606232, "step": 9500 }, { "epoch": 3.1687791861240826, "loss": 0.9040383100509644, "step": 9500 }, { "ce_loss": 0.20419129729270935, "epoch": 3.1687791861240826, "step": 9500 }, { "distill_loss": 0.21273010969161987, "epoch": 3.1687791861240826, "step": 9500 }, { "epoch": 3.1687791861240826, "ref_ce_loss": 0.11534770578145981, "step": 9500 }, { "epoch": 3.1687791861240826, "loss": 0.5888009667396545, "step": 9500 }, { "ce_loss": 0.10383474081754684, "epoch": 3.1687791861240826, "step": 9500 }, { "distill_loss": 0.2927468419075012, "epoch": 3.1687791861240826, "step": 9500 }, { "epoch": 3.1687791861240826, "ref_ce_loss": 0.10847216844558716, "step": 9500 }, { "epoch": 3.1687791861240826, "loss": 0.44755592942237854, "step": 9500 }, { "ce_loss": 0.14601008594036102, "epoch": 3.1687791861240826, "step": 9500 }, { "distill_loss": 0.15512534976005554, "epoch": 3.1687791861240826, "step": 9500 }, { "epoch": 3.1687791861240826, "ref_ce_loss": 0.07985249161720276, "step": 9500 }, { "epoch": 3.172114743162108, "loss": 0.6205, "step": 9510 }, { "epoch": 3.172114743162108, "grad_norm": 2.793665885925293, "step": 9510 }, { "epoch": 3.172114743162108, "learning_rate": 0.00020616511176371465, "step": 9510 }, { "epoch": 3.172114743162108, "loss": 0.45676806569099426, "step": 9510 }, { "ce_loss": 0.14598795771598816, "epoch": 3.172114743162108, "step": 9510 }, { "distill_loss": 0.16061095893383026, "epoch": 3.172114743162108, "step": 9510 }, { "epoch": 3.172114743162108, "ref_ce_loss": 0.09271222352981567, "step": 9510 }, { "epoch": 3.172114743162108, "loss": 0.5553157925605774, "step": 9510 }, { "ce_loss": 0.24441476166248322, "epoch": 3.172114743162108, "step": 9510 }, { "distill_loss": 0.1846240609884262, "epoch": 3.172114743162108, "step": 9510 }, { "epoch": 3.172114743162108, "ref_ce_loss": 0.12624230980873108, "step": 9510 }, { "epoch": 3.172114743162108, "loss": 0.4163009524345398, "step": 9510 }, { "ce_loss": 0.11610870063304901, "epoch": 3.172114743162108, "step": 9510 }, { "distill_loss": 0.16040027141571045, "epoch": 3.172114743162108, "step": 9510 }, { "epoch": 3.172114743162108, "ref_ce_loss": 0.10182558000087738, "step": 9510 }, { "epoch": 3.172114743162108, "loss": 0.3117366135120392, "step": 9510 }, { "ce_loss": 0.07078323513269424, "epoch": 3.172114743162108, "step": 9510 }, { "distill_loss": 0.13040763139724731, "epoch": 3.172114743162108, "step": 9510 }, { "epoch": 3.172114743162108, "ref_ce_loss": 0.11036781966686249, "step": 9510 }, { "epoch": 3.1754503002001333, "loss": 0.5296, "step": 9520 }, { "epoch": 3.1754503002001333, "grad_norm": 4.296565532684326, "step": 9520 }, { "epoch": 3.1754503002001333, "learning_rate": 0.00020597723481204251, "step": 9520 }, { "epoch": 3.1754503002001333, "loss": 0.3121442496776581, "step": 9520 }, { "ce_loss": 0.07688009738922119, "epoch": 3.1754503002001333, "step": 9520 }, { "distill_loss": 0.12842987477779388, "epoch": 3.1754503002001333, "step": 9520 }, { "epoch": 3.1754503002001333, "ref_ce_loss": 0.10675547271966934, "step": 9520 }, { "epoch": 3.1754503002001333, "loss": 0.4808747172355652, "step": 9520 }, { "ce_loss": 0.07221264392137527, "epoch": 3.1754503002001333, "step": 9520 }, { "distill_loss": 0.15099433064460754, "epoch": 3.1754503002001333, "step": 9520 }, { "epoch": 3.1754503002001333, "ref_ce_loss": 0.08741383999586105, "step": 9520 }, { "epoch": 3.1754503002001333, "loss": 0.28258243203163147, "step": 9520 }, { "ce_loss": 0.07468129694461823, "epoch": 3.1754503002001333, "step": 9520 }, { "distill_loss": 0.1378626972436905, "epoch": 3.1754503002001333, "step": 9520 }, { "epoch": 3.1754503002001333, "ref_ce_loss": 0.06976884603500366, "step": 9520 }, { "epoch": 3.1754503002001333, "loss": 0.469711035490036, "step": 9520 }, { "ce_loss": 0.11323860287666321, "epoch": 3.1754503002001333, "step": 9520 }, { "distill_loss": 0.20424960553646088, "epoch": 3.1754503002001333, "step": 9520 }, { "epoch": 3.1754503002001333, "ref_ce_loss": 0.06783854216337204, "step": 9520 }, { "epoch": 3.1787858572381587, "loss": 0.5476, "step": 9530 }, { "epoch": 3.1787858572381587, "grad_norm": 1.974441647529602, "step": 9530 }, { "epoch": 3.1787858572381587, "learning_rate": 0.0002057892557799753, "step": 9530 }, { "epoch": 3.1787858572381587, "loss": 0.41514265537261963, "step": 9530 }, { "ce_loss": 0.12711583077907562, "epoch": 3.1787858572381587, "step": 9530 }, { "distill_loss": 0.17615363001823425, "epoch": 3.1787858572381587, "step": 9530 }, { "epoch": 3.1787858572381587, "ref_ce_loss": 0.07456578314304352, "step": 9530 }, { "epoch": 3.1787858572381587, "loss": 0.6223483085632324, "step": 9530 }, { "ce_loss": 0.16231295466423035, "epoch": 3.1787858572381587, "step": 9530 }, { "distill_loss": 0.1323533058166504, "epoch": 3.1787858572381587, "step": 9530 }, { "epoch": 3.1787858572381587, "ref_ce_loss": 0.11479992419481277, "step": 9530 }, { "epoch": 3.1787858572381587, "loss": 0.4665870666503906, "step": 9530 }, { "ce_loss": 0.12044032663106918, "epoch": 3.1787858572381587, "step": 9530 }, { "distill_loss": 0.20676037669181824, "epoch": 3.1787858572381587, "step": 9530 }, { "epoch": 3.1787858572381587, "ref_ce_loss": 0.09432920068502426, "step": 9530 }, { "epoch": 3.1787858572381587, "loss": 0.41781315207481384, "step": 9530 }, { "ce_loss": 0.15532222390174866, "epoch": 3.1787858572381587, "step": 9530 }, { "distill_loss": 0.1355230212211609, "epoch": 3.1787858572381587, "step": 9530 }, { "epoch": 3.1787858572381587, "ref_ce_loss": 0.1266229748725891, "step": 9530 }, { "epoch": 3.182121414276184, "loss": 0.5524, "step": 9540 }, { "epoch": 3.182121414276184, "grad_norm": 2.466517448425293, "step": 9540 }, { "epoch": 3.182121414276184, "learning_rate": 0.00020560117501031264, "step": 9540 }, { "epoch": 3.182121414276184, "loss": 0.3828505873680115, "step": 9540 }, { "ce_loss": 0.10816628485918045, "epoch": 3.182121414276184, "step": 9540 }, { "distill_loss": 0.09501266479492188, "epoch": 3.182121414276184, "step": 9540 }, { "epoch": 3.182121414276184, "ref_ce_loss": 0.14963635802268982, "step": 9540 }, { "epoch": 3.182121414276184, "loss": 0.6822697520256042, "step": 9540 }, { "ce_loss": 0.0815955400466919, "epoch": 3.182121414276184, "step": 9540 }, { "distill_loss": 0.17200466990470886, "epoch": 3.182121414276184, "step": 9540 }, { "epoch": 3.182121414276184, "ref_ce_loss": 0.12888874113559723, "step": 9540 }, { "epoch": 3.182121414276184, "loss": 0.428188681602478, "step": 9540 }, { "ce_loss": 0.12251435965299606, "epoch": 3.182121414276184, "step": 9540 }, { "distill_loss": 0.1265813708305359, "epoch": 3.182121414276184, "step": 9540 }, { "epoch": 3.182121414276184, "ref_ce_loss": 0.08265712857246399, "step": 9540 }, { "epoch": 3.182121414276184, "loss": 0.4096594452857971, "step": 9540 }, { "ce_loss": 0.09690508246421814, "epoch": 3.182121414276184, "step": 9540 }, { "distill_loss": 0.10855381190776825, "epoch": 3.182121414276184, "step": 9540 }, { "epoch": 3.182121414276184, "ref_ce_loss": 0.07601267099380493, "step": 9540 }, { "epoch": 3.1854569713142094, "loss": 0.5364, "step": 9550 }, { "epoch": 3.1854569713142094, "grad_norm": 2.382157325744629, "step": 9550 }, { "epoch": 3.1854569713142094, "learning_rate": 0.0002054129928460396, "step": 9550 }, { "epoch": 3.1854569713142094, "loss": 0.38213956356048584, "step": 9550 }, { "ce_loss": 0.09391985088586807, "epoch": 3.1854569713142094, "step": 9550 }, { "distill_loss": 0.12190929055213928, "epoch": 3.1854569713142094, "step": 9550 }, { "epoch": 3.1854569713142094, "ref_ce_loss": 0.09163673967123032, "step": 9550 }, { "epoch": 3.1854569713142094, "loss": 0.4600287675857544, "step": 9550 }, { "ce_loss": 0.12160974740982056, "epoch": 3.1854569713142094, "step": 9550 }, { "distill_loss": 0.13835075497627258, "epoch": 3.1854569713142094, "step": 9550 }, { "epoch": 3.1854569713142094, "ref_ce_loss": 0.09320228546857834, "step": 9550 }, { "epoch": 3.1854569713142094, "loss": 0.5476783514022827, "step": 9550 }, { "ce_loss": 0.24290725588798523, "epoch": 3.1854569713142094, "step": 9550 }, { "distill_loss": 0.15814319252967834, "epoch": 3.1854569713142094, "step": 9550 }, { "epoch": 3.1854569713142094, "ref_ce_loss": 0.12850640714168549, "step": 9550 }, { "epoch": 3.1854569713142094, "loss": 0.47217875719070435, "step": 9550 }, { "ce_loss": 0.09855984896421432, "epoch": 3.1854569713142094, "step": 9550 }, { "distill_loss": 0.13362620770931244, "epoch": 3.1854569713142094, "step": 9550 }, { "epoch": 3.1854569713142094, "ref_ce_loss": 0.07858073711395264, "step": 9550 }, { "epoch": 3.1887925283522347, "loss": 0.5146, "step": 9560 }, { "epoch": 3.1887925283522347, "grad_norm": 3.2701833248138428, "step": 9560 }, { "epoch": 3.1887925283522347, "learning_rate": 0.0002052247096303263, "step": 9560 }, { "epoch": 3.1887925283522347, "loss": 0.2855941355228424, "step": 9560 }, { "ce_loss": 0.06373196840286255, "epoch": 3.1887925283522347, "step": 9560 }, { "distill_loss": 0.13350118696689606, "epoch": 3.1887925283522347, "step": 9560 }, { "epoch": 3.1887925283522347, "ref_ce_loss": 0.06919163465499878, "step": 9560 }, { "epoch": 3.1887925283522347, "loss": 0.48115861415863037, "step": 9560 }, { "ce_loss": 0.11704720556735992, "epoch": 3.1887925283522347, "step": 9560 }, { "distill_loss": 0.1717700958251953, "epoch": 3.1887925283522347, "step": 9560 }, { "epoch": 3.1887925283522347, "ref_ce_loss": 0.11801206320524216, "step": 9560 }, { "epoch": 3.1887925283522347, "loss": 0.4057348072528839, "step": 9560 }, { "ce_loss": 0.10528801381587982, "epoch": 3.1887925283522347, "step": 9560 }, { "distill_loss": 0.18464790284633636, "epoch": 3.1887925283522347, "step": 9560 }, { "epoch": 3.1887925283522347, "ref_ce_loss": 0.11558081954717636, "step": 9560 }, { "epoch": 3.1887925283522347, "loss": 0.5976885557174683, "step": 9560 }, { "ce_loss": 0.1965349018573761, "epoch": 3.1887925283522347, "step": 9560 }, { "distill_loss": 0.11449624598026276, "epoch": 3.1887925283522347, "step": 9560 }, { "epoch": 3.1887925283522347, "ref_ce_loss": 0.18658675253391266, "step": 9560 }, { "epoch": 3.19212808539026, "loss": 0.5078, "step": 9570 }, { "epoch": 3.19212808539026, "grad_norm": 2.857909679412842, "step": 9570 }, { "epoch": 3.19212808539026, "learning_rate": 0.00020503632570652693, "step": 9570 }, { "epoch": 3.19212808539026, "loss": 0.319354385137558, "step": 9570 }, { "ce_loss": 0.09820310026407242, "epoch": 3.19212808539026, "step": 9570 }, { "distill_loss": 0.13627631962299347, "epoch": 3.19212808539026, "step": 9570 }, { "epoch": 3.19212808539026, "ref_ce_loss": 0.08482971042394638, "step": 9570 }, { "epoch": 3.19212808539026, "loss": 0.5038487315177917, "step": 9570 }, { "ce_loss": 0.13817772269248962, "epoch": 3.19212808539026, "step": 9570 }, { "distill_loss": 0.1949910670518875, "epoch": 3.19212808539026, "step": 9570 }, { "epoch": 3.19212808539026, "ref_ce_loss": 0.11488980799913406, "step": 9570 }, { "epoch": 3.19212808539026, "loss": 0.6758288145065308, "step": 9570 }, { "ce_loss": 0.1828175038099289, "epoch": 3.19212808539026, "step": 9570 }, { "distill_loss": 0.19590409100055695, "epoch": 3.19212808539026, "step": 9570 }, { "epoch": 3.19212808539026, "ref_ce_loss": 0.11350443959236145, "step": 9570 }, { "epoch": 3.19212808539026, "loss": 0.5674213171005249, "step": 9570 }, { "ce_loss": 0.08833687007427216, "epoch": 3.19212808539026, "step": 9570 }, { "distill_loss": 0.13266253471374512, "epoch": 3.19212808539026, "step": 9570 }, { "epoch": 3.19212808539026, "ref_ce_loss": 0.07736072689294815, "step": 9570 }, { "epoch": 3.1954636424282854, "loss": 0.516, "step": 9580 }, { "epoch": 3.1954636424282854, "grad_norm": 5.583287239074707, "step": 9580 }, { "epoch": 3.1954636424282854, "learning_rate": 0.00020484784141817957, "step": 9580 }, { "epoch": 3.1954636424282854, "loss": 0.8101859092712402, "step": 9580 }, { "ce_loss": 0.18054203689098358, "epoch": 3.1954636424282854, "step": 9580 }, { "distill_loss": 0.11705302447080612, "epoch": 3.1954636424282854, "step": 9580 }, { "epoch": 3.1954636424282854, "ref_ce_loss": 0.10733237117528915, "step": 9580 }, { "epoch": 3.1954636424282854, "loss": 0.3281974792480469, "step": 9580 }, { "ce_loss": 0.13033422827720642, "epoch": 3.1954636424282854, "step": 9580 }, { "distill_loss": 0.1337510496377945, "epoch": 3.1954636424282854, "step": 9580 }, { "epoch": 3.1954636424282854, "ref_ce_loss": 0.06400664150714874, "step": 9580 }, { "epoch": 3.1954636424282854, "loss": 0.5542563199996948, "step": 9580 }, { "ce_loss": 0.1291143149137497, "epoch": 3.1954636424282854, "step": 9580 }, { "distill_loss": 0.18237747251987457, "epoch": 3.1954636424282854, "step": 9580 }, { "epoch": 3.1954636424282854, "ref_ce_loss": 0.09707945585250854, "step": 9580 }, { "epoch": 3.1954636424282854, "loss": 0.46752509474754333, "step": 9580 }, { "ce_loss": 0.16086234152317047, "epoch": 3.1954636424282854, "step": 9580 }, { "distill_loss": 0.12830647826194763, "epoch": 3.1954636424282854, "step": 9580 }, { "epoch": 3.1954636424282854, "ref_ce_loss": 0.11367620527744293, "step": 9580 }, { "epoch": 3.1987991994663107, "loss": 0.4776, "step": 9590 }, { "epoch": 3.1987991994663107, "grad_norm": 2.494795322418213, "step": 9590 }, { "epoch": 3.1987991994663107, "learning_rate": 0.00020465925710900517, "step": 9590 }, { "epoch": 3.1987991994663107, "loss": 0.4189956784248352, "step": 9590 }, { "ce_loss": 0.1456405073404312, "epoch": 3.1987991994663107, "step": 9590 }, { "distill_loss": 0.129704087972641, "epoch": 3.1987991994663107, "step": 9590 }, { "epoch": 3.1987991994663107, "ref_ce_loss": 0.09875006228685379, "step": 9590 }, { "epoch": 3.1987991994663107, "loss": 0.6570210456848145, "step": 9590 }, { "ce_loss": 0.2106141895055771, "epoch": 3.1987991994663107, "step": 9590 }, { "distill_loss": 0.13541418313980103, "epoch": 3.1987991994663107, "step": 9590 }, { "epoch": 3.1987991994663107, "ref_ce_loss": 0.09310900419950485, "step": 9590 }, { "epoch": 3.1987991994663107, "loss": 0.5128756761550903, "step": 9590 }, { "ce_loss": 0.05535023659467697, "epoch": 3.1987991994663107, "step": 9590 }, { "distill_loss": 0.10842591524124146, "epoch": 3.1987991994663107, "step": 9590 }, { "epoch": 3.1987991994663107, "ref_ce_loss": 0.07918021082878113, "step": 9590 }, { "epoch": 3.1987991994663107, "loss": 0.435626745223999, "step": 9590 }, { "ce_loss": 0.13782809674739838, "epoch": 3.1987991994663107, "step": 9590 }, { "distill_loss": 0.15453578531742096, "epoch": 3.1987991994663107, "step": 9590 }, { "epoch": 3.1987991994663107, "ref_ce_loss": 0.10330259054899216, "step": 9590 }, { "epoch": 3.202134756504336, "loss": 0.5214, "step": 9600 }, { "epoch": 3.202134756504336, "grad_norm": 3.278040647506714, "step": 9600 }, { "epoch": 3.202134756504336, "learning_rate": 0.00020447057312290715, "step": 9600 }, { "epoch": 3.202134756504336, "loss": 0.417108952999115, "step": 9600 }, { "ce_loss": 0.11677819490432739, "epoch": 3.202134756504336, "step": 9600 }, { "distill_loss": 0.12888827919960022, "epoch": 3.202134756504336, "step": 9600 }, { "epoch": 3.202134756504336, "ref_ce_loss": 0.12421637773513794, "step": 9600 }, { "epoch": 3.202134756504336, "loss": 0.7868222594261169, "step": 9600 }, { "ce_loss": 0.1702321469783783, "epoch": 3.202134756504336, "step": 9600 }, { "distill_loss": 0.15147097408771515, "epoch": 3.202134756504336, "step": 9600 }, { "epoch": 3.202134756504336, "ref_ce_loss": 0.1157556101679802, "step": 9600 }, { "epoch": 3.202134756504336, "loss": 0.3623127043247223, "step": 9600 }, { "ce_loss": 0.06962898373603821, "epoch": 3.202134756504336, "step": 9600 }, { "distill_loss": 0.13172480463981628, "epoch": 3.202134756504336, "step": 9600 }, { "epoch": 3.202134756504336, "ref_ce_loss": 0.11244726926088333, "step": 9600 }, { "epoch": 3.202134756504336, "loss": 0.579292893409729, "step": 9600 }, { "ce_loss": 0.2325315922498703, "epoch": 3.202134756504336, "step": 9600 }, { "distill_loss": 0.21294154226779938, "epoch": 3.202134756504336, "step": 9600 }, { "epoch": 3.202134756504336, "ref_ce_loss": 0.08337957412004471, "step": 9600 }, { "epoch": 3.2054703135423614, "loss": 0.4992, "step": 9610 }, { "epoch": 3.2054703135423614, "grad_norm": 3.2729134559631348, "step": 9610 }, { "epoch": 3.2054703135423614, "learning_rate": 0.00020428178980397063, "step": 9610 }, { "epoch": 3.2054703135423614, "loss": 0.5233567357063293, "step": 9610 }, { "ce_loss": 0.14610396325588226, "epoch": 3.2054703135423614, "step": 9610 }, { "distill_loss": 0.20599067211151123, "epoch": 3.2054703135423614, "step": 9610 }, { "epoch": 3.2054703135423614, "ref_ce_loss": 0.07143604010343552, "step": 9610 }, { "epoch": 3.2054703135423614, "loss": 0.3326717019081116, "step": 9610 }, { "ce_loss": 0.09717848896980286, "epoch": 3.2054703135423614, "step": 9610 }, { "distill_loss": 0.12722012400627136, "epoch": 3.2054703135423614, "step": 9610 }, { "epoch": 3.2054703135423614, "ref_ce_loss": 0.10740362107753754, "step": 9610 }, { "epoch": 3.2054703135423614, "loss": 0.5446369051933289, "step": 9610 }, { "ce_loss": 0.21039313077926636, "epoch": 3.2054703135423614, "step": 9610 }, { "distill_loss": 0.13953128457069397, "epoch": 3.2054703135423614, "step": 9610 }, { "epoch": 3.2054703135423614, "ref_ce_loss": 0.10978644341230392, "step": 9610 }, { "epoch": 3.2054703135423614, "loss": 0.48363035917282104, "step": 9610 }, { "ce_loss": 0.13355810940265656, "epoch": 3.2054703135423614, "step": 9610 }, { "distill_loss": 0.1513618379831314, "epoch": 3.2054703135423614, "step": 9610 }, { "epoch": 3.2054703135423614, "ref_ce_loss": 0.1118159294128418, "step": 9610 }, { "epoch": 3.208805870580387, "loss": 0.5102, "step": 9620 }, { "epoch": 3.208805870580387, "grad_norm": 2.630423069000244, "step": 9620 }, { "epoch": 3.208805870580387, "learning_rate": 0.00020409290749646189, "step": 9620 }, { "epoch": 3.208805870580387, "loss": 0.4511738717556, "step": 9620 }, { "ce_loss": 0.12089888751506805, "epoch": 3.208805870580387, "step": 9620 }, { "distill_loss": 0.12530340254306793, "epoch": 3.208805870580387, "step": 9620 }, { "epoch": 3.208805870580387, "ref_ce_loss": 0.176653191447258, "step": 9620 }, { "epoch": 3.208805870580387, "loss": 0.45738130807876587, "step": 9620 }, { "ce_loss": 0.14809496700763702, "epoch": 3.208805870580387, "step": 9620 }, { "distill_loss": 0.10620304197072983, "epoch": 3.208805870580387, "step": 9620 }, { "epoch": 3.208805870580387, "ref_ce_loss": 0.06848637014627457, "step": 9620 }, { "epoch": 3.208805870580387, "loss": 0.5911251306533813, "step": 9620 }, { "ce_loss": 0.07842303812503815, "epoch": 3.208805870580387, "step": 9620 }, { "distill_loss": 0.13734924793243408, "epoch": 3.208805870580387, "step": 9620 }, { "epoch": 3.208805870580387, "ref_ce_loss": 0.14106030762195587, "step": 9620 }, { "epoch": 3.208805870580387, "loss": 0.44962990283966064, "step": 9620 }, { "ce_loss": 0.06224361062049866, "epoch": 3.208805870580387, "step": 9620 }, { "distill_loss": 0.12899431586265564, "epoch": 3.208805870580387, "step": 9620 }, { "epoch": 3.208805870580387, "ref_ce_loss": 0.0823230966925621, "step": 9620 }, { "epoch": 3.212141427618412, "loss": 0.4656, "step": 9630 }, { "epoch": 3.212141427618412, "grad_norm": 2.494377374649048, "step": 9630 }, { "epoch": 3.212141427618412, "learning_rate": 0.00020390392654482783, "step": 9630 }, { "epoch": 3.212141427618412, "loss": 0.33894914388656616, "step": 9630 }, { "ce_loss": 0.11409568786621094, "epoch": 3.212141427618412, "step": 9630 }, { "distill_loss": 0.11827443540096283, "epoch": 3.212141427618412, "step": 9630 }, { "epoch": 3.212141427618412, "ref_ce_loss": 0.058073848485946655, "step": 9630 }, { "epoch": 3.212141427618412, "loss": 0.3645583987236023, "step": 9630 }, { "ce_loss": 0.09342974424362183, "epoch": 3.212141427618412, "step": 9630 }, { "distill_loss": 0.13384076952934265, "epoch": 3.212141427618412, "step": 9630 }, { "epoch": 3.212141427618412, "ref_ce_loss": 0.10339362919330597, "step": 9630 }, { "epoch": 3.212141427618412, "loss": 0.5319348573684692, "step": 9630 }, { "ce_loss": 0.21150082349777222, "epoch": 3.212141427618412, "step": 9630 }, { "distill_loss": 0.14189140498638153, "epoch": 3.212141427618412, "step": 9630 }, { "epoch": 3.212141427618412, "ref_ce_loss": 0.13533258438110352, "step": 9630 }, { "epoch": 3.212141427618412, "loss": 0.48543843626976013, "step": 9630 }, { "ce_loss": 0.19601376354694366, "epoch": 3.212141427618412, "step": 9630 }, { "distill_loss": 0.13757900893688202, "epoch": 3.212141427618412, "step": 9630 }, { "epoch": 3.212141427618412, "ref_ce_loss": 0.12655571103096008, "step": 9630 }, { "epoch": 3.2154769846564375, "loss": 0.5622, "step": 9640 }, { "epoch": 3.2154769846564375, "grad_norm": 4.793918609619141, "step": 9640 }, { "epoch": 3.2154769846564375, "learning_rate": 0.0002037148472936951, "step": 9640 }, { "epoch": 3.2154769846564375, "loss": 0.3617390990257263, "step": 9640 }, { "ce_loss": 0.1141476035118103, "epoch": 3.2154769846564375, "step": 9640 }, { "distill_loss": 0.13533206284046173, "epoch": 3.2154769846564375, "step": 9640 }, { "epoch": 3.2154769846564375, "ref_ce_loss": 0.08762861788272858, "step": 9640 }, { "epoch": 3.2154769846564375, "loss": 0.423265278339386, "step": 9640 }, { "ce_loss": 0.1403900384902954, "epoch": 3.2154769846564375, "step": 9640 }, { "distill_loss": 0.15556564927101135, "epoch": 3.2154769846564375, "step": 9640 }, { "epoch": 3.2154769846564375, "ref_ce_loss": 0.10428271442651749, "step": 9640 }, { "epoch": 3.2154769846564375, "loss": 0.5071457624435425, "step": 9640 }, { "ce_loss": 0.1454550176858902, "epoch": 3.2154769846564375, "step": 9640 }, { "distill_loss": 0.10869236290454865, "epoch": 3.2154769846564375, "step": 9640 }, { "epoch": 3.2154769846564375, "ref_ce_loss": 0.11991751939058304, "step": 9640 }, { "epoch": 3.2154769846564375, "loss": 0.746184229850769, "step": 9640 }, { "ce_loss": 0.1692987084388733, "epoch": 3.2154769846564375, "step": 9640 }, { "distill_loss": 0.20635975897312164, "epoch": 3.2154769846564375, "step": 9640 }, { "epoch": 3.2154769846564375, "ref_ce_loss": 0.09694461524486542, "step": 9640 }, { "epoch": 3.218812541694463, "loss": 1.5343, "step": 9650 }, { "epoch": 3.218812541694463, "grad_norm": 8.480672836303711, "step": 9650 }, { "epoch": 3.218812541694463, "learning_rate": 0.00020352567008786963, "step": 9650 }, { "epoch": 3.218812541694463, "loss": 1.3023204803466797, "step": 9650 }, { "ce_loss": 0.7586283683776855, "epoch": 3.218812541694463, "step": 9650 }, { "distill_loss": 0.1492748260498047, "epoch": 3.218812541694463, "step": 9650 }, { "epoch": 3.218812541694463, "ref_ce_loss": 0.3516393005847931, "step": 9650 }, { "epoch": 3.218812541694463, "loss": 1.6434491872787476, "step": 9650 }, { "ce_loss": 0.7541458606719971, "epoch": 3.218812541694463, "step": 9650 }, { "distill_loss": 0.15649262070655823, "epoch": 3.218812541694463, "step": 9650 }, { "epoch": 3.218812541694463, "ref_ce_loss": 0.3518027663230896, "step": 9650 }, { "epoch": 3.218812541694463, "loss": 1.1772658824920654, "step": 9650 }, { "ce_loss": 0.6369430422782898, "epoch": 3.218812541694463, "step": 9650 }, { "distill_loss": 0.12943366169929504, "epoch": 3.218812541694463, "step": 9650 }, { "epoch": 3.218812541694463, "ref_ce_loss": 0.3679516613483429, "step": 9650 }, { "epoch": 3.218812541694463, "loss": 1.3905925750732422, "step": 9650 }, { "ce_loss": 0.726006805896759, "epoch": 3.218812541694463, "step": 9650 }, { "distill_loss": 0.11813858151435852, "epoch": 3.218812541694463, "step": 9650 }, { "epoch": 3.218812541694463, "ref_ce_loss": 0.3810167610645294, "step": 9650 }, { "epoch": 3.222148098732488, "loss": 1.0659, "step": 9660 }, { "epoch": 3.222148098732488, "grad_norm": 3.7545816898345947, "step": 9660 }, { "epoch": 3.222148098732488, "learning_rate": 0.00020333639527233616, "step": 9660 }, { "epoch": 3.222148098732488, "loss": 0.5680769681930542, "step": 9660 }, { "ce_loss": 0.20624542236328125, "epoch": 3.222148098732488, "step": 9660 }, { "distill_loss": 0.14041753113269806, "epoch": 3.222148098732488, "step": 9660 }, { "epoch": 3.222148098732488, "ref_ce_loss": 0.18326659500598907, "step": 9660 }, { "epoch": 3.222148098732488, "loss": 1.0041627883911133, "step": 9660 }, { "ce_loss": 0.4418887794017792, "epoch": 3.222148098732488, "step": 9660 }, { "distill_loss": 0.13282467424869537, "epoch": 3.222148098732488, "step": 9660 }, { "epoch": 3.222148098732488, "ref_ce_loss": 0.37674033641815186, "step": 9660 }, { "epoch": 3.222148098732488, "loss": 0.7607677578926086, "step": 9660 }, { "ce_loss": 0.35817694664001465, "epoch": 3.222148098732488, "step": 9660 }, { "distill_loss": 0.1649259328842163, "epoch": 3.222148098732488, "step": 9660 }, { "epoch": 3.222148098732488, "ref_ce_loss": 0.1993914395570755, "step": 9660 }, { "epoch": 3.222148098732488, "loss": 0.6379758715629578, "step": 9660 }, { "ce_loss": 0.29930704832077026, "epoch": 3.222148098732488, "step": 9660 }, { "distill_loss": 0.1762618124485016, "epoch": 3.222148098732488, "step": 9660 }, { "epoch": 3.222148098732488, "ref_ce_loss": 0.1622111052274704, "step": 9660 }, { "epoch": 3.2254836557705135, "loss": 0.659, "step": 9670 }, { "epoch": 3.2254836557705135, "grad_norm": 27.17189598083496, "step": 9670 }, { "epoch": 3.2254836557705135, "learning_rate": 0.00020314702319225718, "step": 9670 }, { "epoch": 3.2254836557705135, "loss": 1.4548907279968262, "step": 9670 }, { "ce_loss": 0.35011687874794006, "epoch": 3.2254836557705135, "step": 9670 }, { "distill_loss": 0.9352494478225708, "epoch": 3.2254836557705135, "step": 9670 }, { "epoch": 3.2254836557705135, "ref_ce_loss": 0.11367769539356232, "step": 9670 }, { "epoch": 3.2254836557705135, "loss": 1.437563419342041, "step": 9670 }, { "ce_loss": 0.15209540724754333, "epoch": 3.2254836557705135, "step": 9670 }, { "distill_loss": 0.9998146295547485, "epoch": 3.2254836557705135, "step": 9670 }, { "epoch": 3.2254836557705135, "ref_ce_loss": 0.14491644501686096, "step": 9670 }, { "epoch": 3.2254836557705135, "loss": 2.1415417194366455, "step": 9670 }, { "ce_loss": 0.24623355269432068, "epoch": 3.2254836557705135, "step": 9670 }, { "distill_loss": 1.5178892612457275, "epoch": 3.2254836557705135, "step": 9670 }, { "epoch": 3.2254836557705135, "ref_ce_loss": 0.202957421541214, "step": 9670 }, { "epoch": 3.2254836557705135, "loss": 0.9889248013496399, "step": 9670 }, { "ce_loss": 0.09803508222103119, "epoch": 3.2254836557705135, "step": 9670 }, { "distill_loss": 0.8088805675506592, "epoch": 3.2254836557705135, "step": 9670 }, { "epoch": 3.2254836557705135, "ref_ce_loss": 0.05574851855635643, "step": 9670 }, { "epoch": 3.228819212808539, "loss": 1.0588, "step": 9680 }, { "epoch": 3.228819212808539, "grad_norm": 3.260193109512329, "step": 9680 }, { "epoch": 3.228819212808539, "learning_rate": 0.00020295755419297268, "step": 9680 }, { "epoch": 3.228819212808539, "loss": 0.6862177848815918, "step": 9680 }, { "ce_loss": 0.17582151293754578, "epoch": 3.228819212808539, "step": 9680 }, { "distill_loss": 0.3368160128593445, "epoch": 3.228819212808539, "step": 9680 }, { "epoch": 3.228819212808539, "ref_ce_loss": 0.08550892025232315, "step": 9680 }, { "epoch": 3.228819212808539, "loss": 0.3851708173751831, "step": 9680 }, { "ce_loss": 0.05822121724486351, "epoch": 3.228819212808539, "step": 9680 }, { "distill_loss": 0.23647257685661316, "epoch": 3.228819212808539, "step": 9680 }, { "epoch": 3.228819212808539, "ref_ce_loss": 0.0903349369764328, "step": 9680 }, { "epoch": 3.228819212808539, "loss": 0.6628296971321106, "step": 9680 }, { "ce_loss": 0.0833716094493866, "epoch": 3.228819212808539, "step": 9680 }, { "distill_loss": 0.3014061152935028, "epoch": 3.228819212808539, "step": 9680 }, { "epoch": 3.228819212808539, "ref_ce_loss": 0.1067894697189331, "step": 9680 }, { "epoch": 3.228819212808539, "loss": 0.5852959156036377, "step": 9680 }, { "ce_loss": 0.07522204518318176, "epoch": 3.228819212808539, "step": 9680 }, { "distill_loss": 0.3575586974620819, "epoch": 3.228819212808539, "step": 9680 }, { "epoch": 3.228819212808539, "ref_ce_loss": 0.09939462691545486, "step": 9680 }, { "epoch": 3.2321547698465642, "loss": 0.6606, "step": 9690 }, { "epoch": 3.2321547698465642, "grad_norm": 2.6169354915618896, "step": 9690 }, { "epoch": 3.2321547698465642, "learning_rate": 0.00020276798861999933, "step": 9690 }, { "epoch": 3.2321547698465642, "loss": 0.5074576139450073, "step": 9690 }, { "ce_loss": 0.15143181383609772, "epoch": 3.2321547698465642, "step": 9690 }, { "distill_loss": 0.2413361668586731, "epoch": 3.2321547698465642, "step": 9690 }, { "epoch": 3.2321547698465642, "ref_ce_loss": 0.07629888504743576, "step": 9690 }, { "epoch": 3.2321547698465642, "loss": 0.5918530225753784, "step": 9690 }, { "ce_loss": 0.2147100418806076, "epoch": 3.2321547698465642, "step": 9690 }, { "distill_loss": 0.17524051666259766, "epoch": 3.2321547698465642, "step": 9690 }, { "epoch": 3.2321547698465642, "ref_ce_loss": 0.1480754017829895, "step": 9690 }, { "epoch": 3.2321547698465642, "loss": 0.37897729873657227, "step": 9690 }, { "ce_loss": 0.09335962682962418, "epoch": 3.2321547698465642, "step": 9690 }, { "distill_loss": 0.19734136760234833, "epoch": 3.2321547698465642, "step": 9690 }, { "epoch": 3.2321547698465642, "ref_ce_loss": 0.08819133788347244, "step": 9690 }, { "epoch": 3.2321547698465642, "loss": 0.685653805732727, "step": 9690 }, { "ce_loss": 0.13283292949199677, "epoch": 3.2321547698465642, "step": 9690 }, { "distill_loss": 0.16413123905658722, "epoch": 3.2321547698465642, "step": 9690 }, { "epoch": 3.2321547698465642, "ref_ce_loss": 0.10335833579301834, "step": 9690 }, { "epoch": 3.2354903268845896, "loss": 0.4984, "step": 9700 }, { "epoch": 3.2354903268845896, "grad_norm": 3.0020861625671387, "step": 9700 }, { "epoch": 3.2354903268845896, "learning_rate": 0.00020257832681903012, "step": 9700 }, { "epoch": 3.2354903268845896, "loss": 0.25732314586639404, "step": 9700 }, { "ce_loss": 0.032223112881183624, "epoch": 3.2354903268845896, "step": 9700 }, { "distill_loss": 0.14216585457324982, "epoch": 3.2354903268845896, "step": 9700 }, { "epoch": 3.2354903268845896, "ref_ce_loss": 0.054947543889284134, "step": 9700 }, { "epoch": 3.2354903268845896, "loss": 0.4035998582839966, "step": 9700 }, { "ce_loss": 0.09584304690361023, "epoch": 3.2354903268845896, "step": 9700 }, { "distill_loss": 0.16314128041267395, "epoch": 3.2354903268845896, "step": 9700 }, { "epoch": 3.2354903268845896, "ref_ce_loss": 0.08428196609020233, "step": 9700 }, { "epoch": 3.2354903268845896, "loss": 0.7436294555664062, "step": 9700 }, { "ce_loss": 0.18889939785003662, "epoch": 3.2354903268845896, "step": 9700 }, { "distill_loss": 0.21565823256969452, "epoch": 3.2354903268845896, "step": 9700 }, { "epoch": 3.2354903268845896, "ref_ce_loss": 0.13887543976306915, "step": 9700 }, { "epoch": 3.2354903268845896, "loss": 0.4003385007381439, "step": 9700 }, { "ce_loss": 0.06409730017185211, "epoch": 3.2354903268845896, "step": 9700 }, { "distill_loss": 0.189022034406662, "epoch": 3.2354903268845896, "step": 9700 }, { "epoch": 3.2354903268845896, "ref_ce_loss": 0.08388952910900116, "step": 9700 }, { "epoch": 3.238825883922615, "loss": 0.581, "step": 9710 }, { "epoch": 3.238825883922615, "grad_norm": 2.807310104370117, "step": 9710 }, { "epoch": 3.238825883922615, "learning_rate": 0.00020238856913593317, "step": 9710 }, { "epoch": 3.238825883922615, "loss": 0.45934754610061646, "step": 9710 }, { "ce_loss": 0.1265133172273636, "epoch": 3.238825883922615, "step": 9710 }, { "distill_loss": 0.12370441854000092, "epoch": 3.238825883922615, "step": 9710 }, { "epoch": 3.238825883922615, "ref_ce_loss": 0.09366489946842194, "step": 9710 }, { "epoch": 3.238825883922615, "loss": 0.5969701409339905, "step": 9710 }, { "ce_loss": 0.1662624329328537, "epoch": 3.238825883922615, "step": 9710 }, { "distill_loss": 0.16972975432872772, "epoch": 3.238825883922615, "step": 9710 }, { "epoch": 3.238825883922615, "ref_ce_loss": 0.11409129947423935, "step": 9710 }, { "epoch": 3.238825883922615, "loss": 0.5954180955886841, "step": 9710 }, { "ce_loss": 0.18566739559173584, "epoch": 3.238825883922615, "step": 9710 }, { "distill_loss": 0.16116562485694885, "epoch": 3.238825883922615, "step": 9710 }, { "epoch": 3.238825883922615, "ref_ce_loss": 0.12154942750930786, "step": 9710 }, { "epoch": 3.238825883922615, "loss": 0.4547787308692932, "step": 9710 }, { "ce_loss": 0.08679163455963135, "epoch": 3.238825883922615, "step": 9710 }, { "distill_loss": 0.1623559296131134, "epoch": 3.238825883922615, "step": 9710 }, { "epoch": 3.238825883922615, "ref_ce_loss": 0.1384986937046051, "step": 9710 }, { "epoch": 3.2421614409606403, "loss": 0.5901, "step": 9720 }, { "epoch": 3.2421614409606403, "grad_norm": 8.585079193115234, "step": 9720 }, { "epoch": 3.2421614409606403, "learning_rate": 0.00020219871591675172, "step": 9720 }, { "epoch": 3.2421614409606403, "loss": 0.48637694120407104, "step": 9720 }, { "ce_loss": 0.15971849858760834, "epoch": 3.2421614409606403, "step": 9720 }, { "distill_loss": 0.14762723445892334, "epoch": 3.2421614409606403, "step": 9720 }, { "epoch": 3.2421614409606403, "ref_ce_loss": 0.12237177044153214, "step": 9720 }, { "epoch": 3.2421614409606403, "loss": 0.5249758362770081, "step": 9720 }, { "ce_loss": 0.21139027178287506, "epoch": 3.2421614409606403, "step": 9720 }, { "distill_loss": 0.17246796190738678, "epoch": 3.2421614409606403, "step": 9720 }, { "epoch": 3.2421614409606403, "ref_ce_loss": 0.14065127074718475, "step": 9720 }, { "epoch": 3.2421614409606403, "loss": 0.4310125410556793, "step": 9720 }, { "ce_loss": 0.13576455414295197, "epoch": 3.2421614409606403, "step": 9720 }, { "distill_loss": 0.12842974066734314, "epoch": 3.2421614409606403, "step": 9720 }, { "epoch": 3.2421614409606403, "ref_ce_loss": 0.10069247335195541, "step": 9720 }, { "epoch": 3.2421614409606403, "loss": 0.45626258850097656, "step": 9720 }, { "ce_loss": 0.16060946881771088, "epoch": 3.2421614409606403, "step": 9720 }, { "distill_loss": 0.13746333122253418, "epoch": 3.2421614409606403, "step": 9720 }, { "epoch": 3.2421614409606403, "ref_ce_loss": 0.13184267282485962, "step": 9720 }, { "epoch": 3.2454969979986656, "loss": 0.5579, "step": 9730 }, { "epoch": 3.2454969979986656, "grad_norm": 4.468499660491943, "step": 9730 }, { "epoch": 3.2454969979986656, "learning_rate": 0.00020200876750770317, "step": 9730 }, { "epoch": 3.2454969979986656, "loss": 0.5182029008865356, "step": 9730 }, { "ce_loss": 0.15194390714168549, "epoch": 3.2454969979986656, "step": 9730 }, { "distill_loss": 0.2397410273551941, "epoch": 3.2454969979986656, "step": 9730 }, { "epoch": 3.2454969979986656, "ref_ce_loss": 0.1261671483516693, "step": 9730 }, { "epoch": 3.2454969979986656, "loss": 0.40942132472991943, "step": 9730 }, { "ce_loss": 0.11343543231487274, "epoch": 3.2454969979986656, "step": 9730 }, { "distill_loss": 0.17565378546714783, "epoch": 3.2454969979986656, "step": 9730 }, { "epoch": 3.2454969979986656, "ref_ce_loss": 0.12022323161363602, "step": 9730 }, { "epoch": 3.2454969979986656, "loss": 0.4131905436515808, "step": 9730 }, { "ce_loss": 0.09293036162853241, "epoch": 3.2454969979986656, "step": 9730 }, { "distill_loss": 0.16029858589172363, "epoch": 3.2454969979986656, "step": 9730 }, { "epoch": 3.2454969979986656, "ref_ce_loss": 0.10926628857851028, "step": 9730 }, { "epoch": 3.2454969979986656, "loss": 0.543439507484436, "step": 9730 }, { "ce_loss": 0.1307881474494934, "epoch": 3.2454969979986656, "step": 9730 }, { "distill_loss": 0.15304943919181824, "epoch": 3.2454969979986656, "step": 9730 }, { "epoch": 3.2454969979986656, "ref_ce_loss": 0.13484518229961395, "step": 9730 }, { "epoch": 3.248832555036691, "loss": 0.5438, "step": 9740 }, { "epoch": 3.248832555036691, "grad_norm": 2.117421865463257, "step": 9740 }, { "epoch": 3.248832555036691, "learning_rate": 0.00020181872425517847, "step": 9740 }, { "epoch": 3.248832555036691, "loss": 0.4847257435321808, "step": 9740 }, { "ce_loss": 0.11899697780609131, "epoch": 3.248832555036691, "step": 9740 }, { "distill_loss": 0.14587488770484924, "epoch": 3.248832555036691, "step": 9740 }, { "epoch": 3.248832555036691, "ref_ce_loss": 0.14244896173477173, "step": 9740 }, { "epoch": 3.248832555036691, "loss": 0.46945518255233765, "step": 9740 }, { "ce_loss": 0.15519380569458008, "epoch": 3.248832555036691, "step": 9740 }, { "distill_loss": 0.19748863577842712, "epoch": 3.248832555036691, "step": 9740 }, { "epoch": 3.248832555036691, "ref_ce_loss": 0.11662587523460388, "step": 9740 }, { "epoch": 3.248832555036691, "loss": 0.7112184166908264, "step": 9740 }, { "ce_loss": 0.09962423145771027, "epoch": 3.248832555036691, "step": 9740 }, { "distill_loss": 0.18776388466358185, "epoch": 3.248832555036691, "step": 9740 }, { "epoch": 3.248832555036691, "ref_ce_loss": 0.11562529951334, "step": 9740 }, { "epoch": 3.248832555036691, "loss": 0.5923649668693542, "step": 9740 }, { "ce_loss": 0.189992755651474, "epoch": 3.248832555036691, "step": 9740 }, { "distill_loss": 0.1979469507932663, "epoch": 3.248832555036691, "step": 9740 }, { "epoch": 3.248832555036691, "ref_ce_loss": 0.11803510040044785, "step": 9740 }, { "epoch": 3.2521681120747163, "loss": 0.5334, "step": 9750 }, { "epoch": 3.2521681120747163, "grad_norm": 5.560446262359619, "step": 9750 }, { "epoch": 3.2521681120747163, "learning_rate": 0.00020162858650574154, "step": 9750 }, { "epoch": 3.2521681120747163, "loss": 0.5909914970397949, "step": 9750 }, { "ce_loss": 0.23804140090942383, "epoch": 3.2521681120747163, "step": 9750 }, { "distill_loss": 0.16925199329853058, "epoch": 3.2521681120747163, "step": 9750 }, { "epoch": 3.2521681120747163, "ref_ce_loss": 0.14933234453201294, "step": 9750 }, { "epoch": 3.2521681120747163, "loss": 0.5917761325836182, "step": 9750 }, { "ce_loss": 0.09722870588302612, "epoch": 3.2521681120747163, "step": 9750 }, { "distill_loss": 0.13730478286743164, "epoch": 3.2521681120747163, "step": 9750 }, { "epoch": 3.2521681120747163, "ref_ce_loss": 0.05942942947149277, "step": 9750 }, { "epoch": 3.2521681120747163, "loss": 0.6330657005310059, "step": 9750 }, { "ce_loss": 0.1761714220046997, "epoch": 3.2521681120747163, "step": 9750 }, { "distill_loss": 0.1895550787448883, "epoch": 3.2521681120747163, "step": 9750 }, { "epoch": 3.2521681120747163, "ref_ce_loss": 0.1777772605419159, "step": 9750 }, { "epoch": 3.2521681120747163, "loss": 0.5256844758987427, "step": 9750 }, { "ce_loss": 0.13771918416023254, "epoch": 3.2521681120747163, "step": 9750 }, { "distill_loss": 0.14787374436855316, "epoch": 3.2521681120747163, "step": 9750 }, { "epoch": 3.2521681120747163, "ref_ce_loss": 0.12790906429290771, "step": 9750 }, { "epoch": 3.2555036691127417, "loss": 0.521, "step": 9760 }, { "epoch": 3.2555036691127417, "grad_norm": 2.0890445709228516, "step": 9760 }, { "epoch": 3.2555036691127417, "learning_rate": 0.00020143835460612866, "step": 9760 }, { "epoch": 3.2555036691127417, "loss": 0.42880985140800476, "step": 9760 }, { "ce_loss": 0.16630008816719055, "epoch": 3.2555036691127417, "step": 9760 }, { "distill_loss": 0.1700364351272583, "epoch": 3.2555036691127417, "step": 9760 }, { "epoch": 3.2555036691127417, "ref_ce_loss": 0.07479525357484818, "step": 9760 }, { "epoch": 3.2555036691127417, "loss": 0.5276511907577515, "step": 9760 }, { "ce_loss": 0.19932356476783752, "epoch": 3.2555036691127417, "step": 9760 }, { "distill_loss": 0.1725931465625763, "epoch": 3.2555036691127417, "step": 9760 }, { "epoch": 3.2555036691127417, "ref_ce_loss": 0.1009768545627594, "step": 9760 }, { "epoch": 3.2555036691127417, "loss": 0.4974745213985443, "step": 9760 }, { "ce_loss": 0.18790408968925476, "epoch": 3.2555036691127417, "step": 9760 }, { "distill_loss": 0.1412593126296997, "epoch": 3.2555036691127417, "step": 9760 }, { "epoch": 3.2555036691127417, "ref_ce_loss": 0.0973147377371788, "step": 9760 }, { "epoch": 3.2555036691127417, "loss": 0.48675134778022766, "step": 9760 }, { "ce_loss": 0.0493842251598835, "epoch": 3.2555036691127417, "step": 9760 }, { "distill_loss": 0.11071799695491791, "epoch": 3.2555036691127417, "step": 9760 }, { "epoch": 3.2555036691127417, "ref_ce_loss": 0.09657184034585953, "step": 9760 }, { "epoch": 3.258839226150767, "loss": 0.5075, "step": 9770 }, { "epoch": 3.258839226150767, "grad_norm": 1.9114854335784912, "step": 9770 }, { "epoch": 3.258839226150767, "learning_rate": 0.00020124802890324775, "step": 9770 }, { "epoch": 3.258839226150767, "loss": 0.4194925129413605, "step": 9770 }, { "ce_loss": 0.14611773192882538, "epoch": 3.258839226150767, "step": 9770 }, { "distill_loss": 0.1504100263118744, "epoch": 3.258839226150767, "step": 9770 }, { "epoch": 3.258839226150767, "ref_ce_loss": 0.09022071212530136, "step": 9770 }, { "epoch": 3.258839226150767, "loss": 0.5268236398696899, "step": 9770 }, { "ce_loss": 0.12277179956436157, "epoch": 3.258839226150767, "step": 9770 }, { "distill_loss": 0.14569061994552612, "epoch": 3.258839226150767, "step": 9770 }, { "epoch": 3.258839226150767, "ref_ce_loss": 0.10679484158754349, "step": 9770 }, { "epoch": 3.258839226150767, "loss": 0.4402240514755249, "step": 9770 }, { "ce_loss": 0.12379911541938782, "epoch": 3.258839226150767, "step": 9770 }, { "distill_loss": 0.11681478470563889, "epoch": 3.258839226150767, "step": 9770 }, { "epoch": 3.258839226150767, "ref_ce_loss": 0.14763450622558594, "step": 9770 }, { "epoch": 3.258839226150767, "loss": 0.278000146150589, "step": 9770 }, { "ce_loss": 0.08359132707118988, "epoch": 3.258839226150767, "step": 9770 }, { "distill_loss": 0.0681973546743393, "epoch": 3.258839226150767, "step": 9770 }, { "epoch": 3.258839226150767, "ref_ce_loss": 0.08435453474521637, "step": 9770 }, { "epoch": 3.2621747831887924, "loss": 0.4993, "step": 9780 }, { "epoch": 3.2621747831887924, "grad_norm": 2.544344186782837, "step": 9780 }, { "epoch": 3.2621747831887924, "learning_rate": 0.0002010576097441778, "step": 9780 }, { "epoch": 3.2621747831887924, "loss": 0.8057675957679749, "step": 9780 }, { "ce_loss": 0.13802434504032135, "epoch": 3.2621747831887924, "step": 9780 }, { "distill_loss": 0.16143648326396942, "epoch": 3.2621747831887924, "step": 9780 }, { "epoch": 3.2621747831887924, "ref_ce_loss": 0.0895475447177887, "step": 9780 }, { "epoch": 3.2621747831887924, "loss": 0.5339944958686829, "step": 9780 }, { "ce_loss": 0.1788073182106018, "epoch": 3.2621747831887924, "step": 9780 }, { "distill_loss": 0.16007177531719208, "epoch": 3.2621747831887924, "step": 9780 }, { "epoch": 3.2621747831887924, "ref_ce_loss": 0.09300737082958221, "step": 9780 }, { "epoch": 3.2621747831887924, "loss": 0.6089990735054016, "step": 9780 }, { "ce_loss": 0.2176850438117981, "epoch": 3.2621747831887924, "step": 9780 }, { "distill_loss": 0.22681301832199097, "epoch": 3.2621747831887924, "step": 9780 }, { "epoch": 3.2621747831887924, "ref_ce_loss": 0.1334032416343689, "step": 9780 }, { "epoch": 3.2621747831887924, "loss": 0.2944650948047638, "step": 9780 }, { "ce_loss": 0.08381827920675278, "epoch": 3.2621747831887924, "step": 9780 }, { "distill_loss": 0.12243427336215973, "epoch": 3.2621747831887924, "step": 9780 }, { "epoch": 3.2621747831887924, "ref_ce_loss": 0.08806728571653366, "step": 9780 }, { "epoch": 3.2655103402268177, "loss": 0.5072, "step": 9790 }, { "epoch": 3.2655103402268177, "grad_norm": 2.3298983573913574, "step": 9790 }, { "epoch": 3.2655103402268177, "learning_rate": 0.00020086709747616822, "step": 9790 }, { "epoch": 3.2655103402268177, "loss": 0.381562739610672, "step": 9790 }, { "ce_loss": 0.10704471915960312, "epoch": 3.2655103402268177, "step": 9790 }, { "distill_loss": 0.08754559606313705, "epoch": 3.2655103402268177, "step": 9790 }, { "epoch": 3.2655103402268177, "ref_ce_loss": 0.10674937814474106, "step": 9790 }, { "epoch": 3.2655103402268177, "loss": 0.4779171943664551, "step": 9790 }, { "ce_loss": 0.1513449251651764, "epoch": 3.2655103402268177, "step": 9790 }, { "distill_loss": 0.12535059452056885, "epoch": 3.2655103402268177, "step": 9790 }, { "epoch": 3.2655103402268177, "ref_ce_loss": 0.1194036528468132, "step": 9790 }, { "epoch": 3.2655103402268177, "loss": 0.6823694705963135, "step": 9790 }, { "ce_loss": 0.20080308616161346, "epoch": 3.2655103402268177, "step": 9790 }, { "distill_loss": 0.18392544984817505, "epoch": 3.2655103402268177, "step": 9790 }, { "epoch": 3.2655103402268177, "ref_ce_loss": 0.09276615083217621, "step": 9790 }, { "epoch": 3.2655103402268177, "loss": 0.4755535125732422, "step": 9790 }, { "ce_loss": 0.14225377142429352, "epoch": 3.2655103402268177, "step": 9790 }, { "distill_loss": 0.13601207733154297, "epoch": 3.2655103402268177, "step": 9790 }, { "epoch": 3.2655103402268177, "ref_ce_loss": 0.14101681113243103, "step": 9790 }, { "epoch": 3.268845897264843, "loss": 0.4772, "step": 9800 }, { "epoch": 3.268845897264843, "grad_norm": 2.873114824295044, "step": 9800 }, { "epoch": 3.268845897264843, "learning_rate": 0.00020067649244663837, "step": 9800 }, { "epoch": 3.268845897264843, "loss": 0.47671616077423096, "step": 9800 }, { "ce_loss": 0.18779507279396057, "epoch": 3.268845897264843, "step": 9800 }, { "distill_loss": 0.15565767884254456, "epoch": 3.268845897264843, "step": 9800 }, { "epoch": 3.268845897264843, "ref_ce_loss": 0.13307105004787445, "step": 9800 }, { "epoch": 3.268845897264843, "loss": 0.6724848747253418, "step": 9800 }, { "ce_loss": 0.12847858667373657, "epoch": 3.268845897264843, "step": 9800 }, { "distill_loss": 0.11508485674858093, "epoch": 3.268845897264843, "step": 9800 }, { "epoch": 3.268845897264843, "ref_ce_loss": 0.1192900612950325, "step": 9800 }, { "epoch": 3.268845897264843, "loss": 0.5285953283309937, "step": 9800 }, { "ce_loss": 0.14411479234695435, "epoch": 3.268845897264843, "step": 9800 }, { "distill_loss": 0.19417299330234528, "epoch": 3.268845897264843, "step": 9800 }, { "epoch": 3.268845897264843, "ref_ce_loss": 0.1415955275297165, "step": 9800 }, { "epoch": 3.268845897264843, "loss": 0.7303289175033569, "step": 9800 }, { "ce_loss": 0.11429604887962341, "epoch": 3.268845897264843, "step": 9800 }, { "distill_loss": 0.12689504027366638, "epoch": 3.268845897264843, "step": 9800 }, { "epoch": 3.268845897264843, "ref_ce_loss": 0.07350821793079376, "step": 9800 }, { "epoch": 3.2721814543028684, "loss": 0.4966, "step": 9810 }, { "epoch": 3.2721814543028684, "grad_norm": 3.44111967086792, "step": 9810 }, { "epoch": 3.2721814543028684, "learning_rate": 0.00020048579500317652, "step": 9810 }, { "epoch": 3.2721814543028684, "loss": 0.6512364149093628, "step": 9810 }, { "ce_loss": 0.24342629313468933, "epoch": 3.2721814543028684, "step": 9810 }, { "distill_loss": 0.1871444135904312, "epoch": 3.2721814543028684, "step": 9810 }, { "epoch": 3.2721814543028684, "ref_ce_loss": 0.16401995718479156, "step": 9810 }, { "epoch": 3.2721814543028684, "loss": 0.3500097393989563, "step": 9810 }, { "ce_loss": 0.032036811113357544, "epoch": 3.2721814543028684, "step": 9810 }, { "distill_loss": 0.08574137091636658, "epoch": 3.2721814543028684, "step": 9810 }, { "epoch": 3.2721814543028684, "ref_ce_loss": 0.06518110632896423, "step": 9810 }, { "epoch": 3.2721814543028684, "loss": 1.2171872854232788, "step": 9810 }, { "ce_loss": 0.1737753450870514, "epoch": 3.2721814543028684, "step": 9810 }, { "distill_loss": 0.184116393327713, "epoch": 3.2721814543028684, "step": 9810 }, { "epoch": 3.2721814543028684, "ref_ce_loss": 0.1184069886803627, "step": 9810 }, { "epoch": 3.2721814543028684, "loss": 0.5101798176765442, "step": 9810 }, { "ce_loss": 0.14508378505706787, "epoch": 3.2721814543028684, "step": 9810 }, { "distill_loss": 0.16374604403972626, "epoch": 3.2721814543028684, "step": 9810 }, { "epoch": 3.2721814543028684, "ref_ce_loss": 0.13258105516433716, "step": 9810 }, { "epoch": 3.275517011340894, "loss": 0.5242, "step": 9820 }, { "epoch": 3.275517011340894, "grad_norm": 3.690037965774536, "step": 9820 }, { "epoch": 3.275517011340894, "learning_rate": 0.00020029500549353953, "step": 9820 }, { "epoch": 3.275517011340894, "loss": 0.3060483932495117, "step": 9820 }, { "ce_loss": 0.09589601308107376, "epoch": 3.275517011340894, "step": 9820 }, { "distill_loss": 0.12110260128974915, "epoch": 3.275517011340894, "step": 9820 }, { "epoch": 3.275517011340894, "ref_ce_loss": 0.06255815178155899, "step": 9820 }, { "epoch": 3.275517011340894, "loss": 0.421102911233902, "step": 9820 }, { "ce_loss": 0.13635626435279846, "epoch": 3.275517011340894, "step": 9820 }, { "distill_loss": 0.1552823930978775, "epoch": 3.275517011340894, "step": 9820 }, { "epoch": 3.275517011340894, "ref_ce_loss": 0.08633653819561005, "step": 9820 }, { "epoch": 3.275517011340894, "loss": 0.7562495470046997, "step": 9820 }, { "ce_loss": 0.1636921614408493, "epoch": 3.275517011340894, "step": 9820 }, { "distill_loss": 0.15046286582946777, "epoch": 3.275517011340894, "step": 9820 }, { "epoch": 3.275517011340894, "ref_ce_loss": 0.13611376285552979, "step": 9820 }, { "epoch": 3.275517011340894, "loss": 0.46156540513038635, "step": 9820 }, { "ce_loss": 0.1379977911710739, "epoch": 3.275517011340894, "step": 9820 }, { "distill_loss": 0.13064102828502655, "epoch": 3.275517011340894, "step": 9820 }, { "epoch": 3.275517011340894, "ref_ce_loss": 0.12863753736019135, "step": 9820 }, { "epoch": 3.278852568378919, "loss": 0.4644, "step": 9830 }, { "epoch": 3.278852568378919, "grad_norm": 2.5951879024505615, "step": 9830 }, { "epoch": 3.278852568378919, "learning_rate": 0.00020010412426565231, "step": 9830 }, { "epoch": 3.278852568378919, "loss": 0.9946610927581787, "step": 9830 }, { "ce_loss": 0.21470636129379272, "epoch": 3.278852568378919, "step": 9830 }, { "distill_loss": 0.21564292907714844, "epoch": 3.278852568378919, "step": 9830 }, { "epoch": 3.278852568378919, "ref_ce_loss": 0.12243160605430603, "step": 9830 }, { "epoch": 3.278852568378919, "loss": 0.48204684257507324, "step": 9830 }, { "ce_loss": 0.12645453214645386, "epoch": 3.278852568378919, "step": 9830 }, { "distill_loss": 0.1417766958475113, "epoch": 3.278852568378919, "step": 9830 }, { "epoch": 3.278852568378919, "ref_ce_loss": 0.05860704556107521, "step": 9830 }, { "epoch": 3.278852568378919, "loss": 0.2505921423435211, "step": 9830 }, { "ce_loss": 0.050950322300195694, "epoch": 3.278852568378919, "step": 9830 }, { "distill_loss": 0.10858387500047684, "epoch": 3.278852568378919, "step": 9830 }, { "epoch": 3.278852568378919, "ref_ce_loss": 0.09072738885879517, "step": 9830 }, { "epoch": 3.278852568378919, "loss": 0.520016610622406, "step": 9830 }, { "ce_loss": 0.2073366940021515, "epoch": 3.278852568378919, "step": 9830 }, { "distill_loss": 0.19920435547828674, "epoch": 3.278852568378919, "step": 9830 }, { "epoch": 3.278852568378919, "ref_ce_loss": 0.11311342567205429, "step": 9830 }, { "epoch": 3.2821881254169445, "loss": 0.4845, "step": 9840 }, { "epoch": 3.2821881254169445, "grad_norm": 3.4132983684539795, "step": 9840 }, { "epoch": 3.2821881254169445, "learning_rate": 0.00019991315166760696, "step": 9840 }, { "epoch": 3.2821881254169445, "loss": 0.4670141935348511, "step": 9840 }, { "ce_loss": 0.1389661729335785, "epoch": 3.2821881254169445, "step": 9840 }, { "distill_loss": 0.11367867887020111, "epoch": 3.2821881254169445, "step": 9840 }, { "epoch": 3.2821881254169445, "ref_ce_loss": 0.12627971172332764, "step": 9840 }, { "epoch": 3.2821881254169445, "loss": 0.5269565582275391, "step": 9840 }, { "ce_loss": 0.08351260423660278, "epoch": 3.2821881254169445, "step": 9840 }, { "distill_loss": 0.11990583688020706, "epoch": 3.2821881254169445, "step": 9840 }, { "epoch": 3.2821881254169445, "ref_ce_loss": 0.13506436347961426, "step": 9840 }, { "epoch": 3.2821881254169445, "loss": 0.2908315360546112, "step": 9840 }, { "ce_loss": 0.10760679095983505, "epoch": 3.2821881254169445, "step": 9840 }, { "distill_loss": 0.08831951022148132, "epoch": 3.2821881254169445, "step": 9840 }, { "epoch": 3.2821881254169445, "ref_ce_loss": 0.07872118055820465, "step": 9840 }, { "epoch": 3.2821881254169445, "loss": 0.4684480130672455, "step": 9840 }, { "ce_loss": 0.17640560865402222, "epoch": 3.2821881254169445, "step": 9840 }, { "distill_loss": 0.15796883404254913, "epoch": 3.2821881254169445, "step": 9840 }, { "epoch": 3.2821881254169445, "ref_ce_loss": 0.13395830988883972, "step": 9840 }, { "epoch": 3.28552368245497, "loss": 0.4738, "step": 9850 }, { "epoch": 3.28552368245497, "grad_norm": 3.823991060256958, "step": 9850 }, { "epoch": 3.28552368245497, "learning_rate": 0.00019972208804766204, "step": 9850 }, { "epoch": 3.28552368245497, "loss": 0.5192458629608154, "step": 9850 }, { "ce_loss": 0.13076889514923096, "epoch": 3.28552368245497, "step": 9850 }, { "distill_loss": 0.12664109468460083, "epoch": 3.28552368245497, "step": 9850 }, { "epoch": 3.28552368245497, "ref_ce_loss": 0.1299402117729187, "step": 9850 }, { "epoch": 3.28552368245497, "loss": 0.5129473805427551, "step": 9850 }, { "ce_loss": 0.18821565806865692, "epoch": 3.28552368245497, "step": 9850 }, { "distill_loss": 0.11755567044019699, "epoch": 3.28552368245497, "step": 9850 }, { "epoch": 3.28552368245497, "ref_ce_loss": 0.11735755950212479, "step": 9850 }, { "epoch": 3.28552368245497, "loss": 0.4829626679420471, "step": 9850 }, { "ce_loss": 0.20330511033535004, "epoch": 3.28552368245497, "step": 9850 }, { "distill_loss": 0.11206808686256409, "epoch": 3.28552368245497, "step": 9850 }, { "epoch": 3.28552368245497, "ref_ce_loss": 0.0962306559085846, "step": 9850 }, { "epoch": 3.28552368245497, "loss": 0.23460951447486877, "step": 9850 }, { "ce_loss": 0.03618684411048889, "epoch": 3.28552368245497, "step": 9850 }, { "distill_loss": 0.08662353456020355, "epoch": 3.28552368245497, "step": 9850 }, { "epoch": 3.28552368245497, "ref_ce_loss": 0.06193290650844574, "step": 9850 }, { "epoch": 3.288859239492995, "loss": 0.5143, "step": 9860 }, { "epoch": 3.288859239492995, "grad_norm": 2.249880790710449, "step": 9860 }, { "epoch": 3.288859239492995, "learning_rate": 0.0001995309337542423, "step": 9860 }, { "epoch": 3.288859239492995, "loss": 0.38961324095726013, "step": 9860 }, { "ce_loss": 0.1265280395746231, "epoch": 3.288859239492995, "step": 9860 }, { "distill_loss": 0.11052341014146805, "epoch": 3.288859239492995, "step": 9860 }, { "epoch": 3.288859239492995, "ref_ce_loss": 0.08913455903530121, "step": 9860 }, { "epoch": 3.288859239492995, "loss": 0.40968209505081177, "step": 9860 }, { "ce_loss": 0.15417298674583435, "epoch": 3.288859239492995, "step": 9860 }, { "distill_loss": 0.10991737991571426, "epoch": 3.288859239492995, "step": 9860 }, { "epoch": 3.288859239492995, "ref_ce_loss": 0.145408034324646, "step": 9860 }, { "epoch": 3.288859239492995, "loss": 0.34837135672569275, "step": 9860 }, { "ce_loss": 0.07209538668394089, "epoch": 3.288859239492995, "step": 9860 }, { "distill_loss": 0.12966960668563843, "epoch": 3.288859239492995, "step": 9860 }, { "epoch": 3.288859239492995, "ref_ce_loss": 0.14647279679775238, "step": 9860 }, { "epoch": 3.288859239492995, "loss": 0.32055962085723877, "step": 9860 }, { "ce_loss": 0.1121489554643631, "epoch": 3.288859239492995, "step": 9860 }, { "distill_loss": 0.1139790490269661, "epoch": 3.288859239492995, "step": 9860 }, { "epoch": 3.288859239492995, "ref_ce_loss": 0.09429316222667694, "step": 9860 }, { "epoch": 3.2921947965310205, "loss": 0.462, "step": 9870 }, { "epoch": 3.2921947965310205, "grad_norm": 3.066953659057617, "step": 9870 }, { "epoch": 3.2921947965310205, "learning_rate": 0.00019933968913593775, "step": 9870 }, { "epoch": 3.2921947965310205, "loss": 0.4638405442237854, "step": 9870 }, { "ce_loss": 0.15076488256454468, "epoch": 3.2921947965310205, "step": 9870 }, { "distill_loss": 0.13203120231628418, "epoch": 3.2921947965310205, "step": 9870 }, { "epoch": 3.2921947965310205, "ref_ce_loss": 0.08420390635728836, "step": 9870 }, { "epoch": 3.2921947965310205, "loss": 0.32437434792518616, "step": 9870 }, { "ce_loss": 0.10534469038248062, "epoch": 3.2921947965310205, "step": 9870 }, { "distill_loss": 0.10000304877758026, "epoch": 3.2921947965310205, "step": 9870 }, { "epoch": 3.2921947965310205, "ref_ce_loss": 0.05723772197961807, "step": 9870 }, { "epoch": 3.2921947965310205, "loss": 0.6644257307052612, "step": 9870 }, { "ce_loss": 0.2317466139793396, "epoch": 3.2921947965310205, "step": 9870 }, { "distill_loss": 0.14958073198795319, "epoch": 3.2921947965310205, "step": 9870 }, { "epoch": 3.2921947965310205, "ref_ce_loss": 0.12686073780059814, "step": 9870 }, { "epoch": 3.2921947965310205, "loss": 0.41358280181884766, "step": 9870 }, { "ce_loss": 0.1507405787706375, "epoch": 3.2921947965310205, "step": 9870 }, { "distill_loss": 0.10844182223081589, "epoch": 3.2921947965310205, "step": 9870 }, { "epoch": 3.2921947965310205, "ref_ce_loss": 0.0918106660246849, "step": 9870 }, { "epoch": 3.295530353569046, "loss": 0.5121, "step": 9880 }, { "epoch": 3.295530353569046, "grad_norm": 3.1873927116394043, "step": 9880 }, { "epoch": 3.295530353569046, "learning_rate": 0.0001991483545415031, "step": 9880 }, { "epoch": 3.295530353569046, "loss": 0.44472411274909973, "step": 9880 }, { "ce_loss": 0.1372465342283249, "epoch": 3.295530353569046, "step": 9880 }, { "distill_loss": 0.17125433683395386, "epoch": 3.295530353569046, "step": 9880 }, { "epoch": 3.295530353569046, "ref_ce_loss": 0.10206897556781769, "step": 9880 }, { "epoch": 3.295530353569046, "loss": 0.3743453025817871, "step": 9880 }, { "ce_loss": 0.11211249977350235, "epoch": 3.295530353569046, "step": 9880 }, { "distill_loss": 0.17139093577861786, "epoch": 3.295530353569046, "step": 9880 }, { "epoch": 3.295530353569046, "ref_ce_loss": 0.09072692692279816, "step": 9880 }, { "epoch": 3.295530353569046, "loss": 0.5743526220321655, "step": 9880 }, { "ce_loss": 0.1388760209083557, "epoch": 3.295530353569046, "step": 9880 }, { "distill_loss": 0.1605054885149002, "epoch": 3.295530353569046, "step": 9880 }, { "epoch": 3.295530353569046, "ref_ce_loss": 0.1317940354347229, "step": 9880 }, { "epoch": 3.295530353569046, "loss": 0.25721773505210876, "step": 9880 }, { "ce_loss": 0.06072324141860008, "epoch": 3.295530353569046, "step": 9880 }, { "distill_loss": 0.12532374262809753, "epoch": 3.295530353569046, "step": 9880 }, { "epoch": 3.295530353569046, "ref_ce_loss": 0.051129937171936035, "step": 9880 }, { "epoch": 3.2988659106070712, "loss": 0.4627, "step": 9890 }, { "epoch": 3.2988659106070712, "grad_norm": 3.4960994720458984, "step": 9890 }, { "epoch": 3.2988659106070712, "learning_rate": 0.0001989569303198573, "step": 9890 }, { "epoch": 3.2988659106070712, "loss": 0.47169768810272217, "step": 9890 }, { "ce_loss": 0.22074246406555176, "epoch": 3.2988659106070712, "step": 9890 }, { "distill_loss": 0.1348043829202652, "epoch": 3.2988659106070712, "step": 9890 }, { "epoch": 3.2988659106070712, "ref_ce_loss": 0.09197013825178146, "step": 9890 }, { "epoch": 3.2988659106070712, "loss": 0.303199827671051, "step": 9890 }, { "ce_loss": 0.08557787537574768, "epoch": 3.2988659106070712, "step": 9890 }, { "distill_loss": 0.1385389268398285, "epoch": 3.2988659106070712, "step": 9890 }, { "epoch": 3.2988659106070712, "ref_ce_loss": 0.07887089997529984, "step": 9890 }, { "epoch": 3.2988659106070712, "loss": 0.38248252868652344, "step": 9890 }, { "ce_loss": 0.1412307769060135, "epoch": 3.2988659106070712, "step": 9890 }, { "distill_loss": 0.14288009703159332, "epoch": 3.2988659106070712, "step": 9890 }, { "epoch": 3.2988659106070712, "ref_ce_loss": 0.09742052853107452, "step": 9890 }, { "epoch": 3.2988659106070712, "loss": 0.5217670798301697, "step": 9890 }, { "ce_loss": 0.19139555096626282, "epoch": 3.2988659106070712, "step": 9890 }, { "distill_loss": 0.12419295310974121, "epoch": 3.2988659106070712, "step": 9890 }, { "epoch": 3.2988659106070712, "ref_ce_loss": 0.11238839477300644, "step": 9890 }, { "epoch": 3.3022014676450966, "loss": 0.4834, "step": 9900 }, { "epoch": 3.3022014676450966, "grad_norm": 2.676337480545044, "step": 9900 }, { "epoch": 3.3022014676450966, "learning_rate": 0.00019876541682008246, "step": 9900 }, { "epoch": 3.3022014676450966, "loss": 1.0787899494171143, "step": 9900 }, { "ce_loss": 0.1795887053012848, "epoch": 3.3022014676450966, "step": 9900 }, { "distill_loss": 0.13583481311798096, "epoch": 3.3022014676450966, "step": 9900 }, { "epoch": 3.3022014676450966, "ref_ce_loss": 0.12809765338897705, "step": 9900 }, { "epoch": 3.3022014676450966, "loss": 0.5525591969490051, "step": 9900 }, { "ce_loss": 0.13453686237335205, "epoch": 3.3022014676450966, "step": 9900 }, { "distill_loss": 0.11752891540527344, "epoch": 3.3022014676450966, "step": 9900 }, { "epoch": 3.3022014676450966, "ref_ce_loss": 0.15759889781475067, "step": 9900 }, { "epoch": 3.3022014676450966, "loss": 0.3806399703025818, "step": 9900 }, { "ce_loss": 0.10052482038736343, "epoch": 3.3022014676450966, "step": 9900 }, { "distill_loss": 0.09455669671297073, "epoch": 3.3022014676450966, "step": 9900 }, { "epoch": 3.3022014676450966, "ref_ce_loss": 0.134662926197052, "step": 9900 }, { "epoch": 3.3022014676450966, "loss": 0.30280801653862, "step": 9900 }, { "ce_loss": 0.09678518772125244, "epoch": 3.3022014676450966, "step": 9900 }, { "distill_loss": 0.10427294671535492, "epoch": 3.3022014676450966, "step": 9900 }, { "epoch": 3.3022014676450966, "ref_ce_loss": 0.10156451910734177, "step": 9900 }, { "epoch": 3.305537024683122, "loss": 0.5029, "step": 9910 }, { "epoch": 3.305537024683122, "grad_norm": 4.027454853057861, "step": 9910 }, { "epoch": 3.305537024683122, "learning_rate": 0.00019857381439142372, "step": 9910 }, { "epoch": 3.305537024683122, "loss": 0.25459831953048706, "step": 9910 }, { "ce_loss": 0.042915359139442444, "epoch": 3.305537024683122, "step": 9910 }, { "distill_loss": 0.10543102025985718, "epoch": 3.305537024683122, "step": 9910 }, { "epoch": 3.305537024683122, "ref_ce_loss": 0.10604436695575714, "step": 9910 }, { "epoch": 3.305537024683122, "loss": 0.3459298014640808, "step": 9910 }, { "ce_loss": 0.09528271108865738, "epoch": 3.305537024683122, "step": 9910 }, { "distill_loss": 0.11957503855228424, "epoch": 3.305537024683122, "step": 9910 }, { "epoch": 3.305537024683122, "ref_ce_loss": 0.10496371984481812, "step": 9910 }, { "epoch": 3.305537024683122, "loss": 0.3889404237270355, "step": 9910 }, { "ce_loss": 0.1402936428785324, "epoch": 3.305537024683122, "step": 9910 }, { "distill_loss": 0.1201651319861412, "epoch": 3.305537024683122, "step": 9910 }, { "epoch": 3.305537024683122, "ref_ce_loss": 0.12824715673923492, "step": 9910 }, { "epoch": 3.305537024683122, "loss": 0.44783639907836914, "step": 9910 }, { "ce_loss": 0.1377970278263092, "epoch": 3.305537024683122, "step": 9910 }, { "distill_loss": 0.11447380483150482, "epoch": 3.305537024683122, "step": 9910 }, { "epoch": 3.305537024683122, "ref_ce_loss": 0.12642312049865723, "step": 9910 }, { "epoch": 3.3088725817211473, "loss": 0.4543, "step": 9920 }, { "epoch": 3.3088725817211473, "grad_norm": 2.3650686740875244, "step": 9920 }, { "epoch": 3.3088725817211473, "learning_rate": 0.00019838212338328838, "step": 9920 }, { "epoch": 3.3088725817211473, "loss": 0.9150041341781616, "step": 9920 }, { "ce_loss": 0.16532965004444122, "epoch": 3.3088725817211473, "step": 9920 }, { "distill_loss": 0.12562328577041626, "epoch": 3.3088725817211473, "step": 9920 }, { "epoch": 3.3088725817211473, "ref_ce_loss": 0.12355418503284454, "step": 9920 }, { "epoch": 3.3088725817211473, "loss": 0.5890618562698364, "step": 9920 }, { "ce_loss": 0.22781842947006226, "epoch": 3.3088725817211473, "step": 9920 }, { "distill_loss": 0.12290843576192856, "epoch": 3.3088725817211473, "step": 9920 }, { "epoch": 3.3088725817211473, "ref_ce_loss": 0.16453175246715546, "step": 9920 }, { "epoch": 3.3088725817211473, "loss": 0.5935525894165039, "step": 9920 }, { "ce_loss": 0.12414514273405075, "epoch": 3.3088725817211473, "step": 9920 }, { "distill_loss": 0.10505258291959763, "epoch": 3.3088725817211473, "step": 9920 }, { "epoch": 3.3088725817211473, "ref_ce_loss": 0.12029238045215607, "step": 9920 }, { "epoch": 3.3088725817211473, "loss": 0.5247844457626343, "step": 9920 }, { "ce_loss": 0.11055473983287811, "epoch": 3.3088725817211473, "step": 9920 }, { "distill_loss": 0.1137736439704895, "epoch": 3.3088725817211473, "step": 9920 }, { "epoch": 3.3088725817211473, "ref_ce_loss": 0.11973520368337631, "step": 9920 }, { "epoch": 3.3122081387591726, "loss": 0.4588, "step": 9930 }, { "epoch": 3.3122081387591726, "grad_norm": 2.262359857559204, "step": 9930 }, { "epoch": 3.3122081387591726, "learning_rate": 0.00019819034414524515, "step": 9930 }, { "epoch": 3.3122081387591726, "loss": 0.5808669328689575, "step": 9930 }, { "ce_loss": 0.19623792171478271, "epoch": 3.3122081387591726, "step": 9930 }, { "distill_loss": 0.1292586475610733, "epoch": 3.3122081387591726, "step": 9930 }, { "epoch": 3.3122081387591726, "ref_ce_loss": 0.11712811887264252, "step": 9930 }, { "epoch": 3.3122081387591726, "loss": 0.3369356393814087, "step": 9930 }, { "ce_loss": 0.10894513875246048, "epoch": 3.3122081387591726, "step": 9930 }, { "distill_loss": 0.09358334541320801, "epoch": 3.3122081387591726, "step": 9930 }, { "epoch": 3.3122081387591726, "ref_ce_loss": 0.1341428905725479, "step": 9930 }, { "epoch": 3.3122081387591726, "loss": 0.6350229382514954, "step": 9930 }, { "ce_loss": 0.27962931990623474, "epoch": 3.3122081387591726, "step": 9930 }, { "distill_loss": 0.15435360372066498, "epoch": 3.3122081387591726, "step": 9930 }, { "epoch": 3.3122081387591726, "ref_ce_loss": 0.1478508561849594, "step": 9930 }, { "epoch": 3.3122081387591726, "loss": 0.34648460149765015, "step": 9930 }, { "ce_loss": 0.13289031386375427, "epoch": 3.3122081387591726, "step": 9930 }, { "distill_loss": 0.10292407125234604, "epoch": 3.3122081387591726, "step": 9930 }, { "epoch": 3.3122081387591726, "ref_ce_loss": 0.1104697436094284, "step": 9930 }, { "epoch": 3.315543695797198, "loss": 0.5132, "step": 9940 }, { "epoch": 3.315543695797198, "grad_norm": 3.6616158485412598, "step": 9940 }, { "epoch": 3.315543695797198, "learning_rate": 0.00019799847702702377, "step": 9940 }, { "epoch": 3.315543695797198, "loss": 0.4875798523426056, "step": 9940 }, { "ce_loss": 0.1997719556093216, "epoch": 3.315543695797198, "step": 9940 }, { "distill_loss": 0.15560133755207062, "epoch": 3.315543695797198, "step": 9940 }, { "epoch": 3.315543695797198, "ref_ce_loss": 0.09026752412319183, "step": 9940 }, { "epoch": 3.315543695797198, "loss": 0.3376682698726654, "step": 9940 }, { "ce_loss": 0.0443369522690773, "epoch": 3.315543695797198, "step": 9940 }, { "distill_loss": 0.10496130585670471, "epoch": 3.315543695797198, "step": 9940 }, { "epoch": 3.315543695797198, "ref_ce_loss": 0.08619407564401627, "step": 9940 }, { "epoch": 3.315543695797198, "loss": 0.4448798894882202, "step": 9940 }, { "ce_loss": 0.154523566365242, "epoch": 3.315543695797198, "step": 9940 }, { "distill_loss": 0.15318924188613892, "epoch": 3.315543695797198, "step": 9940 }, { "epoch": 3.315543695797198, "ref_ce_loss": 0.13680867850780487, "step": 9940 }, { "epoch": 3.315543695797198, "loss": 0.5253775119781494, "step": 9940 }, { "ce_loss": 0.2364787459373474, "epoch": 3.315543695797198, "step": 9940 }, { "distill_loss": 0.1384231299161911, "epoch": 3.315543695797198, "step": 9940 }, { "epoch": 3.315543695797198, "ref_ce_loss": 0.1246381625533104, "step": 9940 }, { "epoch": 3.3188792528352233, "loss": 0.4866, "step": 9950 }, { "epoch": 3.3188792528352233, "grad_norm": 7.7432332038879395, "step": 9950 }, { "epoch": 3.3188792528352233, "learning_rate": 0.00019780652237851414, "step": 9950 }, { "epoch": 3.3188792528352233, "loss": 0.40983110666275024, "step": 9950 }, { "ce_loss": 0.12707103788852692, "epoch": 3.3188792528352233, "step": 9950 }, { "distill_loss": 0.13503378629684448, "epoch": 3.3188792528352233, "step": 9950 }, { "epoch": 3.3188792528352233, "ref_ce_loss": 0.11911485344171524, "step": 9950 }, { "epoch": 3.3188792528352233, "loss": 0.5177261233329773, "step": 9950 }, { "ce_loss": 0.16839143633842468, "epoch": 3.3188792528352233, "step": 9950 }, { "distill_loss": 0.1610061228275299, "epoch": 3.3188792528352233, "step": 9950 }, { "epoch": 3.3188792528352233, "ref_ce_loss": 0.1314130425453186, "step": 9950 }, { "epoch": 3.3188792528352233, "loss": 0.6280122995376587, "step": 9950 }, { "ce_loss": 0.15752331912517548, "epoch": 3.3188792528352233, "step": 9950 }, { "distill_loss": 0.15175409615039825, "epoch": 3.3188792528352233, "step": 9950 }, { "epoch": 3.3188792528352233, "ref_ce_loss": 0.1513548046350479, "step": 9950 }, { "epoch": 3.3188792528352233, "loss": 0.37333211302757263, "step": 9950 }, { "ce_loss": 0.14288701117038727, "epoch": 3.3188792528352233, "step": 9950 }, { "distill_loss": 0.11070699244737625, "epoch": 3.3188792528352233, "step": 9950 }, { "epoch": 3.3188792528352233, "ref_ce_loss": 0.11954483389854431, "step": 9950 }, { "epoch": 3.3222148098732487, "loss": 0.4921, "step": 9960 }, { "epoch": 3.3222148098732487, "grad_norm": 2.99534010887146, "step": 9960 }, { "epoch": 3.3222148098732487, "learning_rate": 0.00019761448054976573, "step": 9960 }, { "epoch": 3.3222148098732487, "loss": 0.74758380651474, "step": 9960 }, { "ce_loss": 0.13936404883861542, "epoch": 3.3222148098732487, "step": 9960 }, { "distill_loss": 0.10002149641513824, "epoch": 3.3222148098732487, "step": 9960 }, { "epoch": 3.3222148098732487, "ref_ce_loss": 0.11220888048410416, "step": 9960 }, { "epoch": 3.3222148098732487, "loss": 1.3047645092010498, "step": 9960 }, { "ce_loss": 0.1143212616443634, "epoch": 3.3222148098732487, "step": 9960 }, { "distill_loss": 0.12109558284282684, "epoch": 3.3222148098732487, "step": 9960 }, { "epoch": 3.3222148098732487, "ref_ce_loss": 0.1020015999674797, "step": 9960 }, { "epoch": 3.3222148098732487, "loss": 0.42227429151535034, "step": 9960 }, { "ce_loss": 0.0887669026851654, "epoch": 3.3222148098732487, "step": 9960 }, { "distill_loss": 0.14309155941009521, "epoch": 3.3222148098732487, "step": 9960 }, { "epoch": 3.3222148098732487, "ref_ce_loss": 0.11570673435926437, "step": 9960 }, { "epoch": 3.3222148098732487, "loss": 0.349815309047699, "step": 9960 }, { "ce_loss": 0.08138938993215561, "epoch": 3.3222148098732487, "step": 9960 }, { "distill_loss": 0.11505304276943207, "epoch": 3.3222148098732487, "step": 9960 }, { "epoch": 3.3222148098732487, "ref_ce_loss": 0.11736801266670227, "step": 9960 }, { "epoch": 3.325550366911274, "loss": 0.5193, "step": 9970 }, { "epoch": 3.325550366911274, "grad_norm": 6.900662422180176, "step": 9970 }, { "epoch": 3.325550366911274, "learning_rate": 0.0001974223518909873, "step": 9970 }, { "epoch": 3.325550366911274, "loss": 0.652093231678009, "step": 9970 }, { "ce_loss": 0.28630322217941284, "epoch": 3.325550366911274, "step": 9970 }, { "distill_loss": 0.23069880902767181, "epoch": 3.325550366911274, "step": 9970 }, { "epoch": 3.325550366911274, "ref_ce_loss": 0.13491539657115936, "step": 9970 }, { "epoch": 3.325550366911274, "loss": 0.5203249454498291, "step": 9970 }, { "ce_loss": 0.0266736950725317, "epoch": 3.325550366911274, "step": 9970 }, { "distill_loss": 0.08863398432731628, "epoch": 3.325550366911274, "step": 9970 }, { "epoch": 3.325550366911274, "ref_ce_loss": 0.10284887999296188, "step": 9970 }, { "epoch": 3.325550366911274, "loss": 0.3818473815917969, "step": 9970 }, { "ce_loss": 0.12291661649942398, "epoch": 3.325550366911274, "step": 9970 }, { "distill_loss": 0.11008165031671524, "epoch": 3.325550366911274, "step": 9970 }, { "epoch": 3.325550366911274, "ref_ce_loss": 0.12098938971757889, "step": 9970 }, { "epoch": 3.325550366911274, "loss": 0.29974690079689026, "step": 9970 }, { "ce_loss": 0.1182885468006134, "epoch": 3.325550366911274, "step": 9970 }, { "distill_loss": 0.10031406581401825, "epoch": 3.325550366911274, "step": 9970 }, { "epoch": 3.325550366911274, "ref_ce_loss": 0.0809178575873375, "step": 9970 }, { "epoch": 3.3288859239492994, "loss": 0.4931, "step": 9980 }, { "epoch": 3.3288859239492994, "grad_norm": 3.05298113822937, "step": 9980 }, { "epoch": 3.3288859239492994, "learning_rate": 0.00019723013675254557, "step": 9980 }, { "epoch": 3.3288859239492994, "loss": 0.33277106285095215, "step": 9980 }, { "ce_loss": 0.09598188102245331, "epoch": 3.3288859239492994, "step": 9980 }, { "distill_loss": 0.10369281470775604, "epoch": 3.3288859239492994, "step": 9980 }, { "epoch": 3.3288859239492994, "ref_ce_loss": 0.10251549631357193, "step": 9980 }, { "epoch": 3.3288859239492994, "loss": 0.3862980604171753, "step": 9980 }, { "ce_loss": 0.15732648968696594, "epoch": 3.3288859239492994, "step": 9980 }, { "distill_loss": 0.09723378717899323, "epoch": 3.3288859239492994, "step": 9980 }, { "epoch": 3.3288859239492994, "ref_ce_loss": 0.13168202340602875, "step": 9980 }, { "epoch": 3.3288859239492994, "loss": 0.3247963786125183, "step": 9980 }, { "ce_loss": 0.13272802531719208, "epoch": 3.3288859239492994, "step": 9980 }, { "distill_loss": 0.11012163758277893, "epoch": 3.3288859239492994, "step": 9980 }, { "epoch": 3.3288859239492994, "ref_ce_loss": 0.0818270593881607, "step": 9980 }, { "epoch": 3.3288859239492994, "loss": 0.3600651025772095, "step": 9980 }, { "ce_loss": 0.12932313978672028, "epoch": 3.3288859239492994, "step": 9980 }, { "distill_loss": 0.1150616928935051, "epoch": 3.3288859239492994, "step": 9980 }, { "epoch": 3.3288859239492994, "ref_ce_loss": 0.08246026933193207, "step": 9980 }, { "epoch": 3.3322214809873247, "loss": 0.464, "step": 9990 }, { "epoch": 3.3322214809873247, "grad_norm": 3.1215741634368896, "step": 9990 }, { "epoch": 3.3322214809873247, "learning_rate": 0.00019703783548496515, "step": 9990 }, { "epoch": 3.3322214809873247, "loss": 0.41121190786361694, "step": 9990 }, { "ce_loss": 0.1616680771112442, "epoch": 3.3322214809873247, "step": 9990 }, { "distill_loss": 0.12587764859199524, "epoch": 3.3322214809873247, "step": 9990 }, { "epoch": 3.3322214809873247, "ref_ce_loss": 0.09707213193178177, "step": 9990 }, { "epoch": 3.3322214809873247, "loss": 0.6120696067810059, "step": 9990 }, { "ce_loss": 0.1784650683403015, "epoch": 3.3322214809873247, "step": 9990 }, { "distill_loss": 0.11934734135866165, "epoch": 3.3322214809873247, "step": 9990 }, { "epoch": 3.3322214809873247, "ref_ce_loss": 0.10499797761440277, "step": 9990 }, { "epoch": 3.3322214809873247, "loss": 0.6155219078063965, "step": 9990 }, { "ce_loss": 0.12345847487449646, "epoch": 3.3322214809873247, "step": 9990 }, { "distill_loss": 0.10840432345867157, "epoch": 3.3322214809873247, "step": 9990 }, { "epoch": 3.3322214809873247, "ref_ce_loss": 0.13200382888317108, "step": 9990 }, { "epoch": 3.3322214809873247, "loss": 0.3213757276535034, "step": 9990 }, { "ce_loss": 0.049462899565696716, "epoch": 3.3322214809873247, "step": 9990 }, { "distill_loss": 0.06645163893699646, "epoch": 3.3322214809873247, "step": 9990 }, { "epoch": 3.3322214809873247, "ref_ce_loss": 0.07780586183071136, "step": 9990 }, { "epoch": 3.33555703802535, "loss": 0.4702, "step": 10000 }, { "epoch": 3.33555703802535, "grad_norm": 2.5109341144561768, "step": 10000 }, { "epoch": 3.33555703802535, "learning_rate": 0.00019684544843892772, "step": 10000 }, { "epoch": 3.33555703802535, "loss": 0.384892076253891, "step": 10000 }, { "ce_loss": 0.12277370691299438, "epoch": 3.33555703802535, "step": 10000 }, { "distill_loss": 0.11243809014558792, "epoch": 3.33555703802535, "step": 10000 }, { "epoch": 3.33555703802535, "ref_ce_loss": 0.10022447258234024, "step": 10000 }, { "epoch": 3.33555703802535, "loss": 0.31973639130592346, "step": 10000 }, { "ce_loss": 0.11735039204359055, "epoch": 3.33555703802535, "step": 10000 }, { "distill_loss": 0.12508799135684967, "epoch": 3.33555703802535, "step": 10000 }, { "epoch": 3.33555703802535, "ref_ce_loss": 0.07725123316049576, "step": 10000 }, { "epoch": 3.33555703802535, "loss": 0.30221205949783325, "step": 10000 }, { "ce_loss": 0.09164729714393616, "epoch": 3.33555703802535, "step": 10000 }, { "distill_loss": 0.07754302769899368, "epoch": 3.33555703802535, "step": 10000 }, { "epoch": 3.33555703802535, "ref_ce_loss": 0.10495035350322723, "step": 10000 }, { "epoch": 3.33555703802535, "loss": 0.5154871940612793, "step": 10000 }, { "ce_loss": 0.1592252552509308, "epoch": 3.33555703802535, "step": 10000 }, { "distill_loss": 0.12933585047721863, "epoch": 3.33555703802535, "step": 10000 }, { "epoch": 3.33555703802535, "ref_ce_loss": 0.13211289048194885, "step": 10000 }, { "epoch": 3.3388925950633754, "loss": 0.5013, "step": 10010 }, { "epoch": 3.3388925950633754, "grad_norm": 2.0777783393859863, "step": 10010 }, { "epoch": 3.3388925950633754, "learning_rate": 0.0001966529759652714, "step": 10010 }, { "epoch": 3.3388925950633754, "loss": 1.3123657703399658, "step": 10010 }, { "ce_loss": 0.20345760881900787, "epoch": 3.3388925950633754, "step": 10010 }, { "distill_loss": 0.13194897770881653, "epoch": 3.3388925950633754, "step": 10010 }, { "epoch": 3.3388925950633754, "ref_ce_loss": 0.09893735498189926, "step": 10010 }, { "epoch": 3.3388925950633754, "loss": 0.5096614360809326, "step": 10010 }, { "ce_loss": 0.13056215643882751, "epoch": 3.3388925950633754, "step": 10010 }, { "distill_loss": 0.09500642120838165, "epoch": 3.3388925950633754, "step": 10010 }, { "epoch": 3.3388925950633754, "ref_ce_loss": 0.11117087304592133, "step": 10010 }, { "epoch": 3.3388925950633754, "loss": 0.45446115732192993, "step": 10010 }, { "ce_loss": 0.1598779857158661, "epoch": 3.3388925950633754, "step": 10010 }, { "distill_loss": 0.10420762747526169, "epoch": 3.3388925950633754, "step": 10010 }, { "epoch": 3.3388925950633754, "ref_ce_loss": 0.11016450077295303, "step": 10010 }, { "epoch": 3.3388925950633754, "loss": 0.5303270816802979, "step": 10010 }, { "ce_loss": 0.14446771144866943, "epoch": 3.3388925950633754, "step": 10010 }, { "distill_loss": 0.1430281698703766, "epoch": 3.3388925950633754, "step": 10010 }, { "epoch": 3.3388925950633754, "ref_ce_loss": 0.13635635375976562, "step": 10010 }, { "epoch": 3.342228152101401, "loss": 0.521, "step": 10020 }, { "epoch": 3.342228152101401, "grad_norm": 3.594378709793091, "step": 10020 }, { "epoch": 3.342228152101401, "learning_rate": 0.00019646041841499, "step": 10020 }, { "epoch": 3.342228152101401, "loss": 0.538306474685669, "step": 10020 }, { "ce_loss": 0.11759919673204422, "epoch": 3.342228152101401, "step": 10020 }, { "distill_loss": 0.08925510942935944, "epoch": 3.342228152101401, "step": 10020 }, { "epoch": 3.342228152101401, "ref_ce_loss": 0.10188145190477371, "step": 10020 }, { "epoch": 3.342228152101401, "loss": 0.37550702691078186, "step": 10020 }, { "ce_loss": 0.10454921424388885, "epoch": 3.342228152101401, "step": 10020 }, { "distill_loss": 0.0822499692440033, "epoch": 3.342228152101401, "step": 10020 }, { "epoch": 3.342228152101401, "ref_ce_loss": 0.13304555416107178, "step": 10020 }, { "epoch": 3.342228152101401, "loss": 0.5278544425964355, "step": 10020 }, { "ce_loss": 0.08236437290906906, "epoch": 3.342228152101401, "step": 10020 }, { "distill_loss": 0.09529929608106613, "epoch": 3.342228152101401, "step": 10020 }, { "epoch": 3.342228152101401, "ref_ce_loss": 0.07882020622491837, "step": 10020 }, { "epoch": 3.342228152101401, "loss": 0.43094292283058167, "step": 10020 }, { "ce_loss": 0.10031628608703613, "epoch": 3.342228152101401, "step": 10020 }, { "distill_loss": 0.08027870208024979, "epoch": 3.342228152101401, "step": 10020 }, { "epoch": 3.342228152101401, "ref_ce_loss": 0.07536637783050537, "step": 10020 }, { "epoch": 3.345563709139426, "loss": 0.4727, "step": 10030 }, { "epoch": 3.345563709139426, "grad_norm": 2.131840944290161, "step": 10030 }, { "epoch": 3.345563709139426, "learning_rate": 0.00019626777613923255, "step": 10030 }, { "epoch": 3.345563709139426, "loss": 0.2794293761253357, "step": 10030 }, { "ce_loss": 0.0968615934252739, "epoch": 3.345563709139426, "step": 10030 }, { "distill_loss": 0.09702742844820023, "epoch": 3.345563709139426, "step": 10030 }, { "epoch": 3.345563709139426, "ref_ce_loss": 0.06158556044101715, "step": 10030 }, { "epoch": 3.345563709139426, "loss": 0.5944832563400269, "step": 10030 }, { "ce_loss": 0.2411114126443863, "epoch": 3.345563709139426, "step": 10030 }, { "distill_loss": 0.15547247231006622, "epoch": 3.345563709139426, "step": 10030 }, { "epoch": 3.345563709139426, "ref_ce_loss": 0.1431601643562317, "step": 10030 }, { "epoch": 3.345563709139426, "loss": 0.36009153723716736, "step": 10030 }, { "ce_loss": 0.11186462640762329, "epoch": 3.345563709139426, "step": 10030 }, { "distill_loss": 0.11474623531103134, "epoch": 3.345563709139426, "step": 10030 }, { "epoch": 3.345563709139426, "ref_ce_loss": 0.1330903023481369, "step": 10030 }, { "epoch": 3.345563709139426, "loss": 0.4779060482978821, "step": 10030 }, { "ce_loss": 0.13936035335063934, "epoch": 3.345563709139426, "step": 10030 }, { "distill_loss": 0.13349001109600067, "epoch": 3.345563709139426, "step": 10030 }, { "epoch": 3.345563709139426, "ref_ce_loss": 0.1413579285144806, "step": 10030 }, { "epoch": 3.3488992661774515, "loss": 0.4775, "step": 10040 }, { "epoch": 3.3488992661774515, "grad_norm": 2.7397146224975586, "step": 10040 }, { "epoch": 3.3488992661774515, "learning_rate": 0.00019607504948930253, "step": 10040 }, { "epoch": 3.3488992661774515, "loss": 0.500059187412262, "step": 10040 }, { "ce_loss": 0.17873555421829224, "epoch": 3.3488992661774515, "step": 10040 }, { "distill_loss": 0.1275375485420227, "epoch": 3.3488992661774515, "step": 10040 }, { "epoch": 3.3488992661774515, "ref_ce_loss": 0.1627815067768097, "step": 10040 }, { "epoch": 3.3488992661774515, "loss": 0.3630131781101227, "step": 10040 }, { "ce_loss": 0.09446496516466141, "epoch": 3.3488992661774515, "step": 10040 }, { "distill_loss": 0.15339601039886475, "epoch": 3.3488992661774515, "step": 10040 }, { "epoch": 3.3488992661774515, "ref_ce_loss": 0.11503797024488449, "step": 10040 }, { "epoch": 3.3488992661774515, "loss": 0.3327060639858246, "step": 10040 }, { "ce_loss": 0.07894602417945862, "epoch": 3.3488992661774515, "step": 10040 }, { "distill_loss": 0.09880810230970383, "epoch": 3.3488992661774515, "step": 10040 }, { "epoch": 3.3488992661774515, "ref_ce_loss": 0.0752006471157074, "step": 10040 }, { "epoch": 3.3488992661774515, "loss": 0.4440487027168274, "step": 10040 }, { "ce_loss": 0.10263817757368088, "epoch": 3.3488992661774515, "step": 10040 }, { "distill_loss": 0.11173877120018005, "epoch": 3.3488992661774515, "step": 10040 }, { "epoch": 3.3488992661774515, "ref_ce_loss": 0.08284956961870193, "step": 10040 }, { "epoch": 3.352234823215477, "loss": 0.4764, "step": 10050 }, { "epoch": 3.352234823215477, "grad_norm": 3.3203084468841553, "step": 10050 }, { "epoch": 3.352234823215477, "learning_rate": 0.0001958822388166574, "step": 10050 }, { "epoch": 3.352234823215477, "loss": 0.4372074007987976, "step": 10050 }, { "ce_loss": 0.14007893204689026, "epoch": 3.352234823215477, "step": 10050 }, { "distill_loss": 0.11357307434082031, "epoch": 3.352234823215477, "step": 10050 }, { "epoch": 3.352234823215477, "ref_ce_loss": 0.09972081333398819, "step": 10050 }, { "epoch": 3.352234823215477, "loss": 0.6508349776268005, "step": 10050 }, { "ce_loss": 0.1959071010351181, "epoch": 3.352234823215477, "step": 10050 }, { "distill_loss": 0.15428954362869263, "epoch": 3.352234823215477, "step": 10050 }, { "epoch": 3.352234823215477, "ref_ce_loss": 0.1650163233280182, "step": 10050 }, { "epoch": 3.352234823215477, "loss": 0.3119848966598511, "step": 10050 }, { "ce_loss": 0.12638384103775024, "epoch": 3.352234823215477, "step": 10050 }, { "distill_loss": 0.08100803196430206, "epoch": 3.352234823215477, "step": 10050 }, { "epoch": 3.352234823215477, "ref_ce_loss": 0.07511499524116516, "step": 10050 }, { "epoch": 3.352234823215477, "loss": 0.5597946643829346, "step": 10050 }, { "ce_loss": 0.18874359130859375, "epoch": 3.352234823215477, "step": 10050 }, { "distill_loss": 0.12405283004045486, "epoch": 3.352234823215477, "step": 10050 }, { "epoch": 3.352234823215477, "ref_ce_loss": 0.1353001892566681, "step": 10050 }, { "epoch": 3.355570380253502, "loss": 0.4504, "step": 10060 }, { "epoch": 3.355570380253502, "grad_norm": 3.4764153957366943, "step": 10060 }, { "epoch": 3.355570380253502, "learning_rate": 0.00019568934447290775, "step": 10060 }, { "epoch": 3.355570380253502, "loss": 0.31696444749832153, "step": 10060 }, { "ce_loss": 0.13871832191944122, "epoch": 3.355570380253502, "step": 10060 }, { "distill_loss": 0.10438908636569977, "epoch": 3.355570380253502, "step": 10060 }, { "epoch": 3.355570380253502, "ref_ce_loss": 0.07383356243371964, "step": 10060 }, { "epoch": 3.355570380253502, "loss": 0.6949265599250793, "step": 10060 }, { "ce_loss": 0.15461240708827972, "epoch": 3.355570380253502, "step": 10060 }, { "distill_loss": 0.11820879578590393, "epoch": 3.355570380253502, "step": 10060 }, { "epoch": 3.355570380253502, "ref_ce_loss": 0.12217464298009872, "step": 10060 }, { "epoch": 3.355570380253502, "loss": 0.232447549700737, "step": 10060 }, { "ce_loss": 0.08912397176027298, "epoch": 3.355570380253502, "step": 10060 }, { "distill_loss": 0.08207815885543823, "epoch": 3.355570380253502, "step": 10060 }, { "epoch": 3.355570380253502, "ref_ce_loss": 0.06119540333747864, "step": 10060 }, { "epoch": 3.355570380253502, "loss": 0.5104449987411499, "step": 10060 }, { "ce_loss": 0.1363963633775711, "epoch": 3.355570380253502, "step": 10060 }, { "distill_loss": 0.1245986744761467, "epoch": 3.355570380253502, "step": 10060 }, { "epoch": 3.355570380253502, "ref_ce_loss": 0.10693434625864029, "step": 10060 }, { "epoch": 3.3589059372915275, "loss": 0.5335, "step": 10070 }, { "epoch": 3.3589059372915275, "grad_norm": 3.6948611736297607, "step": 10070 }, { "epoch": 3.3589059372915275, "learning_rate": 0.00019549636680981673, "step": 10070 }, { "epoch": 3.3589059372915275, "loss": 0.38189783692359924, "step": 10070 }, { "ce_loss": 0.10184728354215622, "epoch": 3.3589059372915275, "step": 10070 }, { "distill_loss": 0.137140691280365, "epoch": 3.3589059372915275, "step": 10070 }, { "epoch": 3.3589059372915275, "ref_ce_loss": 0.10864861309528351, "step": 10070 }, { "epoch": 3.3589059372915275, "loss": 0.4603230953216553, "step": 10070 }, { "ce_loss": 0.11181650310754776, "epoch": 3.3589059372915275, "step": 10070 }, { "distill_loss": 0.13628484308719635, "epoch": 3.3589059372915275, "step": 10070 }, { "epoch": 3.3589059372915275, "ref_ce_loss": 0.11302220821380615, "step": 10070 }, { "epoch": 3.3589059372915275, "loss": 0.4682135581970215, "step": 10070 }, { "ce_loss": 0.15053050220012665, "epoch": 3.3589059372915275, "step": 10070 }, { "distill_loss": 0.16319218277931213, "epoch": 3.3589059372915275, "step": 10070 }, { "epoch": 3.3589059372915275, "ref_ce_loss": 0.07172270119190216, "step": 10070 }, { "epoch": 3.3589059372915275, "loss": 0.20223474502563477, "step": 10070 }, { "ce_loss": 0.047858357429504395, "epoch": 3.3589059372915275, "step": 10070 }, { "distill_loss": 0.0959976464509964, "epoch": 3.3589059372915275, "step": 10070 }, { "epoch": 3.3589059372915275, "ref_ce_loss": 0.058139316737651825, "step": 10070 }, { "epoch": 3.362241494329553, "loss": 0.4468, "step": 10080 }, { "epoch": 3.362241494329553, "grad_norm": 13.602625846862793, "step": 10080 }, { "epoch": 3.362241494329553, "learning_rate": 0.00019530330617929952, "step": 10080 }, { "epoch": 3.362241494329553, "loss": 0.28368741273880005, "step": 10080 }, { "ce_loss": 0.06638386845588684, "epoch": 3.362241494329553, "step": 10080 }, { "distill_loss": 0.10312242805957794, "epoch": 3.362241494329553, "step": 10080 }, { "epoch": 3.362241494329553, "ref_ce_loss": 0.07346618920564651, "step": 10080 }, { "epoch": 3.362241494329553, "loss": 0.6166037321090698, "step": 10080 }, { "ce_loss": 0.1991272121667862, "epoch": 3.362241494329553, "step": 10080 }, { "distill_loss": 0.11051832139492035, "epoch": 3.362241494329553, "step": 10080 }, { "epoch": 3.362241494329553, "ref_ce_loss": 0.11945595592260361, "step": 10080 }, { "epoch": 3.362241494329553, "loss": 0.4137451946735382, "step": 10080 }, { "ce_loss": 0.16128048300743103, "epoch": 3.362241494329553, "step": 10080 }, { "distill_loss": 0.14707604050636292, "epoch": 3.362241494329553, "step": 10080 }, { "epoch": 3.362241494329553, "ref_ce_loss": 0.08171996474266052, "step": 10080 }, { "epoch": 3.362241494329553, "loss": 0.2727615237236023, "step": 10080 }, { "ce_loss": 0.049319345504045486, "epoch": 3.362241494329553, "step": 10080 }, { "distill_loss": 0.10266780853271484, "epoch": 3.362241494329553, "step": 10080 }, { "epoch": 3.362241494329553, "ref_ce_loss": 0.07126244157552719, "step": 10080 }, { "epoch": 3.3655770513675782, "loss": 0.4974, "step": 10090 }, { "epoch": 3.3655770513675782, "grad_norm": 6.224396228790283, "step": 10090 }, { "epoch": 3.3655770513675782, "learning_rate": 0.0001951101629334225, "step": 10090 }, { "epoch": 3.3655770513675782, "loss": 0.5048408508300781, "step": 10090 }, { "ce_loss": 0.11753513664007187, "epoch": 3.3655770513675782, "step": 10090 }, { "distill_loss": 0.14925657212734222, "epoch": 3.3655770513675782, "step": 10090 }, { "epoch": 3.3655770513675782, "ref_ce_loss": 0.09866927564144135, "step": 10090 }, { "epoch": 3.3655770513675782, "loss": 0.4639674425125122, "step": 10090 }, { "ce_loss": 0.12178202718496323, "epoch": 3.3655770513675782, "step": 10090 }, { "distill_loss": 0.12596401572227478, "epoch": 3.3655770513675782, "step": 10090 }, { "epoch": 3.3655770513675782, "ref_ce_loss": 0.08418355882167816, "step": 10090 }, { "epoch": 3.3655770513675782, "loss": 0.47464287281036377, "step": 10090 }, { "ce_loss": 0.16472110152244568, "epoch": 3.3655770513675782, "step": 10090 }, { "distill_loss": 0.13395604491233826, "epoch": 3.3655770513675782, "step": 10090 }, { "epoch": 3.3655770513675782, "ref_ce_loss": 0.11161396652460098, "step": 10090 }, { "epoch": 3.3655770513675782, "loss": 0.485629677772522, "step": 10090 }, { "ce_loss": 0.09048180282115936, "epoch": 3.3655770513675782, "step": 10090 }, { "distill_loss": 0.10109473764896393, "epoch": 3.3655770513675782, "step": 10090 }, { "epoch": 3.3655770513675782, "ref_ce_loss": 0.11090537160634995, "step": 10090 }, { "epoch": 3.3689126084056036, "loss": 0.4875, "step": 10100 }, { "epoch": 3.3689126084056036, "grad_norm": 2.4241278171539307, "step": 10100 }, { "epoch": 3.3689126084056036, "learning_rate": 0.0001949169374244028, "step": 10100 }, { "epoch": 3.3689126084056036, "loss": 1.1379024982452393, "step": 10100 }, { "ce_loss": 0.20646634697914124, "epoch": 3.3689126084056036, "step": 10100 }, { "distill_loss": 0.1389389932155609, "epoch": 3.3689126084056036, "step": 10100 }, { "epoch": 3.3689126084056036, "ref_ce_loss": 0.14189954102039337, "step": 10100 }, { "epoch": 3.3689126084056036, "loss": 0.3208090364933014, "step": 10100 }, { "ce_loss": 0.08959181606769562, "epoch": 3.3689126084056036, "step": 10100 }, { "distill_loss": 0.12985199689865112, "epoch": 3.3689126084056036, "step": 10100 }, { "epoch": 3.3689126084056036, "ref_ce_loss": 0.06652584671974182, "step": 10100 }, { "epoch": 3.3689126084056036, "loss": 0.4916841983795166, "step": 10100 }, { "ce_loss": 0.12754502892494202, "epoch": 3.3689126084056036, "step": 10100 }, { "distill_loss": 0.13923802971839905, "epoch": 3.3689126084056036, "step": 10100 }, { "epoch": 3.3689126084056036, "ref_ce_loss": 0.12598656117916107, "step": 10100 }, { "epoch": 3.3689126084056036, "loss": 0.2617308497428894, "step": 10100 }, { "ce_loss": 0.06644704192876816, "epoch": 3.3689126084056036, "step": 10100 }, { "distill_loss": 0.09629429876804352, "epoch": 3.3689126084056036, "step": 10100 }, { "epoch": 3.3689126084056036, "ref_ce_loss": 0.06436831504106522, "step": 10100 }, { "epoch": 3.372248165443629, "loss": 0.5084, "step": 10110 }, { "epoch": 3.372248165443629, "grad_norm": 2.8610148429870605, "step": 10110 }, { "epoch": 3.372248165443629, "learning_rate": 0.00019472363000460756, "step": 10110 }, { "epoch": 3.372248165443629, "loss": 0.46414175629615784, "step": 10110 }, { "ce_loss": 0.1390947699546814, "epoch": 3.372248165443629, "step": 10110 }, { "distill_loss": 0.12784691154956818, "epoch": 3.372248165443629, "step": 10110 }, { "epoch": 3.372248165443629, "ref_ce_loss": 0.10234922170639038, "step": 10110 }, { "epoch": 3.372248165443629, "loss": 0.25928565859794617, "step": 10110 }, { "ce_loss": 0.08007047325372696, "epoch": 3.372248165443629, "step": 10110 }, { "distill_loss": 0.10526285320520401, "epoch": 3.372248165443629, "step": 10110 }, { "epoch": 3.372248165443629, "ref_ce_loss": 0.07375837862491608, "step": 10110 }, { "epoch": 3.372248165443629, "loss": 0.30820897221565247, "step": 10110 }, { "ce_loss": 0.12451837211847305, "epoch": 3.372248165443629, "step": 10110 }, { "distill_loss": 0.09478746354579926, "epoch": 3.372248165443629, "step": 10110 }, { "epoch": 3.372248165443629, "ref_ce_loss": 0.06415072828531265, "step": 10110 }, { "epoch": 3.372248165443629, "loss": 0.3212338984012604, "step": 10110 }, { "ce_loss": 0.09792843461036682, "epoch": 3.372248165443629, "step": 10110 }, { "distill_loss": 0.09361623972654343, "epoch": 3.372248165443629, "step": 10110 }, { "epoch": 3.372248165443629, "ref_ce_loss": 0.06124284863471985, "step": 10110 }, { "epoch": 3.3755837224816543, "loss": 0.5001, "step": 10120 }, { "epoch": 3.3755837224816543, "grad_norm": 2.521216869354248, "step": 10120 }, { "epoch": 3.3755837224816543, "learning_rate": 0.00019453024102655326, "step": 10120 }, { "epoch": 3.3755837224816543, "loss": 0.3180510103702545, "step": 10120 }, { "ce_loss": 0.10240530967712402, "epoch": 3.3755837224816543, "step": 10120 }, { "distill_loss": 0.11712464690208435, "epoch": 3.3755837224816543, "step": 10120 }, { "epoch": 3.3755837224816543, "ref_ce_loss": 0.0984673723578453, "step": 10120 }, { "epoch": 3.3755837224816543, "loss": 0.6909222602844238, "step": 10120 }, { "ce_loss": 0.1457805633544922, "epoch": 3.3755837224816543, "step": 10120 }, { "distill_loss": 0.1715020090341568, "epoch": 3.3755837224816543, "step": 10120 }, { "epoch": 3.3755837224816543, "ref_ce_loss": 0.10833510011434555, "step": 10120 }, { "epoch": 3.3755837224816543, "loss": 0.4989069402217865, "step": 10120 }, { "ce_loss": 0.2270646095275879, "epoch": 3.3755837224816543, "step": 10120 }, { "distill_loss": 0.18987296521663666, "epoch": 3.3755837224816543, "step": 10120 }, { "epoch": 3.3755837224816543, "ref_ce_loss": 0.08195262402296066, "step": 10120 }, { "epoch": 3.3755837224816543, "loss": 0.38403308391571045, "step": 10120 }, { "ce_loss": 0.11043936759233475, "epoch": 3.3755837224816543, "step": 10120 }, { "distill_loss": 0.12119657546281815, "epoch": 3.3755837224816543, "step": 10120 }, { "epoch": 3.3755837224816543, "ref_ce_loss": 0.10974656790494919, "step": 10120 }, { "epoch": 3.3789192795196796, "loss": 0.4831, "step": 10130 }, { "epoch": 3.3789192795196796, "grad_norm": 3.5986907482147217, "step": 10130 }, { "epoch": 3.3789192795196796, "learning_rate": 0.00019433677084290497, "step": 10130 }, { "epoch": 3.3789192795196796, "loss": 0.30134570598602295, "step": 10130 }, { "ce_loss": 0.07178352773189545, "epoch": 3.3789192795196796, "step": 10130 }, { "distill_loss": 0.0984201729297638, "epoch": 3.3789192795196796, "step": 10130 }, { "epoch": 3.3789192795196796, "ref_ce_loss": 0.0872310921549797, "step": 10130 }, { "epoch": 3.3789192795196796, "loss": 0.7953840494155884, "step": 10130 }, { "ce_loss": 0.1972198635339737, "epoch": 3.3789192795196796, "step": 10130 }, { "distill_loss": 0.13514646887779236, "epoch": 3.3789192795196796, "step": 10130 }, { "epoch": 3.3789192795196796, "ref_ce_loss": 0.15314488112926483, "step": 10130 }, { "epoch": 3.3789192795196796, "loss": 0.4279595613479614, "step": 10130 }, { "ce_loss": 0.1257810741662979, "epoch": 3.3789192795196796, "step": 10130 }, { "distill_loss": 0.1260966956615448, "epoch": 3.3789192795196796, "step": 10130 }, { "epoch": 3.3789192795196796, "ref_ce_loss": 0.10996173322200775, "step": 10130 }, { "epoch": 3.3789192795196796, "loss": 0.23371756076812744, "step": 10130 }, { "ce_loss": 0.037545885890722275, "epoch": 3.3789192795196796, "step": 10130 }, { "distill_loss": 0.11940938234329224, "epoch": 3.3789192795196796, "step": 10130 }, { "epoch": 3.3789192795196796, "ref_ce_loss": 0.07670680433511734, "step": 10130 }, { "epoch": 3.382254836557705, "loss": 0.517, "step": 10140 }, { "epoch": 3.382254836557705, "grad_norm": 3.4739842414855957, "step": 10140 }, { "epoch": 3.382254836557705, "learning_rate": 0.00019414321980647616, "step": 10140 }, { "epoch": 3.382254836557705, "loss": 0.2659488022327423, "step": 10140 }, { "ce_loss": 0.05423908680677414, "epoch": 3.382254836557705, "step": 10140 }, { "distill_loss": 0.11070837080478668, "epoch": 3.382254836557705, "step": 10140 }, { "epoch": 3.382254836557705, "ref_ce_loss": 0.10095931589603424, "step": 10140 }, { "epoch": 3.382254836557705, "loss": 0.4671019911766052, "step": 10140 }, { "ce_loss": 0.08684322983026505, "epoch": 3.382254836557705, "step": 10140 }, { "distill_loss": 0.17755573987960815, "epoch": 3.382254836557705, "step": 10140 }, { "epoch": 3.382254836557705, "ref_ce_loss": 0.10452690720558167, "step": 10140 }, { "epoch": 3.382254836557705, "loss": 0.6300301551818848, "step": 10140 }, { "ce_loss": 0.1521824449300766, "epoch": 3.382254836557705, "step": 10140 }, { "distill_loss": 0.16065070033073425, "epoch": 3.382254836557705, "step": 10140 }, { "epoch": 3.382254836557705, "ref_ce_loss": 0.10154230147600174, "step": 10140 }, { "epoch": 3.382254836557705, "loss": 0.33878257870674133, "step": 10140 }, { "ce_loss": 0.10511957108974457, "epoch": 3.382254836557705, "step": 10140 }, { "distill_loss": 0.1427706480026245, "epoch": 3.382254836557705, "step": 10140 }, { "epoch": 3.382254836557705, "ref_ce_loss": 0.08972452580928802, "step": 10140 }, { "epoch": 3.3855903935957303, "loss": 0.4769, "step": 10150 }, { "epoch": 3.3855903935957303, "grad_norm": 2.4133687019348145, "step": 10150 }, { "epoch": 3.3855903935957303, "learning_rate": 0.0001939495882702275, "step": 10150 }, { "epoch": 3.3855903935957303, "loss": 0.3416212797164917, "step": 10150 }, { "ce_loss": 0.11643853038549423, "epoch": 3.3855903935957303, "step": 10150 }, { "distill_loss": 0.13698649406433105, "epoch": 3.3855903935957303, "step": 10150 }, { "epoch": 3.3855903935957303, "ref_ce_loss": 0.0881284549832344, "step": 10150 }, { "epoch": 3.3855903935957303, "loss": 0.5176674127578735, "step": 10150 }, { "ce_loss": 0.18818378448486328, "epoch": 3.3855903935957303, "step": 10150 }, { "distill_loss": 0.12774789333343506, "epoch": 3.3855903935957303, "step": 10150 }, { "epoch": 3.3855903935957303, "ref_ce_loss": 0.12273141741752625, "step": 10150 }, { "epoch": 3.3855903935957303, "loss": 0.4802161157131195, "step": 10150 }, { "ce_loss": 0.10690363496541977, "epoch": 3.3855903935957303, "step": 10150 }, { "distill_loss": 0.18461987376213074, "epoch": 3.3855903935957303, "step": 10150 }, { "epoch": 3.3855903935957303, "ref_ce_loss": 0.12223734706640244, "step": 10150 }, { "epoch": 3.3855903935957303, "loss": 0.5595582723617554, "step": 10150 }, { "ce_loss": 0.20622768998146057, "epoch": 3.3855903935957303, "step": 10150 }, { "distill_loss": 0.2150389701128006, "epoch": 3.3855903935957303, "step": 10150 }, { "epoch": 3.3855903935957303, "ref_ce_loss": 0.09231989085674286, "step": 10150 }, { "epoch": 3.3889259506337557, "loss": 0.4572, "step": 10160 }, { "epoch": 3.3889259506337557, "grad_norm": 2.9398603439331055, "step": 10160 }, { "epoch": 3.3889259506337557, "learning_rate": 0.0001937558765872665, "step": 10160 }, { "epoch": 3.3889259506337557, "loss": 0.5103137493133545, "step": 10160 }, { "ce_loss": 0.07832441478967667, "epoch": 3.3889259506337557, "step": 10160 }, { "distill_loss": 0.16051746904850006, "epoch": 3.3889259506337557, "step": 10160 }, { "epoch": 3.3889259506337557, "ref_ce_loss": 0.06761564314365387, "step": 10160 }, { "epoch": 3.3889259506337557, "loss": 0.380056232213974, "step": 10160 }, { "ce_loss": 0.1041548103094101, "epoch": 3.3889259506337557, "step": 10160 }, { "distill_loss": 0.15851038694381714, "epoch": 3.3889259506337557, "step": 10160 }, { "epoch": 3.3889259506337557, "ref_ce_loss": 0.11733871698379517, "step": 10160 }, { "epoch": 3.3889259506337557, "loss": 0.37477728724479675, "step": 10160 }, { "ce_loss": 0.10600591450929642, "epoch": 3.3889259506337557, "step": 10160 }, { "distill_loss": 0.12407410144805908, "epoch": 3.3889259506337557, "step": 10160 }, { "epoch": 3.3889259506337557, "ref_ce_loss": 0.1121428906917572, "step": 10160 }, { "epoch": 3.3889259506337557, "loss": 0.6696457862854004, "step": 10160 }, { "ce_loss": 0.17717599868774414, "epoch": 3.3889259506337557, "step": 10160 }, { "distill_loss": 0.16447025537490845, "epoch": 3.3889259506337557, "step": 10160 }, { "epoch": 3.3889259506337557, "ref_ce_loss": 0.11270187050104141, "step": 10160 }, { "epoch": 3.392261507671781, "loss": 0.4992, "step": 10170 }, { "epoch": 3.392261507671781, "grad_norm": 2.9642333984375, "step": 10170 }, { "epoch": 3.392261507671781, "learning_rate": 0.00019356208511084693, "step": 10170 }, { "epoch": 3.392261507671781, "loss": 0.8481308221817017, "step": 10170 }, { "ce_loss": 0.23037905991077423, "epoch": 3.392261507671781, "step": 10170 }, { "distill_loss": 0.21714502573013306, "epoch": 3.392261507671781, "step": 10170 }, { "epoch": 3.392261507671781, "ref_ce_loss": 0.2115284502506256, "step": 10170 }, { "epoch": 3.392261507671781, "loss": 0.387999951839447, "step": 10170 }, { "ce_loss": 0.14844535291194916, "epoch": 3.392261507671781, "step": 10170 }, { "distill_loss": 0.11714182794094086, "epoch": 3.392261507671781, "step": 10170 }, { "epoch": 3.392261507671781, "ref_ce_loss": 0.12236713618040085, "step": 10170 }, { "epoch": 3.392261507671781, "loss": 0.5657041072845459, "step": 10170 }, { "ce_loss": 0.2327728122472763, "epoch": 3.392261507671781, "step": 10170 }, { "distill_loss": 0.2262754589319229, "epoch": 3.392261507671781, "step": 10170 }, { "epoch": 3.392261507671781, "ref_ce_loss": 0.10661839693784714, "step": 10170 }, { "epoch": 3.392261507671781, "loss": 0.39357078075408936, "step": 10170 }, { "ce_loss": 0.1515912264585495, "epoch": 3.392261507671781, "step": 10170 }, { "distill_loss": 0.15029920637607574, "epoch": 3.392261507671781, "step": 10170 }, { "epoch": 3.392261507671781, "ref_ce_loss": 0.09164946526288986, "step": 10170 }, { "epoch": 3.3955970647098064, "loss": 0.532, "step": 10180 }, { "epoch": 3.3955970647098064, "grad_norm": 2.959740161895752, "step": 10180 }, { "epoch": 3.3955970647098064, "learning_rate": 0.00019336821419436794, "step": 10180 }, { "epoch": 3.3955970647098064, "loss": 0.45094069838523865, "step": 10180 }, { "ce_loss": 0.1460072249174118, "epoch": 3.3955970647098064, "step": 10180 }, { "distill_loss": 0.13633152842521667, "epoch": 3.3955970647098064, "step": 10180 }, { "epoch": 3.3955970647098064, "ref_ce_loss": 0.09167934954166412, "step": 10180 }, { "epoch": 3.3955970647098064, "loss": 0.3513057827949524, "step": 10180 }, { "ce_loss": 0.08992922306060791, "epoch": 3.3955970647098064, "step": 10180 }, { "distill_loss": 0.12288524955511093, "epoch": 3.3955970647098064, "step": 10180 }, { "epoch": 3.3955970647098064, "ref_ce_loss": 0.10806012898683548, "step": 10180 }, { "epoch": 3.3955970647098064, "loss": 0.5274137854576111, "step": 10180 }, { "ce_loss": 0.14708077907562256, "epoch": 3.3955970647098064, "step": 10180 }, { "distill_loss": 0.14461305737495422, "epoch": 3.3955970647098064, "step": 10180 }, { "epoch": 3.3955970647098064, "ref_ce_loss": 0.1427834928035736, "step": 10180 }, { "epoch": 3.3955970647098064, "loss": 0.6027082204818726, "step": 10180 }, { "ce_loss": 0.15740935504436493, "epoch": 3.3955970647098064, "step": 10180 }, { "distill_loss": 0.18717409670352936, "epoch": 3.3955970647098064, "step": 10180 }, { "epoch": 3.3955970647098064, "ref_ce_loss": 0.13125848770141602, "step": 10180 }, { "epoch": 3.3989326217478317, "loss": 0.5064, "step": 10190 }, { "epoch": 3.3989326217478317, "grad_norm": 2.5797903537750244, "step": 10190 }, { "epoch": 3.3989326217478317, "learning_rate": 0.0001931742641913736, "step": 10190 }, { "epoch": 3.3989326217478317, "loss": 0.4065500795841217, "step": 10190 }, { "ce_loss": 0.13067033886909485, "epoch": 3.3989326217478317, "step": 10190 }, { "distill_loss": 0.13068890571594238, "epoch": 3.3989326217478317, "step": 10190 }, { "epoch": 3.3989326217478317, "ref_ce_loss": 0.12061259895563126, "step": 10190 }, { "epoch": 3.3989326217478317, "loss": 0.43447381258010864, "step": 10190 }, { "ce_loss": 0.12058955430984497, "epoch": 3.3989326217478317, "step": 10190 }, { "distill_loss": 0.19879361987113953, "epoch": 3.3989326217478317, "step": 10190 }, { "epoch": 3.3989326217478317, "ref_ce_loss": 0.07918906956911087, "step": 10190 }, { "epoch": 3.3989326217478317, "loss": 0.7729294300079346, "step": 10190 }, { "ce_loss": 0.1260051280260086, "epoch": 3.3989326217478317, "step": 10190 }, { "distill_loss": 0.2032921016216278, "epoch": 3.3989326217478317, "step": 10190 }, { "epoch": 3.3989326217478317, "ref_ce_loss": 0.11601860076189041, "step": 10190 }, { "epoch": 3.3989326217478317, "loss": 0.5093587040901184, "step": 10190 }, { "ce_loss": 0.2039278894662857, "epoch": 3.3989326217478317, "step": 10190 }, { "distill_loss": 0.18354101479053497, "epoch": 3.3989326217478317, "step": 10190 }, { "epoch": 3.3989326217478317, "ref_ce_loss": 0.12095209211111069, "step": 10190 }, { "epoch": 3.402268178785857, "loss": 0.4849, "step": 10200 }, { "epoch": 3.402268178785857, "grad_norm": 2.8363871574401855, "step": 10200 }, { "epoch": 3.402268178785857, "learning_rate": 0.00019298023545555226, "step": 10200 }, { "epoch": 3.402268178785857, "loss": 0.30198198556900024, "step": 10200 }, { "ce_loss": 0.10195101797580719, "epoch": 3.402268178785857, "step": 10200 }, { "distill_loss": 0.09005577862262726, "epoch": 3.402268178785857, "step": 10200 }, { "epoch": 3.402268178785857, "ref_ce_loss": 0.08784396946430206, "step": 10200 }, { "epoch": 3.402268178785857, "loss": 0.4342412054538727, "step": 10200 }, { "ce_loss": 0.10846803337335587, "epoch": 3.402268178785857, "step": 10200 }, { "distill_loss": 0.15420377254486084, "epoch": 3.402268178785857, "step": 10200 }, { "epoch": 3.402268178785857, "ref_ce_loss": 0.12385132163763046, "step": 10200 }, { "epoch": 3.402268178785857, "loss": 0.4970320761203766, "step": 10200 }, { "ce_loss": 0.10349898785352707, "epoch": 3.402268178785857, "step": 10200 }, { "distill_loss": 0.11978067457675934, "epoch": 3.402268178785857, "step": 10200 }, { "epoch": 3.402268178785857, "ref_ce_loss": 0.049799658358097076, "step": 10200 }, { "epoch": 3.402268178785857, "loss": 0.30252933502197266, "step": 10200 }, { "ce_loss": 0.040033504366874695, "epoch": 3.402268178785857, "step": 10200 }, { "distill_loss": 0.08907225728034973, "epoch": 3.402268178785857, "step": 10200 }, { "epoch": 3.402268178785857, "ref_ce_loss": 0.10964923352003098, "step": 10200 }, { "epoch": 3.4056037358238824, "loss": 0.4874, "step": 10210 }, { "epoch": 3.4056037358238824, "grad_norm": 3.8480939865112305, "step": 10210 }, { "epoch": 3.4056037358238824, "learning_rate": 0.00019278612834073574, "step": 10210 }, { "epoch": 3.4056037358238824, "loss": 0.4816315770149231, "step": 10210 }, { "ce_loss": 0.21080242097377777, "epoch": 3.4056037358238824, "step": 10210 }, { "distill_loss": 0.12634778022766113, "epoch": 3.4056037358238824, "step": 10210 }, { "epoch": 3.4056037358238824, "ref_ce_loss": 0.09341747313737869, "step": 10210 }, { "epoch": 3.4056037358238824, "loss": 0.6531007289886475, "step": 10210 }, { "ce_loss": 0.15230649709701538, "epoch": 3.4056037358238824, "step": 10210 }, { "distill_loss": 0.1511775106191635, "epoch": 3.4056037358238824, "step": 10210 }, { "epoch": 3.4056037358238824, "ref_ce_loss": 0.1311090886592865, "step": 10210 }, { "epoch": 3.4056037358238824, "loss": 0.7131592631340027, "step": 10210 }, { "ce_loss": 0.19272080063819885, "epoch": 3.4056037358238824, "step": 10210 }, { "distill_loss": 0.170637309551239, "epoch": 3.4056037358238824, "step": 10210 }, { "epoch": 3.4056037358238824, "ref_ce_loss": 0.1068262979388237, "step": 10210 }, { "epoch": 3.4056037358238824, "loss": 0.3750826418399811, "step": 10210 }, { "ce_loss": 0.06305833905935287, "epoch": 3.4056037358238824, "step": 10210 }, { "distill_loss": 0.09007889032363892, "epoch": 3.4056037358238824, "step": 10210 }, { "epoch": 3.4056037358238824, "ref_ce_loss": 0.05692875385284424, "step": 10210 }, { "epoch": 3.4089392928619078, "loss": 0.4827, "step": 10220 }, { "epoch": 3.4089392928619078, "grad_norm": 4.855068206787109, "step": 10220 }, { "epoch": 3.4089392928619078, "learning_rate": 0.00019259194320089888, "step": 10220 }, { "epoch": 3.4089392928619078, "loss": 0.365900456905365, "step": 10220 }, { "ce_loss": 0.116310253739357, "epoch": 3.4089392928619078, "step": 10220 }, { "distill_loss": 0.11983893066644669, "epoch": 3.4089392928619078, "step": 10220 }, { "epoch": 3.4089392928619078, "ref_ce_loss": 0.08262918144464493, "step": 10220 }, { "epoch": 3.4089392928619078, "loss": 0.45967572927474976, "step": 10220 }, { "ce_loss": 0.156527578830719, "epoch": 3.4089392928619078, "step": 10220 }, { "distill_loss": 0.13135460019111633, "epoch": 3.4089392928619078, "step": 10220 }, { "epoch": 3.4089392928619078, "ref_ce_loss": 0.13319435715675354, "step": 10220 }, { "epoch": 3.4089392928619078, "loss": 0.4894421696662903, "step": 10220 }, { "ce_loss": 0.138182133436203, "epoch": 3.4089392928619078, "step": 10220 }, { "distill_loss": 0.12863339483737946, "epoch": 3.4089392928619078, "step": 10220 }, { "epoch": 3.4089392928619078, "ref_ce_loss": 0.14758390188217163, "step": 10220 }, { "epoch": 3.4089392928619078, "loss": 0.3858674168586731, "step": 10220 }, { "ce_loss": 0.0867292508482933, "epoch": 3.4089392928619078, "step": 10220 }, { "distill_loss": 0.13426059484481812, "epoch": 3.4089392928619078, "step": 10220 }, { "epoch": 3.4089392928619078, "ref_ce_loss": 0.1308407485485077, "step": 10220 }, { "epoch": 3.412274849899933, "loss": 0.4488, "step": 10230 }, { "epoch": 3.412274849899933, "grad_norm": 2.2313504219055176, "step": 10230 }, { "epoch": 3.412274849899933, "learning_rate": 0.00019239768039015884, "step": 10230 }, { "epoch": 3.412274849899933, "loss": 0.4219973087310791, "step": 10230 }, { "ce_loss": 0.1498318910598755, "epoch": 3.412274849899933, "step": 10230 }, { "distill_loss": 0.1641000658273697, "epoch": 3.412274849899933, "step": 10230 }, { "epoch": 3.412274849899933, "ref_ce_loss": 0.10786410421133041, "step": 10230 }, { "epoch": 3.412274849899933, "loss": 0.4946644902229309, "step": 10230 }, { "ce_loss": 0.17174388468265533, "epoch": 3.412274849899933, "step": 10230 }, { "distill_loss": 0.15777334570884705, "epoch": 3.412274849899933, "step": 10230 }, { "epoch": 3.412274849899933, "ref_ce_loss": 0.13005994260311127, "step": 10230 }, { "epoch": 3.412274849899933, "loss": 0.7625753879547119, "step": 10230 }, { "ce_loss": 0.2250397950410843, "epoch": 3.412274849899933, "step": 10230 }, { "distill_loss": 0.27483320236206055, "epoch": 3.412274849899933, "step": 10230 }, { "epoch": 3.412274849899933, "ref_ce_loss": 0.11940769106149673, "step": 10230 }, { "epoch": 3.412274849899933, "loss": 0.3382866680622101, "step": 10230 }, { "ce_loss": 0.10599769651889801, "epoch": 3.412274849899933, "step": 10230 }, { "distill_loss": 0.1568976789712906, "epoch": 3.412274849899933, "step": 10230 }, { "epoch": 3.412274849899933, "ref_ce_loss": 0.07457992434501648, "step": 10230 }, { "epoch": 3.4156104069379585, "loss": 0.5524, "step": 10240 }, { "epoch": 3.4156104069379585, "grad_norm": 5.488867282867432, "step": 10240 }, { "epoch": 3.4156104069379585, "learning_rate": 0.0001922033402627742, "step": 10240 }, { "epoch": 3.4156104069379585, "loss": 0.5070091485977173, "step": 10240 }, { "ce_loss": 0.14857244491577148, "epoch": 3.4156104069379585, "step": 10240 }, { "distill_loss": 0.17652364075183868, "epoch": 3.4156104069379585, "step": 10240 }, { "epoch": 3.4156104069379585, "ref_ce_loss": 0.07085951417684555, "step": 10240 }, { "epoch": 3.4156104069379585, "loss": 0.5407269597053528, "step": 10240 }, { "ce_loss": 0.24016886949539185, "epoch": 3.4156104069379585, "step": 10240 }, { "distill_loss": 0.17092876136302948, "epoch": 3.4156104069379585, "step": 10240 }, { "epoch": 3.4156104069379585, "ref_ce_loss": 0.12917813658714294, "step": 10240 }, { "epoch": 3.4156104069379585, "loss": 0.35367733240127563, "step": 10240 }, { "ce_loss": 0.10315673053264618, "epoch": 3.4156104069379585, "step": 10240 }, { "distill_loss": 0.11190631985664368, "epoch": 3.4156104069379585, "step": 10240 }, { "epoch": 3.4156104069379585, "ref_ce_loss": 0.09125013649463654, "step": 10240 }, { "epoch": 3.4156104069379585, "loss": 0.39612677693367004, "step": 10240 }, { "ce_loss": 0.1108587235212326, "epoch": 3.4156104069379585, "step": 10240 }, { "distill_loss": 0.09831427782773972, "epoch": 3.4156104069379585, "step": 10240 }, { "epoch": 3.4156104069379585, "ref_ce_loss": 0.0691721960902214, "step": 10240 }, { "epoch": 3.418945963975984, "loss": 0.4671, "step": 10250 }, { "epoch": 3.418945963975984, "grad_norm": 3.53739595413208, "step": 10250 }, { "epoch": 3.418945963975984, "learning_rate": 0.00019200892317314486, "step": 10250 }, { "epoch": 3.418945963975984, "loss": 0.4813249707221985, "step": 10250 }, { "ce_loss": 0.10285835713148117, "epoch": 3.418945963975984, "step": 10250 }, { "distill_loss": 0.11528073996305466, "epoch": 3.418945963975984, "step": 10250 }, { "epoch": 3.418945963975984, "ref_ce_loss": 0.10409308969974518, "step": 10250 }, { "epoch": 3.418945963975984, "loss": 0.943084180355072, "step": 10250 }, { "ce_loss": 0.09678593277931213, "epoch": 3.418945963975984, "step": 10250 }, { "distill_loss": 0.13432377576828003, "epoch": 3.418945963975984, "step": 10250 }, { "epoch": 3.418945963975984, "ref_ce_loss": 0.07842497527599335, "step": 10250 }, { "epoch": 3.418945963975984, "loss": 0.5886873602867126, "step": 10250 }, { "ce_loss": 0.1650712937116623, "epoch": 3.418945963975984, "step": 10250 }, { "distill_loss": 0.170527845621109, "epoch": 3.418945963975984, "step": 10250 }, { "epoch": 3.418945963975984, "ref_ce_loss": 0.15662546455860138, "step": 10250 }, { "epoch": 3.418945963975984, "loss": 0.46970099210739136, "step": 10250 }, { "ce_loss": 0.13701611757278442, "epoch": 3.418945963975984, "step": 10250 }, { "distill_loss": 0.130446657538414, "epoch": 3.418945963975984, "step": 10250 }, { "epoch": 3.418945963975984, "ref_ce_loss": 0.07624396681785583, "step": 10250 }, { "epoch": 3.422281521014009, "loss": 0.5555, "step": 10260 }, { "epoch": 3.422281521014009, "grad_norm": 2.5280873775482178, "step": 10260 }, { "epoch": 3.422281521014009, "learning_rate": 0.00019181442947581074, "step": 10260 }, { "epoch": 3.422281521014009, "loss": 0.29995018243789673, "step": 10260 }, { "ce_loss": 0.07752680778503418, "epoch": 3.422281521014009, "step": 10260 }, { "distill_loss": 0.11020202189683914, "epoch": 3.422281521014009, "step": 10260 }, { "epoch": 3.422281521014009, "ref_ce_loss": 0.1116335466504097, "step": 10260 }, { "epoch": 3.422281521014009, "loss": 0.38425710797309875, "step": 10260 }, { "ce_loss": 0.10159788280725479, "epoch": 3.422281521014009, "step": 10260 }, { "distill_loss": 0.20541100203990936, "epoch": 3.422281521014009, "step": 10260 }, { "epoch": 3.422281521014009, "ref_ce_loss": 0.07702256739139557, "step": 10260 }, { "epoch": 3.422281521014009, "loss": 0.39652562141418457, "step": 10260 }, { "ce_loss": 0.07538190484046936, "epoch": 3.422281521014009, "step": 10260 }, { "distill_loss": 0.08564054220914841, "epoch": 3.422281521014009, "step": 10260 }, { "epoch": 3.422281521014009, "ref_ce_loss": 0.07945127040147781, "step": 10260 }, { "epoch": 3.422281521014009, "loss": 0.24914473295211792, "step": 10260 }, { "ce_loss": 0.04793926328420639, "epoch": 3.422281521014009, "step": 10260 }, { "distill_loss": 0.07164011895656586, "epoch": 3.422281521014009, "step": 10260 }, { "epoch": 3.422281521014009, "ref_ce_loss": 0.07984654605388641, "step": 10260 }, { "epoch": 3.4256170780520345, "loss": 0.4457, "step": 10270 }, { "epoch": 3.4256170780520345, "grad_norm": 5.044663429260254, "step": 10270 }, { "epoch": 3.4256170780520345, "learning_rate": 0.00019161985952545173, "step": 10270 }, { "epoch": 3.4256170780520345, "loss": 0.5661646127700806, "step": 10270 }, { "ce_loss": 0.18041296303272247, "epoch": 3.4256170780520345, "step": 10270 }, { "distill_loss": 0.15173012018203735, "epoch": 3.4256170780520345, "step": 10270 }, { "epoch": 3.4256170780520345, "ref_ce_loss": 0.13187175989151, "step": 10270 }, { "epoch": 3.4256170780520345, "loss": 0.5040100812911987, "step": 10270 }, { "ce_loss": 0.16350196301937103, "epoch": 3.4256170780520345, "step": 10270 }, { "distill_loss": 0.17784355580806732, "epoch": 3.4256170780520345, "step": 10270 }, { "epoch": 3.4256170780520345, "ref_ce_loss": 0.1626138836145401, "step": 10270 }, { "epoch": 3.4256170780520345, "loss": 0.44560155272483826, "step": 10270 }, { "ce_loss": 0.11676087230443954, "epoch": 3.4256170780520345, "step": 10270 }, { "distill_loss": 0.11542589217424393, "epoch": 3.4256170780520345, "step": 10270 }, { "epoch": 3.4256170780520345, "ref_ce_loss": 0.09448987245559692, "step": 10270 }, { "epoch": 3.4256170780520345, "loss": 0.3565663695335388, "step": 10270 }, { "ce_loss": 0.06954836845397949, "epoch": 3.4256170780520345, "step": 10270 }, { "distill_loss": 0.12725864350795746, "epoch": 3.4256170780520345, "step": 10270 }, { "epoch": 3.4256170780520345, "ref_ce_loss": 0.13181714713573456, "step": 10270 }, { "epoch": 3.42895263509006, "loss": 0.4707, "step": 10280 }, { "epoch": 3.42895263509006, "grad_norm": 2.253469228744507, "step": 10280 }, { "epoch": 3.42895263509006, "learning_rate": 0.0001914252136768867, "step": 10280 }, { "epoch": 3.42895263509006, "loss": 0.3005276322364807, "step": 10280 }, { "ce_loss": 0.05415527522563934, "epoch": 3.42895263509006, "step": 10280 }, { "distill_loss": 0.10406085848808289, "epoch": 3.42895263509006, "step": 10280 }, { "epoch": 3.42895263509006, "ref_ce_loss": 0.07171543687582016, "step": 10280 }, { "epoch": 3.42895263509006, "loss": 0.30626076459884644, "step": 10280 }, { "ce_loss": 0.10488387942314148, "epoch": 3.42895263509006, "step": 10280 }, { "distill_loss": 0.11459501087665558, "epoch": 3.42895263509006, "step": 10280 }, { "epoch": 3.42895263509006, "ref_ce_loss": 0.08659002184867859, "step": 10280 }, { "epoch": 3.42895263509006, "loss": 0.6513745188713074, "step": 10280 }, { "ce_loss": 0.19871819019317627, "epoch": 3.42895263509006, "step": 10280 }, { "distill_loss": 0.18216168880462646, "epoch": 3.42895263509006, "step": 10280 }, { "epoch": 3.42895263509006, "ref_ce_loss": 0.11713993549346924, "step": 10280 }, { "epoch": 3.42895263509006, "loss": 0.5560740232467651, "step": 10280 }, { "ce_loss": 0.13708558678627014, "epoch": 3.42895263509006, "step": 10280 }, { "distill_loss": 0.17769046127796173, "epoch": 3.42895263509006, "step": 10280 }, { "epoch": 3.42895263509006, "ref_ce_loss": 0.1358463168144226, "step": 10280 }, { "epoch": 3.4322881921280852, "loss": 0.4525, "step": 10290 }, { "epoch": 3.4322881921280852, "grad_norm": 2.5623693466186523, "step": 10290 }, { "epoch": 3.4322881921280852, "learning_rate": 0.00019123049228507278, "step": 10290 }, { "epoch": 3.4322881921280852, "loss": 0.2555229961872101, "step": 10290 }, { "ce_loss": 0.08814626187086105, "epoch": 3.4322881921280852, "step": 10290 }, { "distill_loss": 0.09802016615867615, "epoch": 3.4322881921280852, "step": 10290 }, { "epoch": 3.4322881921280852, "ref_ce_loss": 0.06921228021383286, "step": 10290 }, { "epoch": 3.4322881921280852, "loss": 0.4880223870277405, "step": 10290 }, { "ce_loss": 0.19454342126846313, "epoch": 3.4322881921280852, "step": 10290 }, { "distill_loss": 0.14570696651935577, "epoch": 3.4322881921280852, "step": 10290 }, { "epoch": 3.4322881921280852, "ref_ce_loss": 0.12097126245498657, "step": 10290 }, { "epoch": 3.4322881921280852, "loss": 0.8403055667877197, "step": 10290 }, { "ce_loss": 0.15819591283798218, "epoch": 3.4322881921280852, "step": 10290 }, { "distill_loss": 0.18111252784729004, "epoch": 3.4322881921280852, "step": 10290 }, { "epoch": 3.4322881921280852, "ref_ce_loss": 0.0896746814250946, "step": 10290 }, { "epoch": 3.4322881921280852, "loss": 0.6239392757415771, "step": 10290 }, { "ce_loss": 0.14710165560245514, "epoch": 3.4322881921280852, "step": 10290 }, { "distill_loss": 0.19059288501739502, "epoch": 3.4322881921280852, "step": 10290 }, { "epoch": 3.4322881921280852, "ref_ce_loss": 0.10249871760606766, "step": 10290 }, { "epoch": 3.4356237491661106, "loss": 0.546, "step": 10300 }, { "epoch": 3.4356237491661106, "grad_norm": 4.703062057495117, "step": 10300 }, { "epoch": 3.4356237491661106, "learning_rate": 0.000191035695705105, "step": 10300 }, { "epoch": 3.4356237491661106, "loss": 0.304233580827713, "step": 10300 }, { "ce_loss": 0.08161269873380661, "epoch": 3.4356237491661106, "step": 10300 }, { "distill_loss": 0.12234167754650116, "epoch": 3.4356237491661106, "step": 10300 }, { "epoch": 3.4356237491661106, "ref_ce_loss": 0.08137601613998413, "step": 10300 }, { "epoch": 3.4356237491661106, "loss": 0.3445499539375305, "step": 10300 }, { "ce_loss": 0.1015457957983017, "epoch": 3.4356237491661106, "step": 10300 }, { "distill_loss": 0.13137434422969818, "epoch": 3.4356237491661106, "step": 10300 }, { "epoch": 3.4356237491661106, "ref_ce_loss": 0.08375562727451324, "step": 10300 }, { "epoch": 3.4356237491661106, "loss": 0.404718279838562, "step": 10300 }, { "ce_loss": 0.09954530745744705, "epoch": 3.4356237491661106, "step": 10300 }, { "distill_loss": 0.15256458520889282, "epoch": 3.4356237491661106, "step": 10300 }, { "epoch": 3.4356237491661106, "ref_ce_loss": 0.12649698555469513, "step": 10300 }, { "epoch": 3.4356237491661106, "loss": 0.5050632953643799, "step": 10300 }, { "ce_loss": 0.07534976303577423, "epoch": 3.4356237491661106, "step": 10300 }, { "distill_loss": 0.12757937610149384, "epoch": 3.4356237491661106, "step": 10300 }, { "epoch": 3.4356237491661106, "ref_ce_loss": 0.10308182239532471, "step": 10300 }, { "epoch": 3.438959306204136, "loss": 0.4814, "step": 10310 }, { "epoch": 3.438959306204136, "grad_norm": 2.611424446105957, "step": 10310 }, { "epoch": 3.438959306204136, "learning_rate": 0.00019084082429221558, "step": 10310 }, { "epoch": 3.438959306204136, "loss": 0.4377182722091675, "step": 10310 }, { "ce_loss": 0.09518056362867355, "epoch": 3.438959306204136, "step": 10310 }, { "distill_loss": 0.12689684331417084, "epoch": 3.438959306204136, "step": 10310 }, { "epoch": 3.438959306204136, "ref_ce_loss": 0.16369374096393585, "step": 10310 }, { "epoch": 3.438959306204136, "loss": 0.6643956899642944, "step": 10310 }, { "ce_loss": 0.16947884857654572, "epoch": 3.438959306204136, "step": 10310 }, { "distill_loss": 0.1439935564994812, "epoch": 3.438959306204136, "step": 10310 }, { "epoch": 3.438959306204136, "ref_ce_loss": 0.11696118861436844, "step": 10310 }, { "epoch": 3.438959306204136, "loss": 0.5247431397438049, "step": 10310 }, { "ce_loss": 0.20082227885723114, "epoch": 3.438959306204136, "step": 10310 }, { "distill_loss": 0.14931391179561615, "epoch": 3.438959306204136, "step": 10310 }, { "epoch": 3.438959306204136, "ref_ce_loss": 0.12185055762529373, "step": 10310 }, { "epoch": 3.438959306204136, "loss": 0.3367827534675598, "step": 10310 }, { "ce_loss": 0.09930837154388428, "epoch": 3.438959306204136, "step": 10310 }, { "distill_loss": 0.1177770271897316, "epoch": 3.438959306204136, "step": 10310 }, { "epoch": 3.438959306204136, "ref_ce_loss": 0.08937904238700867, "step": 10310 }, { "epoch": 3.4422948632421613, "loss": 0.5355, "step": 10320 }, { "epoch": 3.4422948632421613, "grad_norm": 2.856062889099121, "step": 10320 }, { "epoch": 3.4422948632421613, "learning_rate": 0.00019064587840177306, "step": 10320 }, { "epoch": 3.4422948632421613, "loss": 0.3735664486885071, "step": 10320 }, { "ce_loss": 0.1009257510304451, "epoch": 3.4422948632421613, "step": 10320 }, { "distill_loss": 0.11772341281175613, "epoch": 3.4422948632421613, "step": 10320 }, { "epoch": 3.4422948632421613, "ref_ce_loss": 0.08748678863048553, "step": 10320 }, { "epoch": 3.4422948632421613, "loss": 0.4337637424468994, "step": 10320 }, { "ce_loss": 0.08262146264314651, "epoch": 3.4422948632421613, "step": 10320 }, { "distill_loss": 0.21009844541549683, "epoch": 3.4422948632421613, "step": 10320 }, { "epoch": 3.4422948632421613, "ref_ce_loss": 0.10304173082113266, "step": 10320 }, { "epoch": 3.4422948632421613, "loss": 0.3304564654827118, "step": 10320 }, { "ce_loss": 0.11042464524507523, "epoch": 3.4422948632421613, "step": 10320 }, { "distill_loss": 0.10334540903568268, "epoch": 3.4422948632421613, "step": 10320 }, { "epoch": 3.4422948632421613, "ref_ce_loss": 0.08047734946012497, "step": 10320 }, { "epoch": 3.4422948632421613, "loss": 0.45189177989959717, "step": 10320 }, { "ce_loss": 0.1377006471157074, "epoch": 3.4422948632421613, "step": 10320 }, { "distill_loss": 0.1019442155957222, "epoch": 3.4422948632421613, "step": 10320 }, { "epoch": 3.4422948632421613, "ref_ce_loss": 0.06532624363899231, "step": 10320 }, { "epoch": 3.4456304202801866, "loss": 0.4176, "step": 10330 }, { "epoch": 3.4456304202801866, "grad_norm": 4.124449729919434, "step": 10330 }, { "epoch": 3.4456304202801866, "learning_rate": 0.00019045085838928174, "step": 10330 }, { "epoch": 3.4456304202801866, "loss": 0.43326324224472046, "step": 10330 }, { "ce_loss": 0.14759215712547302, "epoch": 3.4456304202801866, "step": 10330 }, { "distill_loss": 0.1370130479335785, "epoch": 3.4456304202801866, "step": 10330 }, { "epoch": 3.4456304202801866, "ref_ce_loss": 0.10759326070547104, "step": 10330 }, { "epoch": 3.4456304202801866, "loss": 0.392121285200119, "step": 10330 }, { "ce_loss": 0.1546168476343155, "epoch": 3.4456304202801866, "step": 10330 }, { "distill_loss": 0.13393008708953857, "epoch": 3.4456304202801866, "step": 10330 }, { "epoch": 3.4456304202801866, "ref_ce_loss": 0.1034577339887619, "step": 10330 }, { "epoch": 3.4456304202801866, "loss": 0.42810681462287903, "step": 10330 }, { "ce_loss": 0.18438415229320526, "epoch": 3.4456304202801866, "step": 10330 }, { "distill_loss": 0.14138942956924438, "epoch": 3.4456304202801866, "step": 10330 }, { "epoch": 3.4456304202801866, "ref_ce_loss": 0.10176559537649155, "step": 10330 }, { "epoch": 3.4456304202801866, "loss": 0.42089158296585083, "step": 10330 }, { "ce_loss": 0.11071355640888214, "epoch": 3.4456304202801866, "step": 10330 }, { "distill_loss": 0.10371338576078415, "epoch": 3.4456304202801866, "step": 10330 }, { "epoch": 3.4456304202801866, "ref_ce_loss": 0.12404021620750427, "step": 10330 }, { "epoch": 3.448965977318212, "loss": 0.4524, "step": 10340 }, { "epoch": 3.448965977318212, "grad_norm": 3.522857189178467, "step": 10340 }, { "epoch": 3.448965977318212, "learning_rate": 0.00019025576461038134, "step": 10340 }, { "epoch": 3.448965977318212, "loss": 0.5325416922569275, "step": 10340 }, { "ce_loss": 0.231018528342247, "epoch": 3.448965977318212, "step": 10340 }, { "distill_loss": 0.11983978748321533, "epoch": 3.448965977318212, "step": 10340 }, { "epoch": 3.448965977318212, "ref_ce_loss": 0.11137314885854721, "step": 10340 }, { "epoch": 3.448965977318212, "loss": 0.3700246214866638, "step": 10340 }, { "ce_loss": 0.12125429511070251, "epoch": 3.448965977318212, "step": 10340 }, { "distill_loss": 0.12481550872325897, "epoch": 3.448965977318212, "step": 10340 }, { "epoch": 3.448965977318212, "ref_ce_loss": 0.08128783106803894, "step": 10340 }, { "epoch": 3.448965977318212, "loss": 0.3559412360191345, "step": 10340 }, { "ce_loss": 0.10249993205070496, "epoch": 3.448965977318212, "step": 10340 }, { "distill_loss": 0.09033988416194916, "epoch": 3.448965977318212, "step": 10340 }, { "epoch": 3.448965977318212, "ref_ce_loss": 0.0899801105260849, "step": 10340 }, { "epoch": 3.448965977318212, "loss": 0.5967211127281189, "step": 10340 }, { "ce_loss": 0.1554853916168213, "epoch": 3.448965977318212, "step": 10340 }, { "distill_loss": 0.1830393224954605, "epoch": 3.448965977318212, "step": 10340 }, { "epoch": 3.448965977318212, "ref_ce_loss": 0.12638594210147858, "step": 10340 }, { "epoch": 3.4523015343562373, "loss": 0.4761, "step": 10350 }, { "epoch": 3.4523015343562373, "grad_norm": 2.2772934436798096, "step": 10350 }, { "epoch": 3.4523015343562373, "learning_rate": 0.0001900605974208459, "step": 10350 }, { "epoch": 3.4523015343562373, "loss": 0.28337162733078003, "step": 10350 }, { "ce_loss": 0.05180773138999939, "epoch": 3.4523015343562373, "step": 10350 }, { "distill_loss": 0.0789615586400032, "epoch": 3.4523015343562373, "step": 10350 }, { "epoch": 3.4523015343562373, "ref_ce_loss": 0.06448401510715485, "step": 10350 }, { "epoch": 3.4523015343562373, "loss": 0.2962020933628082, "step": 10350 }, { "ce_loss": 0.1171887069940567, "epoch": 3.4523015343562373, "step": 10350 }, { "distill_loss": 0.08088953793048859, "epoch": 3.4523015343562373, "step": 10350 }, { "epoch": 3.4523015343562373, "ref_ce_loss": 0.09784910082817078, "step": 10350 }, { "epoch": 3.4523015343562373, "loss": 0.7583683133125305, "step": 10350 }, { "ce_loss": 0.1913074254989624, "epoch": 3.4523015343562373, "step": 10350 }, { "distill_loss": 0.17555482685565948, "epoch": 3.4523015343562373, "step": 10350 }, { "epoch": 3.4523015343562373, "ref_ce_loss": 0.1408015340566635, "step": 10350 }, { "epoch": 3.4523015343562373, "loss": 0.487335205078125, "step": 10350 }, { "ce_loss": 0.1990186721086502, "epoch": 3.4523015343562373, "step": 10350 }, { "distill_loss": 0.11871914565563202, "epoch": 3.4523015343562373, "step": 10350 }, { "epoch": 3.4523015343562373, "ref_ce_loss": 0.11281263083219528, "step": 10350 }, { "epoch": 3.4556370913942627, "loss": 0.4634, "step": 10360 }, { "epoch": 3.4556370913942627, "grad_norm": 2.651352882385254, "step": 10360 }, { "epoch": 3.4556370913942627, "learning_rate": 0.00018986535717658334, "step": 10360 }, { "epoch": 3.4556370913942627, "loss": 0.6339795589447021, "step": 10360 }, { "ce_loss": 0.20450928807258606, "epoch": 3.4556370913942627, "step": 10360 }, { "distill_loss": 0.15441319346427917, "epoch": 3.4556370913942627, "step": 10360 }, { "epoch": 3.4556370913942627, "ref_ce_loss": 0.14329175651073456, "step": 10360 }, { "epoch": 3.4556370913942627, "loss": 0.8533214926719666, "step": 10360 }, { "ce_loss": 0.07007604837417603, "epoch": 3.4556370913942627, "step": 10360 }, { "distill_loss": 0.13101819157600403, "epoch": 3.4556370913942627, "step": 10360 }, { "epoch": 3.4556370913942627, "ref_ce_loss": 0.10507778078317642, "step": 10360 }, { "epoch": 3.4556370913942627, "loss": 0.41858166456222534, "step": 10360 }, { "ce_loss": 0.0904201790690422, "epoch": 3.4556370913942627, "step": 10360 }, { "distill_loss": 0.11084774136543274, "epoch": 3.4556370913942627, "step": 10360 }, { "epoch": 3.4556370913942627, "ref_ce_loss": 0.14289547502994537, "step": 10360 }, { "epoch": 3.4556370913942627, "loss": 0.4427366554737091, "step": 10360 }, { "ce_loss": 0.15832187235355377, "epoch": 3.4556370913942627, "step": 10360 }, { "distill_loss": 0.1405555158853531, "epoch": 3.4556370913942627, "step": 10360 }, { "epoch": 3.4556370913942627, "ref_ce_loss": 0.08129150420427322, "step": 10360 }, { "epoch": 3.458972648432288, "loss": 0.5116, "step": 10370 }, { "epoch": 3.458972648432288, "grad_norm": 1.8335307836532593, "step": 10370 }, { "epoch": 3.458972648432288, "learning_rate": 0.0001896700442336349, "step": 10370 }, { "epoch": 3.458972648432288, "loss": 1.324198603630066, "step": 10370 }, { "ce_loss": 0.11334358155727386, "epoch": 3.458972648432288, "step": 10370 }, { "distill_loss": 0.12460387498140335, "epoch": 3.458972648432288, "step": 10370 }, { "epoch": 3.458972648432288, "ref_ce_loss": 0.06593289226293564, "step": 10370 }, { "epoch": 3.458972648432288, "loss": 0.47392234206199646, "step": 10370 }, { "ce_loss": 0.16286267340183258, "epoch": 3.458972648432288, "step": 10370 }, { "distill_loss": 0.14801934361457825, "epoch": 3.458972648432288, "step": 10370 }, { "epoch": 3.458972648432288, "ref_ce_loss": 0.1139780580997467, "step": 10370 }, { "epoch": 3.458972648432288, "loss": 0.513584554195404, "step": 10370 }, { "ce_loss": 0.15807706117630005, "epoch": 3.458972648432288, "step": 10370 }, { "distill_loss": 0.19912990927696228, "epoch": 3.458972648432288, "step": 10370 }, { "epoch": 3.458972648432288, "ref_ce_loss": 0.11430535465478897, "step": 10370 }, { "epoch": 3.458972648432288, "loss": 0.4971799850463867, "step": 10370 }, { "ce_loss": 0.14749285578727722, "epoch": 3.458972648432288, "step": 10370 }, { "distill_loss": 0.16247427463531494, "epoch": 3.458972648432288, "step": 10370 }, { "epoch": 3.458972648432288, "ref_ce_loss": 0.1421637386083603, "step": 10370 }, { "epoch": 3.4623082054703134, "loss": 0.544, "step": 10380 }, { "epoch": 3.4623082054703134, "grad_norm": 2.416501522064209, "step": 10380 }, { "epoch": 3.4623082054703134, "learning_rate": 0.00018947465894817434, "step": 10380 }, { "epoch": 3.4623082054703134, "loss": 0.8389631509780884, "step": 10380 }, { "ce_loss": 0.19231632351875305, "epoch": 3.4623082054703134, "step": 10380 }, { "distill_loss": 0.19508925080299377, "epoch": 3.4623082054703134, "step": 10380 }, { "epoch": 3.4623082054703134, "ref_ce_loss": 0.1727515161037445, "step": 10380 }, { "epoch": 3.4623082054703134, "loss": 0.4167601764202118, "step": 10380 }, { "ce_loss": 0.13529586791992188, "epoch": 3.4623082054703134, "step": 10380 }, { "distill_loss": 0.13505946099758148, "epoch": 3.4623082054703134, "step": 10380 }, { "epoch": 3.4623082054703134, "ref_ce_loss": 0.1462845355272293, "step": 10380 }, { "epoch": 3.4623082054703134, "loss": 0.42783311009407043, "step": 10380 }, { "ce_loss": 0.1599048376083374, "epoch": 3.4623082054703134, "step": 10380 }, { "distill_loss": 0.12761659920215607, "epoch": 3.4623082054703134, "step": 10380 }, { "epoch": 3.4623082054703134, "ref_ce_loss": 0.09368833154439926, "step": 10380 }, { "epoch": 3.4623082054703134, "loss": 0.6533681154251099, "step": 10380 }, { "ce_loss": 0.14638778567314148, "epoch": 3.4623082054703134, "step": 10380 }, { "distill_loss": 0.1574423760175705, "epoch": 3.4623082054703134, "step": 10380 }, { "epoch": 3.4623082054703134, "ref_ce_loss": 0.1208135262131691, "step": 10380 }, { "epoch": 3.4656437625083387, "loss": 0.456, "step": 10390 }, { "epoch": 3.4656437625083387, "grad_norm": 2.474968671798706, "step": 10390 }, { "epoch": 3.4656437625083387, "learning_rate": 0.00018927920167650735, "step": 10390 }, { "epoch": 3.4656437625083387, "loss": 0.5869070887565613, "step": 10390 }, { "ce_loss": 0.2259010523557663, "epoch": 3.4656437625083387, "step": 10390 }, { "distill_loss": 0.17279842495918274, "epoch": 3.4656437625083387, "step": 10390 }, { "epoch": 3.4656437625083387, "ref_ce_loss": 0.13575266301631927, "step": 10390 }, { "epoch": 3.4656437625083387, "loss": 0.6994563341140747, "step": 10390 }, { "ce_loss": 0.22502291202545166, "epoch": 3.4656437625083387, "step": 10390 }, { "distill_loss": 0.17224964499473572, "epoch": 3.4656437625083387, "step": 10390 }, { "epoch": 3.4656437625083387, "ref_ce_loss": 0.11385179311037064, "step": 10390 }, { "epoch": 3.4656437625083387, "loss": 0.38169121742248535, "step": 10390 }, { "ce_loss": 0.07678114622831345, "epoch": 3.4656437625083387, "step": 10390 }, { "distill_loss": 0.0935371145606041, "epoch": 3.4656437625083387, "step": 10390 }, { "epoch": 3.4656437625083387, "ref_ce_loss": 0.06890590488910675, "step": 10390 }, { "epoch": 3.4656437625083387, "loss": 0.7031863927841187, "step": 10390 }, { "ce_loss": 0.1995294988155365, "epoch": 3.4656437625083387, "step": 10390 }, { "distill_loss": 0.1283656358718872, "epoch": 3.4656437625083387, "step": 10390 }, { "epoch": 3.4656437625083387, "ref_ce_loss": 0.0902036800980568, "step": 10390 }, { "epoch": 3.468979319546364, "loss": 0.4899, "step": 10400 }, { "epoch": 3.468979319546364, "grad_norm": 2.958378314971924, "step": 10400 }, { "epoch": 3.468979319546364, "learning_rate": 0.0001890836727750709, "step": 10400 }, { "epoch": 3.468979319546364, "loss": 0.4847678542137146, "step": 10400 }, { "ce_loss": 0.23173287510871887, "epoch": 3.468979319546364, "step": 10400 }, { "distill_loss": 0.1382112205028534, "epoch": 3.468979319546364, "step": 10400 }, { "epoch": 3.468979319546364, "ref_ce_loss": 0.11151211708784103, "step": 10400 }, { "epoch": 3.468979319546364, "loss": 0.440000057220459, "step": 10400 }, { "ce_loss": 0.1410650759935379, "epoch": 3.468979319546364, "step": 10400 }, { "distill_loss": 0.11761602759361267, "epoch": 3.468979319546364, "step": 10400 }, { "epoch": 3.468979319546364, "ref_ce_loss": 0.1356046199798584, "step": 10400 }, { "epoch": 3.468979319546364, "loss": 0.6042724847793579, "step": 10400 }, { "ce_loss": 0.17960430681705475, "epoch": 3.468979319546364, "step": 10400 }, { "distill_loss": 0.13856202363967896, "epoch": 3.468979319546364, "step": 10400 }, { "epoch": 3.468979319546364, "ref_ce_loss": 0.07231870293617249, "step": 10400 }, { "epoch": 3.468979319546364, "loss": 0.4100415110588074, "step": 10400 }, { "ce_loss": 0.15962854027748108, "epoch": 3.468979319546364, "step": 10400 }, { "distill_loss": 0.12709757685661316, "epoch": 3.468979319546364, "step": 10400 }, { "epoch": 3.468979319546364, "ref_ce_loss": 0.0880059152841568, "step": 10400 }, { "epoch": 3.4723148765843894, "loss": 0.4718, "step": 10410 }, { "epoch": 3.4723148765843894, "grad_norm": 2.205946683883667, "step": 10410 }, { "epoch": 3.4723148765843894, "learning_rate": 0.00018888807260043249, "step": 10410 }, { "epoch": 3.4723148765843894, "loss": 0.5133829116821289, "step": 10410 }, { "ce_loss": 0.13501222431659698, "epoch": 3.4723148765843894, "step": 10410 }, { "distill_loss": 0.11960557103157043, "epoch": 3.4723148765843894, "step": 10410 }, { "epoch": 3.4723148765843894, "ref_ce_loss": 0.09122700989246368, "step": 10410 }, { "epoch": 3.4723148765843894, "loss": 0.3817310631275177, "step": 10410 }, { "ce_loss": 0.07214323431253433, "epoch": 3.4723148765843894, "step": 10410 }, { "distill_loss": 0.1570921242237091, "epoch": 3.4723148765843894, "step": 10410 }, { "epoch": 3.4723148765843894, "ref_ce_loss": 0.11837377399206161, "step": 10410 }, { "epoch": 3.4723148765843894, "loss": 0.530503511428833, "step": 10410 }, { "ce_loss": 0.23058447241783142, "epoch": 3.4723148765843894, "step": 10410 }, { "distill_loss": 0.16666772961616516, "epoch": 3.4723148765843894, "step": 10410 }, { "epoch": 3.4723148765843894, "ref_ce_loss": 0.10783181339502335, "step": 10410 }, { "epoch": 3.4723148765843894, "loss": 0.4435175955295563, "step": 10410 }, { "ce_loss": 0.11766248196363449, "epoch": 3.4723148765843894, "step": 10410 }, { "distill_loss": 0.1328033208847046, "epoch": 3.4723148765843894, "step": 10410 }, { "epoch": 3.4723148765843894, "ref_ce_loss": 0.11009076982736588, "step": 10410 }, { "epoch": 3.4756504336224148, "loss": 0.5215, "step": 10420 }, { "epoch": 3.4756504336224148, "grad_norm": 5.0243682861328125, "step": 10420 }, { "epoch": 3.4756504336224148, "learning_rate": 0.0001886924015092898, "step": 10420 }, { "epoch": 3.4756504336224148, "loss": 0.8781328201293945, "step": 10420 }, { "ce_loss": 0.24173305928707123, "epoch": 3.4756504336224148, "step": 10420 }, { "distill_loss": 0.16323822736740112, "epoch": 3.4756504336224148, "step": 10420 }, { "epoch": 3.4756504336224148, "ref_ce_loss": 0.14663997292518616, "step": 10420 }, { "epoch": 3.4756504336224148, "loss": 0.38078439235687256, "step": 10420 }, { "ce_loss": 0.1302793025970459, "epoch": 3.4756504336224148, "step": 10420 }, { "distill_loss": 0.1325978934764862, "epoch": 3.4756504336224148, "step": 10420 }, { "epoch": 3.4756504336224148, "ref_ce_loss": 0.0761353150010109, "step": 10420 }, { "epoch": 3.4756504336224148, "loss": 0.5718013048171997, "step": 10420 }, { "ce_loss": 0.160188689827919, "epoch": 3.4756504336224148, "step": 10420 }, { "distill_loss": 0.13954801857471466, "epoch": 3.4756504336224148, "step": 10420 }, { "epoch": 3.4756504336224148, "ref_ce_loss": 0.10986100137233734, "step": 10420 }, { "epoch": 3.4756504336224148, "loss": 0.44210585951805115, "step": 10420 }, { "ce_loss": 0.12488564103841782, "epoch": 3.4756504336224148, "step": 10420 }, { "distill_loss": 0.13288307189941406, "epoch": 3.4756504336224148, "step": 10420 }, { "epoch": 3.4756504336224148, "ref_ce_loss": 0.07312694936990738, "step": 10420 }, { "epoch": 3.47898599066044, "loss": 0.5233, "step": 10430 }, { "epoch": 3.47898599066044, "grad_norm": 4.157192230224609, "step": 10430 }, { "epoch": 3.47898599066044, "learning_rate": 0.00018849665985846967, "step": 10430 }, { "epoch": 3.47898599066044, "loss": 0.4405640959739685, "step": 10430 }, { "ce_loss": 0.0566069558262825, "epoch": 3.47898599066044, "step": 10430 }, { "distill_loss": 0.10539036989212036, "epoch": 3.47898599066044, "step": 10430 }, { "epoch": 3.47898599066044, "ref_ce_loss": 0.07350980490446091, "step": 10430 }, { "epoch": 3.47898599066044, "loss": 0.3644062876701355, "step": 10430 }, { "ce_loss": 0.11124683171510696, "epoch": 3.47898599066044, "step": 10430 }, { "distill_loss": 0.11445042490959167, "epoch": 3.47898599066044, "step": 10430 }, { "epoch": 3.47898599066044, "ref_ce_loss": 0.10792446881532669, "step": 10430 }, { "epoch": 3.47898599066044, "loss": 0.23093454539775848, "step": 10430 }, { "ce_loss": 0.07317502796649933, "epoch": 3.47898599066044, "step": 10430 }, { "distill_loss": 0.09828302264213562, "epoch": 3.47898599066044, "step": 10430 }, { "epoch": 3.47898599066044, "ref_ce_loss": 0.059230588376522064, "step": 10430 }, { "epoch": 3.47898599066044, "loss": 0.40664955973625183, "step": 10430 }, { "ce_loss": 0.1365078091621399, "epoch": 3.47898599066044, "step": 10430 }, { "distill_loss": 0.12795759737491608, "epoch": 3.47898599066044, "step": 10430 }, { "epoch": 3.47898599066044, "ref_ce_loss": 0.09858806431293488, "step": 10430 }, { "epoch": 3.4823215476984655, "loss": 0.4908, "step": 10440 }, { "epoch": 3.4823215476984655, "grad_norm": 7.698869228363037, "step": 10440 }, { "epoch": 3.4823215476984655, "learning_rate": 0.0001883008480049276, "step": 10440 }, { "epoch": 3.4823215476984655, "loss": 0.8871216773986816, "step": 10440 }, { "ce_loss": 0.09219731390476227, "epoch": 3.4823215476984655, "step": 10440 }, { "distill_loss": 0.12486536055803299, "epoch": 3.4823215476984655, "step": 10440 }, { "epoch": 3.4823215476984655, "ref_ce_loss": 0.12163364142179489, "step": 10440 }, { "epoch": 3.4823215476984655, "loss": 0.4638303816318512, "step": 10440 }, { "ce_loss": 0.09351971000432968, "epoch": 3.4823215476984655, "step": 10440 }, { "distill_loss": 0.17921598255634308, "epoch": 3.4823215476984655, "step": 10440 }, { "epoch": 3.4823215476984655, "ref_ce_loss": 0.06343629211187363, "step": 10440 }, { "epoch": 3.4823215476984655, "loss": 0.3681568503379822, "step": 10440 }, { "ce_loss": 0.048713743686676025, "epoch": 3.4823215476984655, "step": 10440 }, { "distill_loss": 0.14604920148849487, "epoch": 3.4823215476984655, "step": 10440 }, { "epoch": 3.4823215476984655, "ref_ce_loss": 0.054822877049446106, "step": 10440 }, { "epoch": 3.4823215476984655, "loss": 0.44607189297676086, "step": 10440 }, { "ce_loss": 0.16012151539325714, "epoch": 3.4823215476984655, "step": 10440 }, { "distill_loss": 0.12422513961791992, "epoch": 3.4823215476984655, "step": 10440 }, { "epoch": 3.4823215476984655, "ref_ce_loss": 0.1300317794084549, "step": 10440 }, { "epoch": 3.485657104736491, "loss": 0.5404, "step": 10450 }, { "epoch": 3.485657104736491, "grad_norm": 2.3111071586608887, "step": 10450 }, { "epoch": 3.485657104736491, "learning_rate": 0.0001881049663057473, "step": 10450 }, { "epoch": 3.485657104736491, "loss": 0.6618536710739136, "step": 10450 }, { "ce_loss": 0.19961726665496826, "epoch": 3.485657104736491, "step": 10450 }, { "distill_loss": 0.1706632822751999, "epoch": 3.485657104736491, "step": 10450 }, { "epoch": 3.485657104736491, "ref_ce_loss": 0.13822656869888306, "step": 10450 }, { "epoch": 3.485657104736491, "loss": 0.31794148683547974, "step": 10450 }, { "ce_loss": 0.09808186441659927, "epoch": 3.485657104736491, "step": 10450 }, { "distill_loss": 0.131150022149086, "epoch": 3.485657104736491, "step": 10450 }, { "epoch": 3.485657104736491, "ref_ce_loss": 0.08795040100812912, "step": 10450 }, { "epoch": 3.485657104736491, "loss": 0.6011213064193726, "step": 10450 }, { "ce_loss": 0.08202079683542252, "epoch": 3.485657104736491, "step": 10450 }, { "distill_loss": 0.11499512195587158, "epoch": 3.485657104736491, "step": 10450 }, { "epoch": 3.485657104736491, "ref_ce_loss": 0.1003682017326355, "step": 10450 }, { "epoch": 3.485657104736491, "loss": 0.33552274107933044, "step": 10450 }, { "ce_loss": 0.06787855923175812, "epoch": 3.485657104736491, "step": 10450 }, { "distill_loss": 0.10218380391597748, "epoch": 3.485657104736491, "step": 10450 }, { "epoch": 3.485657104736491, "ref_ce_loss": 0.1092182844877243, "step": 10450 }, { "epoch": 3.488992661774516, "loss": 0.4999, "step": 10460 }, { "epoch": 3.488992661774516, "grad_norm": 2.911756753921509, "step": 10460 }, { "epoch": 3.488992661774516, "learning_rate": 0.00018790901511813962, "step": 10460 }, { "epoch": 3.488992661774516, "loss": 0.47455549240112305, "step": 10460 }, { "ce_loss": 0.1233755499124527, "epoch": 3.488992661774516, "step": 10460 }, { "distill_loss": 0.11691313236951828, "epoch": 3.488992661774516, "step": 10460 }, { "epoch": 3.488992661774516, "ref_ce_loss": 0.1434135138988495, "step": 10460 }, { "epoch": 3.488992661774516, "loss": 0.5427463054656982, "step": 10460 }, { "ce_loss": 0.09475378692150116, "epoch": 3.488992661774516, "step": 10460 }, { "distill_loss": 0.15717893838882446, "epoch": 3.488992661774516, "step": 10460 }, { "epoch": 3.488992661774516, "ref_ce_loss": 0.11762606352567673, "step": 10460 }, { "epoch": 3.488992661774516, "loss": 1.103336215019226, "step": 10460 }, { "ce_loss": 0.22373856604099274, "epoch": 3.488992661774516, "step": 10460 }, { "distill_loss": 0.22364136576652527, "epoch": 3.488992661774516, "step": 10460 }, { "epoch": 3.488992661774516, "ref_ce_loss": 0.11285527795553207, "step": 10460 }, { "epoch": 3.488992661774516, "loss": 0.49190598726272583, "step": 10460 }, { "ce_loss": 0.09300311654806137, "epoch": 3.488992661774516, "step": 10460 }, { "distill_loss": 0.16091318428516388, "epoch": 3.488992661774516, "step": 10460 }, { "epoch": 3.488992661774516, "ref_ce_loss": 0.09601572901010513, "step": 10460 }, { "epoch": 3.4923282188125415, "loss": 0.4919, "step": 10470 }, { "epoch": 3.4923282188125415, "grad_norm": 3.8968892097473145, "step": 10470 }, { "epoch": 3.4923282188125415, "learning_rate": 0.00018771299479944218, "step": 10470 }, { "epoch": 3.4923282188125415, "loss": 0.6155132055282593, "step": 10470 }, { "ce_loss": 0.07882112264633179, "epoch": 3.4923282188125415, "step": 10470 }, { "distill_loss": 0.1573733240365982, "epoch": 3.4923282188125415, "step": 10470 }, { "epoch": 3.4923282188125415, "ref_ce_loss": 0.13969795405864716, "step": 10470 }, { "epoch": 3.4923282188125415, "loss": 0.5589838624000549, "step": 10470 }, { "ce_loss": 0.24933810532093048, "epoch": 3.4923282188125415, "step": 10470 }, { "distill_loss": 0.11963921785354614, "epoch": 3.4923282188125415, "step": 10470 }, { "epoch": 3.4923282188125415, "ref_ce_loss": 0.12221920490264893, "step": 10470 }, { "epoch": 3.4923282188125415, "loss": 0.5622706413269043, "step": 10470 }, { "ce_loss": 0.17802667617797852, "epoch": 3.4923282188125415, "step": 10470 }, { "distill_loss": 0.15607579052448273, "epoch": 3.4923282188125415, "step": 10470 }, { "epoch": 3.4923282188125415, "ref_ce_loss": 0.1545151174068451, "step": 10470 }, { "epoch": 3.4923282188125415, "loss": 0.43948787450790405, "step": 10470 }, { "ce_loss": 0.155122771859169, "epoch": 3.4923282188125415, "step": 10470 }, { "distill_loss": 0.16137826442718506, "epoch": 3.4923282188125415, "step": 10470 }, { "epoch": 3.4923282188125415, "ref_ce_loss": 0.12277895212173462, "step": 10470 }, { "epoch": 3.495663775850567, "loss": 0.4717, "step": 10480 }, { "epoch": 3.495663775850567, "grad_norm": 3.666346311569214, "step": 10480 }, { "epoch": 3.495663775850567, "learning_rate": 0.00018751690570711885, "step": 10480 }, { "epoch": 3.495663775850567, "loss": 0.2625570297241211, "step": 10480 }, { "ce_loss": 0.07310660928487778, "epoch": 3.495663775850567, "step": 10480 }, { "distill_loss": 0.08818459510803223, "epoch": 3.495663775850567, "step": 10480 }, { "epoch": 3.495663775850567, "ref_ce_loss": 0.07402611523866653, "step": 10480 }, { "epoch": 3.495663775850567, "loss": 0.49244633316993713, "step": 10480 }, { "ce_loss": 0.13064201176166534, "epoch": 3.495663775850567, "step": 10480 }, { "distill_loss": 0.15354804694652557, "epoch": 3.495663775850567, "step": 10480 }, { "epoch": 3.495663775850567, "ref_ce_loss": 0.11334028840065002, "step": 10480 }, { "epoch": 3.495663775850567, "loss": 0.42023003101348877, "step": 10480 }, { "ce_loss": 0.0777747705578804, "epoch": 3.495663775850567, "step": 10480 }, { "distill_loss": 0.12232057750225067, "epoch": 3.495663775850567, "step": 10480 }, { "epoch": 3.495663775850567, "ref_ce_loss": 0.09364210814237595, "step": 10480 }, { "epoch": 3.495663775850567, "loss": 0.3053067624568939, "step": 10480 }, { "ce_loss": 0.09775067120790482, "epoch": 3.495663775850567, "step": 10480 }, { "distill_loss": 0.12994107604026794, "epoch": 3.495663775850567, "step": 10480 }, { "epoch": 3.495663775850567, "ref_ce_loss": 0.07745039463043213, "step": 10480 }, { "epoch": 3.498999332888592, "loss": 0.4981, "step": 10490 }, { "epoch": 3.498999332888592, "grad_norm": 2.8985745906829834, "step": 10490 }, { "epoch": 3.498999332888592, "learning_rate": 0.00018732074819875872, "step": 10490 }, { "epoch": 3.498999332888592, "loss": 0.3826335668563843, "step": 10490 }, { "ce_loss": 0.09962109476327896, "epoch": 3.498999332888592, "step": 10490 }, { "distill_loss": 0.11335495859384537, "epoch": 3.498999332888592, "step": 10490 }, { "epoch": 3.498999332888592, "ref_ce_loss": 0.07441458851099014, "step": 10490 }, { "epoch": 3.498999332888592, "loss": 0.4468466639518738, "step": 10490 }, { "ce_loss": 0.06108476594090462, "epoch": 3.498999332888592, "step": 10490 }, { "distill_loss": 0.10852596163749695, "epoch": 3.498999332888592, "step": 10490 }, { "epoch": 3.498999332888592, "ref_ce_loss": 0.07920630276203156, "step": 10490 }, { "epoch": 3.498999332888592, "loss": 0.42822498083114624, "step": 10490 }, { "ce_loss": 0.12201324850320816, "epoch": 3.498999332888592, "step": 10490 }, { "distill_loss": 0.10355043411254883, "epoch": 3.498999332888592, "step": 10490 }, { "epoch": 3.498999332888592, "ref_ce_loss": 0.15949228405952454, "step": 10490 }, { "epoch": 3.498999332888592, "loss": 0.5487578511238098, "step": 10490 }, { "ce_loss": 0.17656852304935455, "epoch": 3.498999332888592, "step": 10490 }, { "distill_loss": 0.14200134575366974, "epoch": 3.498999332888592, "step": 10490 }, { "epoch": 3.498999332888592, "ref_ce_loss": 0.10942427814006805, "step": 10490 }, { "epoch": 3.502334889926618, "loss": 0.472, "step": 10500 }, { "epoch": 3.502334889926618, "grad_norm": 1.802612543106079, "step": 10500 }, { "epoch": 3.502334889926618, "learning_rate": 0.0001871245226320757, "step": 10500 }, { "epoch": 3.502334889926618, "loss": 0.6035943031311035, "step": 10500 }, { "ce_loss": 0.17178229987621307, "epoch": 3.502334889926618, "step": 10500 }, { "distill_loss": 0.14971838891506195, "epoch": 3.502334889926618, "step": 10500 }, { "epoch": 3.502334889926618, "ref_ce_loss": 0.0989990159869194, "step": 10500 }, { "epoch": 3.502334889926618, "loss": 0.36782369017601013, "step": 10500 }, { "ce_loss": 0.12339980900287628, "epoch": 3.502334889926618, "step": 10500 }, { "distill_loss": 0.14437860250473022, "epoch": 3.502334889926618, "step": 10500 }, { "epoch": 3.502334889926618, "ref_ce_loss": 0.09994472563266754, "step": 10500 }, { "epoch": 3.502334889926618, "loss": 0.4430643916130066, "step": 10500 }, { "ce_loss": 0.17111869156360626, "epoch": 3.502334889926618, "step": 10500 }, { "distill_loss": 0.1422690451145172, "epoch": 3.502334889926618, "step": 10500 }, { "epoch": 3.502334889926618, "ref_ce_loss": 0.1295473873615265, "step": 10500 }, { "epoch": 3.502334889926618, "loss": 0.5198507905006409, "step": 10500 }, { "ce_loss": 0.07539082318544388, "epoch": 3.502334889926618, "step": 10500 }, { "distill_loss": 0.13079798221588135, "epoch": 3.502334889926618, "step": 10500 }, { "epoch": 3.502334889926618, "ref_ce_loss": 0.1290043443441391, "step": 10500 }, { "epoch": 3.5056704469646434, "loss": 0.5079, "step": 10510 }, { "epoch": 3.5056704469646434, "grad_norm": 4.614372253417969, "step": 10510 }, { "epoch": 3.5056704469646434, "learning_rate": 0.00018692822936490784, "step": 10510 }, { "epoch": 3.5056704469646434, "loss": 0.41191405057907104, "step": 10510 }, { "ce_loss": 0.06786329299211502, "epoch": 3.5056704469646434, "step": 10510 }, { "distill_loss": 0.1034030169248581, "epoch": 3.5056704469646434, "step": 10510 }, { "epoch": 3.5056704469646434, "ref_ce_loss": 0.1216384768486023, "step": 10510 }, { "epoch": 3.5056704469646434, "loss": 0.4477238655090332, "step": 10510 }, { "ce_loss": 0.11512485891580582, "epoch": 3.5056704469646434, "step": 10510 }, { "distill_loss": 0.13957633078098297, "epoch": 3.5056704469646434, "step": 10510 }, { "epoch": 3.5056704469646434, "ref_ce_loss": 0.09763330221176147, "step": 10510 }, { "epoch": 3.5056704469646434, "loss": 0.6907591223716736, "step": 10510 }, { "ce_loss": 0.20804905891418457, "epoch": 3.5056704469646434, "step": 10510 }, { "distill_loss": 0.21027851104736328, "epoch": 3.5056704469646434, "step": 10510 }, { "epoch": 3.5056704469646434, "ref_ce_loss": 0.11685810983181, "step": 10510 }, { "epoch": 3.5056704469646434, "loss": 0.5303629636764526, "step": 10510 }, { "ce_loss": 0.17830605804920197, "epoch": 3.5056704469646434, "step": 10510 }, { "distill_loss": 0.20348048210144043, "epoch": 3.5056704469646434, "step": 10510 }, { "epoch": 3.5056704469646434, "ref_ce_loss": 0.10697298496961594, "step": 10510 }, { "epoch": 3.5090060040026687, "loss": 0.5298, "step": 10520 }, { "epoch": 3.5090060040026687, "grad_norm": 2.2804954051971436, "step": 10520 }, { "epoch": 3.5090060040026687, "learning_rate": 0.00018673186875521657, "step": 10520 }, { "epoch": 3.5090060040026687, "loss": 0.39185506105422974, "step": 10520 }, { "ce_loss": 0.15698541700839996, "epoch": 3.5090060040026687, "step": 10520 }, { "distill_loss": 0.1254022717475891, "epoch": 3.5090060040026687, "step": 10520 }, { "epoch": 3.5090060040026687, "ref_ce_loss": 0.10929703712463379, "step": 10520 }, { "epoch": 3.5090060040026687, "loss": 0.27361857891082764, "step": 10520 }, { "ce_loss": 0.10073763132095337, "epoch": 3.5090060040026687, "step": 10520 }, { "distill_loss": 0.1090550348162651, "epoch": 3.5090060040026687, "step": 10520 }, { "epoch": 3.5090060040026687, "ref_ce_loss": 0.06371267884969711, "step": 10520 }, { "epoch": 3.5090060040026687, "loss": 0.47119683027267456, "step": 10520 }, { "ce_loss": 0.1285894811153412, "epoch": 3.5090060040026687, "step": 10520 }, { "distill_loss": 0.15611502528190613, "epoch": 3.5090060040026687, "step": 10520 }, { "epoch": 3.5090060040026687, "ref_ce_loss": 0.1049136221408844, "step": 10520 }, { "epoch": 3.5090060040026687, "loss": 0.28055694699287415, "step": 10520 }, { "ce_loss": 0.037441760301589966, "epoch": 3.5090060040026687, "step": 10520 }, { "distill_loss": 0.09433382004499435, "epoch": 3.5090060040026687, "step": 10520 }, { "epoch": 3.5090060040026687, "ref_ce_loss": 0.07896076887845993, "step": 10520 }, { "epoch": 3.512341561040694, "loss": 0.4552, "step": 10530 }, { "epoch": 3.512341561040694, "grad_norm": 2.3658037185668945, "step": 10530 }, { "epoch": 3.512341561040694, "learning_rate": 0.00018653544116108625, "step": 10530 }, { "epoch": 3.512341561040694, "loss": 0.5972455143928528, "step": 10530 }, { "ce_loss": 0.16446569561958313, "epoch": 3.512341561040694, "step": 10530 }, { "distill_loss": 0.1522887945175171, "epoch": 3.512341561040694, "step": 10530 }, { "epoch": 3.512341561040694, "ref_ce_loss": 0.1197158694267273, "step": 10530 }, { "epoch": 3.512341561040694, "loss": 0.522158682346344, "step": 10530 }, { "ce_loss": 0.250407874584198, "epoch": 3.512341561040694, "step": 10530 }, { "distill_loss": 0.12939266860485077, "epoch": 3.512341561040694, "step": 10530 }, { "epoch": 3.512341561040694, "ref_ce_loss": 0.11329744011163712, "step": 10530 }, { "epoch": 3.512341561040694, "loss": 0.294116348028183, "step": 10530 }, { "ce_loss": 0.09095935523509979, "epoch": 3.512341561040694, "step": 10530 }, { "distill_loss": 0.10605276376008987, "epoch": 3.512341561040694, "step": 10530 }, { "epoch": 3.512341561040694, "ref_ce_loss": 0.054930780082941055, "step": 10530 }, { "epoch": 3.512341561040694, "loss": 0.24210438132286072, "step": 10530 }, { "ce_loss": 0.029579443857073784, "epoch": 3.512341561040694, "step": 10530 }, { "distill_loss": 0.1047045886516571, "epoch": 3.512341561040694, "step": 10530 }, { "epoch": 3.512341561040694, "ref_ce_loss": 0.05087565630674362, "step": 10530 }, { "epoch": 3.5156771180787194, "loss": 0.5145, "step": 10540 }, { "epoch": 3.5156771180787194, "grad_norm": 2.366520643234253, "step": 10540 }, { "epoch": 3.5156771180787194, "learning_rate": 0.00018633894694072337, "step": 10540 }, { "epoch": 3.5156771180787194, "loss": 0.6220971941947937, "step": 10540 }, { "ce_loss": 0.12055147439241409, "epoch": 3.5156771180787194, "step": 10540 }, { "distill_loss": 0.1395721137523651, "epoch": 3.5156771180787194, "step": 10540 }, { "epoch": 3.5156771180787194, "ref_ce_loss": 0.17843414843082428, "step": 10540 }, { "epoch": 3.5156771180787194, "loss": 0.5132659673690796, "step": 10540 }, { "ce_loss": 0.11431053280830383, "epoch": 3.5156771180787194, "step": 10540 }, { "distill_loss": 0.10479498654603958, "epoch": 3.5156771180787194, "step": 10540 }, { "epoch": 3.5156771180787194, "ref_ce_loss": 0.12466438114643097, "step": 10540 }, { "epoch": 3.5156771180787194, "loss": 0.805692732334137, "step": 10540 }, { "ce_loss": 0.22681838274002075, "epoch": 3.5156771180787194, "step": 10540 }, { "distill_loss": 0.16361652314662933, "epoch": 3.5156771180787194, "step": 10540 }, { "epoch": 3.5156771180787194, "ref_ce_loss": 0.1392831951379776, "step": 10540 }, { "epoch": 3.5156771180787194, "loss": 0.5683649182319641, "step": 10540 }, { "ce_loss": 0.1613687127828598, "epoch": 3.5156771180787194, "step": 10540 }, { "distill_loss": 0.17765222489833832, "epoch": 3.5156771180787194, "step": 10540 }, { "epoch": 3.5156771180787194, "ref_ce_loss": 0.15474840998649597, "step": 10540 }, { "epoch": 3.5190126751167448, "loss": 0.5035, "step": 10550 }, { "epoch": 3.5190126751167448, "grad_norm": 2.3484420776367188, "step": 10550 }, { "epoch": 3.5190126751167448, "learning_rate": 0.00018614238645245574, "step": 10550 }, { "epoch": 3.5190126751167448, "loss": 0.3599805235862732, "step": 10550 }, { "ce_loss": 0.11466601490974426, "epoch": 3.5190126751167448, "step": 10550 }, { "distill_loss": 0.15461599826812744, "epoch": 3.5190126751167448, "step": 10550 }, { "epoch": 3.5190126751167448, "ref_ce_loss": 0.08937636762857437, "step": 10550 }, { "epoch": 3.5190126751167448, "loss": 0.5544697046279907, "step": 10550 }, { "ce_loss": 0.09430625289678574, "epoch": 3.5190126751167448, "step": 10550 }, { "distill_loss": 0.12568268179893494, "epoch": 3.5190126751167448, "step": 10550 }, { "epoch": 3.5190126751167448, "ref_ce_loss": 0.13568513095378876, "step": 10550 }, { "epoch": 3.5190126751167448, "loss": 0.5882728099822998, "step": 10550 }, { "ce_loss": 0.12761719524860382, "epoch": 3.5190126751167448, "step": 10550 }, { "distill_loss": 0.14731423556804657, "epoch": 3.5190126751167448, "step": 10550 }, { "epoch": 3.5190126751167448, "ref_ce_loss": 0.05525919422507286, "step": 10550 }, { "epoch": 3.5190126751167448, "loss": 0.4472968578338623, "step": 10550 }, { "ce_loss": 0.14880354702472687, "epoch": 3.5190126751167448, "step": 10550 }, { "distill_loss": 0.16218119859695435, "epoch": 3.5190126751167448, "step": 10550 }, { "epoch": 3.5190126751167448, "ref_ce_loss": 0.08404110372066498, "step": 10550 }, { "epoch": 3.52234823215477, "loss": 0.5271, "step": 10560 }, { "epoch": 3.52234823215477, "grad_norm": 2.0116426944732666, "step": 10560 }, { "epoch": 3.52234823215477, "learning_rate": 0.00018594576005473228, "step": 10560 }, { "epoch": 3.52234823215477, "loss": 0.33950960636138916, "step": 10560 }, { "ce_loss": 0.10914910584688187, "epoch": 3.52234823215477, "step": 10560 }, { "distill_loss": 0.14298668503761292, "epoch": 3.52234823215477, "step": 10560 }, { "epoch": 3.52234823215477, "ref_ce_loss": 0.08681947737932205, "step": 10560 }, { "epoch": 3.52234823215477, "loss": 1.114986538887024, "step": 10560 }, { "ce_loss": 0.1504950225353241, "epoch": 3.52234823215477, "step": 10560 }, { "distill_loss": 0.12499750405550003, "epoch": 3.52234823215477, "step": 10560 }, { "epoch": 3.52234823215477, "ref_ce_loss": 0.10533667355775833, "step": 10560 }, { "epoch": 3.52234823215477, "loss": 0.3771263360977173, "step": 10560 }, { "ce_loss": 0.09664015471935272, "epoch": 3.52234823215477, "step": 10560 }, { "distill_loss": 0.12093089520931244, "epoch": 3.52234823215477, "step": 10560 }, { "epoch": 3.52234823215477, "ref_ce_loss": 0.08612294495105743, "step": 10560 }, { "epoch": 3.52234823215477, "loss": 0.4830138385295868, "step": 10560 }, { "ce_loss": 0.12343651801347733, "epoch": 3.52234823215477, "step": 10560 }, { "distill_loss": 0.17175374925136566, "epoch": 3.52234823215477, "step": 10560 }, { "epoch": 3.52234823215477, "ref_ce_loss": 0.08810058981180191, "step": 10560 }, { "epoch": 3.5256837891927955, "loss": 0.5195, "step": 10570 }, { "epoch": 3.5256837891927955, "grad_norm": 3.038257122039795, "step": 10570 }, { "epoch": 3.5256837891927955, "learning_rate": 0.00018574906810612187, "step": 10570 }, { "epoch": 3.5256837891927955, "loss": 0.3658252954483032, "step": 10570 }, { "ce_loss": 0.13670586049556732, "epoch": 3.5256837891927955, "step": 10570 }, { "distill_loss": 0.1163606196641922, "epoch": 3.5256837891927955, "step": 10570 }, { "epoch": 3.5256837891927955, "ref_ce_loss": 0.08734627068042755, "step": 10570 }, { "epoch": 3.5256837891927955, "loss": 0.37185975909233093, "step": 10570 }, { "ce_loss": 0.10969644784927368, "epoch": 3.5256837891927955, "step": 10570 }, { "distill_loss": 0.12583912909030914, "epoch": 3.5256837891927955, "step": 10570 }, { "epoch": 3.5256837891927955, "ref_ce_loss": 0.10249805450439453, "step": 10570 }, { "epoch": 3.5256837891927955, "loss": 0.48817941546440125, "step": 10570 }, { "ce_loss": 0.1386563777923584, "epoch": 3.5256837891927955, "step": 10570 }, { "distill_loss": 0.18367363512516022, "epoch": 3.5256837891927955, "step": 10570 }, { "epoch": 3.5256837891927955, "ref_ce_loss": 0.12123288214206696, "step": 10570 }, { "epoch": 3.5256837891927955, "loss": 0.39616870880126953, "step": 10570 }, { "ce_loss": 0.0724596455693245, "epoch": 3.5256837891927955, "step": 10570 }, { "distill_loss": 0.1940024048089981, "epoch": 3.5256837891927955, "step": 10570 }, { "epoch": 3.5256837891927955, "ref_ce_loss": 0.09160967916250229, "step": 10570 }, { "epoch": 3.529019346230821, "loss": 0.5065, "step": 10580 }, { "epoch": 3.529019346230821, "grad_norm": 2.6603708267211914, "step": 10580 }, { "epoch": 3.529019346230821, "learning_rate": 0.0001855523109653131, "step": 10580 }, { "epoch": 3.529019346230821, "loss": 0.46012818813323975, "step": 10580 }, { "ce_loss": 0.13441084325313568, "epoch": 3.529019346230821, "step": 10580 }, { "distill_loss": 0.18987199664115906, "epoch": 3.529019346230821, "step": 10580 }, { "epoch": 3.529019346230821, "ref_ce_loss": 0.1355372965335846, "step": 10580 }, { "epoch": 3.529019346230821, "loss": 0.367043673992157, "step": 10580 }, { "ce_loss": 0.10793867707252502, "epoch": 3.529019346230821, "step": 10580 }, { "distill_loss": 0.13718649744987488, "epoch": 3.529019346230821, "step": 10580 }, { "epoch": 3.529019346230821, "ref_ce_loss": 0.08491414785385132, "step": 10580 }, { "epoch": 3.529019346230821, "loss": 0.4276635944843292, "step": 10580 }, { "ce_loss": 0.1227010190486908, "epoch": 3.529019346230821, "step": 10580 }, { "distill_loss": 0.16282536089420319, "epoch": 3.529019346230821, "step": 10580 }, { "epoch": 3.529019346230821, "ref_ce_loss": 0.08905931562185287, "step": 10580 }, { "epoch": 3.529019346230821, "loss": 0.573056161403656, "step": 10580 }, { "ce_loss": 0.1919986456632614, "epoch": 3.529019346230821, "step": 10580 }, { "distill_loss": 0.17559261620044708, "epoch": 3.529019346230821, "step": 10580 }, { "epoch": 3.529019346230821, "ref_ce_loss": 0.14072009921073914, "step": 10580 }, { "epoch": 3.532354903268846, "loss": 0.4826, "step": 10590 }, { "epoch": 3.532354903268846, "grad_norm": 1.7876719236373901, "step": 10590 }, { "epoch": 3.532354903268846, "learning_rate": 0.00018535548899111342, "step": 10590 }, { "epoch": 3.532354903268846, "loss": 0.4782566428184509, "step": 10590 }, { "ce_loss": 0.10534249991178513, "epoch": 3.532354903268846, "step": 10590 }, { "distill_loss": 0.14548616111278534, "epoch": 3.532354903268846, "step": 10590 }, { "epoch": 3.532354903268846, "ref_ce_loss": 0.08125923573970795, "step": 10590 }, { "epoch": 3.532354903268846, "loss": 0.4691029489040375, "step": 10590 }, { "ce_loss": 0.16666607558727264, "epoch": 3.532354903268846, "step": 10590 }, { "distill_loss": 0.1389254778623581, "epoch": 3.532354903268846, "step": 10590 }, { "epoch": 3.532354903268846, "ref_ce_loss": 0.11366435140371323, "step": 10590 }, { "epoch": 3.532354903268846, "loss": 0.3638518750667572, "step": 10590 }, { "ce_loss": 0.09428536146879196, "epoch": 3.532354903268846, "step": 10590 }, { "distill_loss": 0.15336307883262634, "epoch": 3.532354903268846, "step": 10590 }, { "epoch": 3.532354903268846, "ref_ce_loss": 0.11602702736854553, "step": 10590 }, { "epoch": 3.532354903268846, "loss": 0.8029030561447144, "step": 10590 }, { "ce_loss": 0.23406115174293518, "epoch": 3.532354903268846, "step": 10590 }, { "distill_loss": 0.14278815686702728, "epoch": 3.532354903268846, "step": 10590 }, { "epoch": 3.532354903268846, "ref_ce_loss": 0.14123186469078064, "step": 10590 }, { "epoch": 3.5356904603068715, "loss": 0.5171, "step": 10600 }, { "epoch": 3.5356904603068715, "grad_norm": 4.143054485321045, "step": 10600 }, { "epoch": 3.5356904603068715, "learning_rate": 0.00018515860254244844, "step": 10600 }, { "epoch": 3.5356904603068715, "loss": 0.27573245763778687, "step": 10600 }, { "ce_loss": 0.08236755430698395, "epoch": 3.5356904603068715, "step": 10600 }, { "distill_loss": 0.09246663004159927, "epoch": 3.5356904603068715, "step": 10600 }, { "epoch": 3.5356904603068715, "ref_ce_loss": 0.06662546843290329, "step": 10600 }, { "epoch": 3.5356904603068715, "loss": 0.510116696357727, "step": 10600 }, { "ce_loss": 0.17182530462741852, "epoch": 3.5356904603068715, "step": 10600 }, { "distill_loss": 0.118833988904953, "epoch": 3.5356904603068715, "step": 10600 }, { "epoch": 3.5356904603068715, "ref_ce_loss": 0.1660315990447998, "step": 10600 }, { "epoch": 3.5356904603068715, "loss": 0.4465329945087433, "step": 10600 }, { "ce_loss": 0.1422976702451706, "epoch": 3.5356904603068715, "step": 10600 }, { "distill_loss": 0.12561358511447906, "epoch": 3.5356904603068715, "step": 10600 }, { "epoch": 3.5356904603068715, "ref_ce_loss": 0.06846870481967926, "step": 10600 }, { "epoch": 3.5356904603068715, "loss": 0.8853085041046143, "step": 10600 }, { "ce_loss": 0.10994315892457962, "epoch": 3.5356904603068715, "step": 10600 }, { "distill_loss": 0.12597742676734924, "epoch": 3.5356904603068715, "step": 10600 }, { "epoch": 3.5356904603068715, "ref_ce_loss": 0.07962527126073837, "step": 10600 }, { "epoch": 3.539026017344897, "loss": 0.4569, "step": 10610 }, { "epoch": 3.539026017344897, "grad_norm": 1.6940289735794067, "step": 10610 }, { "epoch": 3.539026017344897, "learning_rate": 0.0001849616519783613, "step": 10610 }, { "epoch": 3.539026017344897, "loss": 0.5881628394126892, "step": 10610 }, { "ce_loss": 0.16431401669979095, "epoch": 3.539026017344897, "step": 10610 }, { "distill_loss": 0.14111711084842682, "epoch": 3.539026017344897, "step": 10610 }, { "epoch": 3.539026017344897, "ref_ce_loss": 0.08906078338623047, "step": 10610 }, { "epoch": 3.539026017344897, "loss": 0.31506747007369995, "step": 10610 }, { "ce_loss": 0.05988002568483353, "epoch": 3.539026017344897, "step": 10610 }, { "distill_loss": 0.12606583535671234, "epoch": 3.539026017344897, "step": 10610 }, { "epoch": 3.539026017344897, "ref_ce_loss": 0.12904591858386993, "step": 10610 }, { "epoch": 3.539026017344897, "loss": 0.6737792491912842, "step": 10610 }, { "ce_loss": 0.06801124662160873, "epoch": 3.539026017344897, "step": 10610 }, { "distill_loss": 0.10389474034309387, "epoch": 3.539026017344897, "step": 10610 }, { "epoch": 3.539026017344897, "ref_ce_loss": 0.11164550483226776, "step": 10610 }, { "epoch": 3.539026017344897, "loss": 0.43432092666625977, "step": 10610 }, { "ce_loss": 0.12314651161432266, "epoch": 3.539026017344897, "step": 10610 }, { "distill_loss": 0.10934734344482422, "epoch": 3.539026017344897, "step": 10610 }, { "epoch": 3.539026017344897, "ref_ce_loss": 0.10453308373689651, "step": 10610 }, { "epoch": 3.542361574382922, "loss": 0.4952, "step": 10620 }, { "epoch": 3.542361574382922, "grad_norm": 2.9235355854034424, "step": 10620 }, { "epoch": 3.542361574382922, "learning_rate": 0.00018476463765801216, "step": 10620 }, { "epoch": 3.542361574382922, "loss": 0.49110227823257446, "step": 10620 }, { "ce_loss": 0.10042405873537064, "epoch": 3.542361574382922, "step": 10620 }, { "distill_loss": 0.1030670553445816, "epoch": 3.542361574382922, "step": 10620 }, { "epoch": 3.542361574382922, "ref_ce_loss": 0.09770134836435318, "step": 10620 }, { "epoch": 3.542361574382922, "loss": 0.35014456510543823, "step": 10620 }, { "ce_loss": 0.1137394830584526, "epoch": 3.542361574382922, "step": 10620 }, { "distill_loss": 0.09898874908685684, "epoch": 3.542361574382922, "step": 10620 }, { "epoch": 3.542361574382922, "ref_ce_loss": 0.13730838894844055, "step": 10620 }, { "epoch": 3.542361574382922, "loss": 0.2383749634027481, "step": 10620 }, { "ce_loss": 0.06447065621614456, "epoch": 3.542361574382922, "step": 10620 }, { "distill_loss": 0.08753521740436554, "epoch": 3.542361574382922, "step": 10620 }, { "epoch": 3.542361574382922, "ref_ce_loss": 0.0862131118774414, "step": 10620 }, { "epoch": 3.542361574382922, "loss": 0.35761362314224243, "step": 10620 }, { "ce_loss": 0.07475518435239792, "epoch": 3.542361574382922, "step": 10620 }, { "distill_loss": 0.12471903860569, "epoch": 3.542361574382922, "step": 10620 }, { "epoch": 3.542361574382922, "ref_ce_loss": 0.0990569218993187, "step": 10620 }, { "epoch": 3.5456971314209476, "loss": 0.4151, "step": 10630 }, { "epoch": 3.5456971314209476, "grad_norm": 2.373331069946289, "step": 10630 }, { "epoch": 3.5456971314209476, "learning_rate": 0.00018456755994067758, "step": 10630 }, { "epoch": 3.5456971314209476, "loss": 0.35218167304992676, "step": 10630 }, { "ce_loss": 0.12417849153280258, "epoch": 3.5456971314209476, "step": 10630 }, { "distill_loss": 0.10794119536876678, "epoch": 3.5456971314209476, "step": 10630 }, { "epoch": 3.5456971314209476, "ref_ce_loss": 0.09236135333776474, "step": 10630 }, { "epoch": 3.5456971314209476, "loss": 0.3189576268196106, "step": 10630 }, { "ce_loss": 0.057388123124837875, "epoch": 3.5456971314209476, "step": 10630 }, { "distill_loss": 0.08465489745140076, "epoch": 3.5456971314209476, "step": 10630 }, { "epoch": 3.5456971314209476, "ref_ce_loss": 0.051392436027526855, "step": 10630 }, { "epoch": 3.5456971314209476, "loss": 0.43372079730033875, "step": 10630 }, { "ce_loss": 0.17675387859344482, "epoch": 3.5456971314209476, "step": 10630 }, { "distill_loss": 0.10880421847105026, "epoch": 3.5456971314209476, "step": 10630 }, { "epoch": 3.5456971314209476, "ref_ce_loss": 0.10260260105133057, "step": 10630 }, { "epoch": 3.5456971314209476, "loss": 0.32213619351387024, "step": 10630 }, { "ce_loss": 0.11360274255275726, "epoch": 3.5456971314209476, "step": 10630 }, { "distill_loss": 0.10571669787168503, "epoch": 3.5456971314209476, "step": 10630 }, { "epoch": 3.5456971314209476, "ref_ce_loss": 0.0778336301445961, "step": 10630 }, { "epoch": 3.549032688458973, "loss": 0.4453, "step": 10640 }, { "epoch": 3.549032688458973, "grad_norm": 2.135650157928467, "step": 10640 }, { "epoch": 3.549032688458973, "learning_rate": 0.00018437041918574937, "step": 10640 }, { "epoch": 3.549032688458973, "loss": 0.2195785492658615, "step": 10640 }, { "ce_loss": 0.04529380798339844, "epoch": 3.549032688458973, "step": 10640 }, { "distill_loss": 0.0795193761587143, "epoch": 3.549032688458973, "step": 10640 }, { "epoch": 3.549032688458973, "ref_ce_loss": 0.0945015624165535, "step": 10640 }, { "epoch": 3.549032688458973, "loss": 0.4393448531627655, "step": 10640 }, { "ce_loss": 0.1371816247701645, "epoch": 3.549032688458973, "step": 10640 }, { "distill_loss": 0.11540985852479935, "epoch": 3.549032688458973, "step": 10640 }, { "epoch": 3.549032688458973, "ref_ce_loss": 0.1122361347079277, "step": 10640 }, { "epoch": 3.549032688458973, "loss": 0.9115976691246033, "step": 10640 }, { "ce_loss": 0.15142419934272766, "epoch": 3.549032688458973, "step": 10640 }, { "distill_loss": 0.10086096078157425, "epoch": 3.549032688458973, "step": 10640 }, { "epoch": 3.549032688458973, "ref_ce_loss": 0.13035528361797333, "step": 10640 }, { "epoch": 3.549032688458973, "loss": 0.7502701282501221, "step": 10640 }, { "ce_loss": 0.14902283251285553, "epoch": 3.549032688458973, "step": 10640 }, { "distill_loss": 0.11329962313175201, "epoch": 3.549032688458973, "step": 10640 }, { "epoch": 3.549032688458973, "ref_ce_loss": 0.12520849704742432, "step": 10640 }, { "epoch": 3.5523682454969983, "loss": 0.4406, "step": 10650 }, { "epoch": 3.5523682454969983, "grad_norm": 2.5156853199005127, "step": 10650 }, { "epoch": 3.5523682454969983, "learning_rate": 0.00018417321575273462, "step": 10650 }, { "epoch": 3.5523682454969983, "loss": 0.2595893442630768, "step": 10650 }, { "ce_loss": 0.07727423310279846, "epoch": 3.5523682454969983, "step": 10650 }, { "distill_loss": 0.08966349065303802, "epoch": 3.5523682454969983, "step": 10650 }, { "epoch": 3.5523682454969983, "ref_ce_loss": 0.06863352656364441, "step": 10650 }, { "epoch": 3.5523682454969983, "loss": 0.6745221018791199, "step": 10650 }, { "ce_loss": 0.1965639591217041, "epoch": 3.5523682454969983, "step": 10650 }, { "distill_loss": 0.12107309699058533, "epoch": 3.5523682454969983, "step": 10650 }, { "epoch": 3.5523682454969983, "ref_ce_loss": 0.11794691532850266, "step": 10650 }, { "epoch": 3.5523682454969983, "loss": 0.4867534935474396, "step": 10650 }, { "ce_loss": 0.19840309023857117, "epoch": 3.5523682454969983, "step": 10650 }, { "distill_loss": 0.12193414568901062, "epoch": 3.5523682454969983, "step": 10650 }, { "epoch": 3.5523682454969983, "ref_ce_loss": 0.11920852214097977, "step": 10650 }, { "epoch": 3.5523682454969983, "loss": 0.2611696720123291, "step": 10650 }, { "ce_loss": 0.05607572942972183, "epoch": 3.5523682454969983, "step": 10650 }, { "distill_loss": 0.07235915213823318, "epoch": 3.5523682454969983, "step": 10650 }, { "epoch": 3.5523682454969983, "ref_ce_loss": 0.08902841061353683, "step": 10650 }, { "epoch": 3.5557038025350236, "loss": 0.4682, "step": 10660 }, { "epoch": 3.5557038025350236, "grad_norm": 1.9032304286956787, "step": 10660 }, { "epoch": 3.5557038025350236, "learning_rate": 0.00018397595000125454, "step": 10660 }, { "epoch": 3.5557038025350236, "loss": 0.5261881947517395, "step": 10660 }, { "ce_loss": 0.11859346181154251, "epoch": 3.5557038025350236, "step": 10660 }, { "distill_loss": 0.11338892579078674, "epoch": 3.5557038025350236, "step": 10660 }, { "epoch": 3.5557038025350236, "ref_ce_loss": 0.07596824318170547, "step": 10660 }, { "epoch": 3.5557038025350236, "loss": 0.29960116744041443, "step": 10660 }, { "ce_loss": 0.10883176326751709, "epoch": 3.5557038025350236, "step": 10660 }, { "distill_loss": 0.09616100788116455, "epoch": 3.5557038025350236, "step": 10660 }, { "epoch": 3.5557038025350236, "ref_ce_loss": 0.0670178085565567, "step": 10660 }, { "epoch": 3.5557038025350236, "loss": 0.5896754264831543, "step": 10660 }, { "ce_loss": 0.16429340839385986, "epoch": 3.5557038025350236, "step": 10660 }, { "distill_loss": 0.09835916757583618, "epoch": 3.5557038025350236, "step": 10660 }, { "epoch": 3.5557038025350236, "ref_ce_loss": 0.08974584937095642, "step": 10660 }, { "epoch": 3.5557038025350236, "loss": 0.281375527381897, "step": 10660 }, { "ce_loss": 0.07209251821041107, "epoch": 3.5557038025350236, "step": 10660 }, { "distill_loss": 0.10707713663578033, "epoch": 3.5557038025350236, "step": 10660 }, { "epoch": 3.5557038025350236, "ref_ce_loss": 0.10172483325004578, "step": 10660 }, { "epoch": 3.559039359573049, "loss": 0.4404, "step": 10670 }, { "epoch": 3.559039359573049, "grad_norm": 3.276700496673584, "step": 10670 }, { "epoch": 3.559039359573049, "learning_rate": 0.0001837786222910441, "step": 10670 }, { "epoch": 3.559039359573049, "loss": 0.46738162636756897, "step": 10670 }, { "ce_loss": 0.17903101444244385, "epoch": 3.559039359573049, "step": 10670 }, { "distill_loss": 0.15275321900844574, "epoch": 3.559039359573049, "step": 10670 }, { "epoch": 3.559039359573049, "ref_ce_loss": 0.10681027919054031, "step": 10670 }, { "epoch": 3.559039359573049, "loss": 0.3164319694042206, "step": 10670 }, { "ce_loss": 0.09890428930521011, "epoch": 3.559039359573049, "step": 10670 }, { "distill_loss": 0.0927501991391182, "epoch": 3.559039359573049, "step": 10670 }, { "epoch": 3.559039359573049, "ref_ce_loss": 0.08335583657026291, "step": 10670 }, { "epoch": 3.559039359573049, "loss": 0.3328627347946167, "step": 10670 }, { "ce_loss": 0.08496581763029099, "epoch": 3.559039359573049, "step": 10670 }, { "distill_loss": 0.1014360636472702, "epoch": 3.559039359573049, "step": 10670 }, { "epoch": 3.559039359573049, "ref_ce_loss": 0.095091313123703, "step": 10670 }, { "epoch": 3.559039359573049, "loss": 0.3671519458293915, "step": 10670 }, { "ce_loss": 0.14440912008285522, "epoch": 3.559039359573049, "step": 10670 }, { "distill_loss": 0.1046408861875534, "epoch": 3.559039359573049, "step": 10670 }, { "epoch": 3.559039359573049, "ref_ce_loss": 0.11774665862321854, "step": 10670 }, { "epoch": 3.5623749166110743, "loss": 0.454, "step": 10680 }, { "epoch": 3.5623749166110743, "grad_norm": 2.460484504699707, "step": 10680 }, { "epoch": 3.5623749166110743, "learning_rate": 0.00018358123298195119, "step": 10680 }, { "epoch": 3.5623749166110743, "loss": 0.46191108226776123, "step": 10680 }, { "ce_loss": 0.1277770698070526, "epoch": 3.5623749166110743, "step": 10680 }, { "distill_loss": 0.11336661875247955, "epoch": 3.5623749166110743, "step": 10680 }, { "epoch": 3.5623749166110743, "ref_ce_loss": 0.13188976049423218, "step": 10680 }, { "epoch": 3.5623749166110743, "loss": 0.3886517286300659, "step": 10680 }, { "ce_loss": 0.1443498581647873, "epoch": 3.5623749166110743, "step": 10680 }, { "distill_loss": 0.12114789336919785, "epoch": 3.5623749166110743, "step": 10680 }, { "epoch": 3.5623749166110743, "ref_ce_loss": 0.0856688991189003, "step": 10680 }, { "epoch": 3.5623749166110743, "loss": 0.4110898971557617, "step": 10680 }, { "ce_loss": 0.15371674299240112, "epoch": 3.5623749166110743, "step": 10680 }, { "distill_loss": 0.11015059053897858, "epoch": 3.5623749166110743, "step": 10680 }, { "epoch": 3.5623749166110743, "ref_ce_loss": 0.08587423712015152, "step": 10680 }, { "epoch": 3.5623749166110743, "loss": 0.3749980926513672, "step": 10680 }, { "ce_loss": 0.1017778292298317, "epoch": 3.5623749166110743, "step": 10680 }, { "distill_loss": 0.1276194453239441, "epoch": 3.5623749166110743, "step": 10680 }, { "epoch": 3.5623749166110743, "ref_ce_loss": 0.11321152746677399, "step": 10680 }, { "epoch": 3.5657104736490997, "loss": 0.4719, "step": 10690 }, { "epoch": 3.5657104736490997, "grad_norm": 2.446310520172119, "step": 10690 }, { "epoch": 3.5657104736490997, "learning_rate": 0.00018338378243393604, "step": 10690 }, { "epoch": 3.5657104736490997, "loss": 0.44579532742500305, "step": 10690 }, { "ce_loss": 0.11378941684961319, "epoch": 3.5657104736490997, "step": 10690 }, { "distill_loss": 0.13413788378238678, "epoch": 3.5657104736490997, "step": 10690 }, { "epoch": 3.5657104736490997, "ref_ce_loss": 0.09782540798187256, "step": 10690 }, { "epoch": 3.5657104736490997, "loss": 0.5395914316177368, "step": 10690 }, { "ce_loss": 0.12870147824287415, "epoch": 3.5657104736490997, "step": 10690 }, { "distill_loss": 0.126537024974823, "epoch": 3.5657104736490997, "step": 10690 }, { "epoch": 3.5657104736490997, "ref_ce_loss": 0.09157304465770721, "step": 10690 }, { "epoch": 3.5657104736490997, "loss": 0.3987729549407959, "step": 10690 }, { "ce_loss": 0.0682898759841919, "epoch": 3.5657104736490997, "step": 10690 }, { "distill_loss": 0.13631054759025574, "epoch": 3.5657104736490997, "step": 10690 }, { "epoch": 3.5657104736490997, "ref_ce_loss": 0.10172045975923538, "step": 10690 }, { "epoch": 3.5657104736490997, "loss": 0.40862464904785156, "step": 10690 }, { "ce_loss": 0.13958579301834106, "epoch": 3.5657104736490997, "step": 10690 }, { "distill_loss": 0.10409563779830933, "epoch": 3.5657104736490997, "step": 10690 }, { "epoch": 3.5657104736490997, "ref_ce_loss": 0.09258479624986649, "step": 10690 }, { "epoch": 3.569046030687125, "loss": 0.4992, "step": 10700 }, { "epoch": 3.569046030687125, "grad_norm": 3.553321599960327, "step": 10700 }, { "epoch": 3.569046030687125, "learning_rate": 0.00018318627100707052, "step": 10700 }, { "epoch": 3.569046030687125, "loss": 1.3002853393554688, "step": 10700 }, { "ce_loss": 0.13170751929283142, "epoch": 3.569046030687125, "step": 10700 }, { "distill_loss": 0.14222148060798645, "epoch": 3.569046030687125, "step": 10700 }, { "epoch": 3.569046030687125, "ref_ce_loss": 0.10056454688310623, "step": 10700 }, { "epoch": 3.569046030687125, "loss": 0.3703250586986542, "step": 10700 }, { "ce_loss": 0.0983681008219719, "epoch": 3.569046030687125, "step": 10700 }, { "distill_loss": 0.16033944487571716, "epoch": 3.569046030687125, "step": 10700 }, { "epoch": 3.569046030687125, "ref_ce_loss": 0.11107808351516724, "step": 10700 }, { "epoch": 3.569046030687125, "loss": 0.4253198504447937, "step": 10700 }, { "ce_loss": 0.15250420570373535, "epoch": 3.569046030687125, "step": 10700 }, { "distill_loss": 0.1289929747581482, "epoch": 3.569046030687125, "step": 10700 }, { "epoch": 3.569046030687125, "ref_ce_loss": 0.1237674355506897, "step": 10700 }, { "epoch": 3.569046030687125, "loss": 0.5686759352684021, "step": 10700 }, { "ce_loss": 0.23660250008106232, "epoch": 3.569046030687125, "step": 10700 }, { "distill_loss": 0.20011036098003387, "epoch": 3.569046030687125, "step": 10700 }, { "epoch": 3.569046030687125, "ref_ce_loss": 0.12985314428806305, "step": 10700 }, { "epoch": 3.5723815877251504, "loss": 0.5315, "step": 10710 }, { "epoch": 3.5723815877251504, "grad_norm": 2.264549493789673, "step": 10710 }, { "epoch": 3.5723815877251504, "learning_rate": 0.00018298869906153764, "step": 10710 }, { "epoch": 3.5723815877251504, "loss": 0.48029279708862305, "step": 10710 }, { "ce_loss": 0.13935871422290802, "epoch": 3.5723815877251504, "step": 10710 }, { "distill_loss": 0.18435510993003845, "epoch": 3.5723815877251504, "step": 10710 }, { "epoch": 3.5723815877251504, "ref_ce_loss": 0.12784120440483093, "step": 10710 }, { "epoch": 3.5723815877251504, "loss": 0.26222920417785645, "step": 10710 }, { "ce_loss": 0.04506101459264755, "epoch": 3.5723815877251504, "step": 10710 }, { "distill_loss": 0.09920864552259445, "epoch": 3.5723815877251504, "step": 10710 }, { "epoch": 3.5723815877251504, "ref_ce_loss": 0.06999955326318741, "step": 10710 }, { "epoch": 3.5723815877251504, "loss": 0.32495203614234924, "step": 10710 }, { "ce_loss": 0.0796368196606636, "epoch": 3.5723815877251504, "step": 10710 }, { "distill_loss": 0.117865189909935, "epoch": 3.5723815877251504, "step": 10710 }, { "epoch": 3.5723815877251504, "ref_ce_loss": 0.10748656094074249, "step": 10710 }, { "epoch": 3.5723815877251504, "loss": 0.5730641484260559, "step": 10710 }, { "ce_loss": 0.20408304035663605, "epoch": 3.5723815877251504, "step": 10710 }, { "distill_loss": 0.18805451691150665, "epoch": 3.5723815877251504, "step": 10710 }, { "epoch": 3.5723815877251504, "ref_ce_loss": 0.13756021857261658, "step": 10710 }, { "epoch": 3.5757171447631757, "loss": 0.4969, "step": 10720 }, { "epoch": 3.5757171447631757, "grad_norm": 3.3614892959594727, "step": 10720 }, { "epoch": 3.5757171447631757, "learning_rate": 0.00018279106695763065, "step": 10720 }, { "epoch": 3.5757171447631757, "loss": 0.5536407828330994, "step": 10720 }, { "ce_loss": 0.18258598446846008, "epoch": 3.5757171447631757, "step": 10720 }, { "distill_loss": 0.18230997025966644, "epoch": 3.5757171447631757, "step": 10720 }, { "epoch": 3.5757171447631757, "ref_ce_loss": 0.14992879331111908, "step": 10720 }, { "epoch": 3.5757171447631757, "loss": 0.37388432025909424, "step": 10720 }, { "ce_loss": 0.12813162803649902, "epoch": 3.5757171447631757, "step": 10720 }, { "distill_loss": 0.12815801799297333, "epoch": 3.5757171447631757, "step": 10720 }, { "epoch": 3.5757171447631757, "ref_ce_loss": 0.08915966749191284, "step": 10720 }, { "epoch": 3.5757171447631757, "loss": 0.4762488305568695, "step": 10720 }, { "ce_loss": 0.11369907110929489, "epoch": 3.5757171447631757, "step": 10720 }, { "distill_loss": 0.14350983500480652, "epoch": 3.5757171447631757, "step": 10720 }, { "epoch": 3.5757171447631757, "ref_ce_loss": 0.12805962562561035, "step": 10720 }, { "epoch": 3.5757171447631757, "loss": 0.4970819652080536, "step": 10720 }, { "ce_loss": 0.10555389523506165, "epoch": 3.5757171447631757, "step": 10720 }, { "distill_loss": 0.14790529012680054, "epoch": 3.5757171447631757, "step": 10720 }, { "epoch": 3.5757171447631757, "ref_ce_loss": 0.06945054978132248, "step": 10720 }, { "epoch": 3.579052701801201, "loss": 0.4624, "step": 10730 }, { "epoch": 3.579052701801201, "grad_norm": 2.028350591659546, "step": 10730 }, { "epoch": 3.579052701801201, "learning_rate": 0.0001825933750557525, "step": 10730 }, { "epoch": 3.579052701801201, "loss": 1.064501404762268, "step": 10730 }, { "ce_loss": 0.26156678795814514, "epoch": 3.579052701801201, "step": 10730 }, { "distill_loss": 0.19015151262283325, "epoch": 3.579052701801201, "step": 10730 }, { "epoch": 3.579052701801201, "ref_ce_loss": 0.14202123880386353, "step": 10730 }, { "epoch": 3.579052701801201, "loss": 0.721092700958252, "step": 10730 }, { "ce_loss": 0.10923820734024048, "epoch": 3.579052701801201, "step": 10730 }, { "distill_loss": 0.159623384475708, "epoch": 3.579052701801201, "step": 10730 }, { "epoch": 3.579052701801201, "ref_ce_loss": 0.0903729498386383, "step": 10730 }, { "epoch": 3.579052701801201, "loss": 0.6637827157974243, "step": 10730 }, { "ce_loss": 0.12120405584573746, "epoch": 3.579052701801201, "step": 10730 }, { "distill_loss": 0.19847629964351654, "epoch": 3.579052701801201, "step": 10730 }, { "epoch": 3.579052701801201, "ref_ce_loss": 0.09555134922266006, "step": 10730 }, { "epoch": 3.579052701801201, "loss": 0.47606194019317627, "step": 10730 }, { "ce_loss": 0.1550755351781845, "epoch": 3.579052701801201, "step": 10730 }, { "distill_loss": 0.14270076155662537, "epoch": 3.579052701801201, "step": 10730 }, { "epoch": 3.579052701801201, "ref_ce_loss": 0.09250533580780029, "step": 10730 }, { "epoch": 3.5823882588392264, "loss": 0.5166, "step": 10740 }, { "epoch": 3.5823882588392264, "grad_norm": 2.4106342792510986, "step": 10740 }, { "epoch": 3.5823882588392264, "learning_rate": 0.00018239562371641537, "step": 10740 }, { "epoch": 3.5823882588392264, "loss": 1.0733962059020996, "step": 10740 }, { "ce_loss": 0.2264718860387802, "epoch": 3.5823882588392264, "step": 10740 }, { "distill_loss": 0.1669166088104248, "epoch": 3.5823882588392264, "step": 10740 }, { "epoch": 3.5823882588392264, "ref_ce_loss": 0.17645739018917084, "step": 10740 }, { "epoch": 3.5823882588392264, "loss": 0.3027918040752411, "step": 10740 }, { "ce_loss": 0.08049654960632324, "epoch": 3.5823882588392264, "step": 10740 }, { "distill_loss": 0.07265809178352356, "epoch": 3.5823882588392264, "step": 10740 }, { "epoch": 3.5823882588392264, "ref_ce_loss": 0.10848908871412277, "step": 10740 }, { "epoch": 3.5823882588392264, "loss": 0.45776477456092834, "step": 10740 }, { "ce_loss": 0.11973146349191666, "epoch": 3.5823882588392264, "step": 10740 }, { "distill_loss": 0.1842823475599289, "epoch": 3.5823882588392264, "step": 10740 }, { "epoch": 3.5823882588392264, "ref_ce_loss": 0.1533636748790741, "step": 10740 }, { "epoch": 3.5823882588392264, "loss": 0.5457181334495544, "step": 10740 }, { "ce_loss": 0.12401236593723297, "epoch": 3.5823882588392264, "step": 10740 }, { "distill_loss": 0.11291433125734329, "epoch": 3.5823882588392264, "step": 10740 }, { "epoch": 3.5823882588392264, "ref_ce_loss": 0.10846975445747375, "step": 10740 }, { "epoch": 3.5857238158772518, "loss": 0.5116, "step": 10750 }, { "epoch": 3.5857238158772518, "grad_norm": 3.506852865219116, "step": 10750 }, { "epoch": 3.5857238158772518, "learning_rate": 0.00018219781330023954, "step": 10750 }, { "epoch": 3.5857238158772518, "loss": 0.7628376483917236, "step": 10750 }, { "ce_loss": 0.07697892189025879, "epoch": 3.5857238158772518, "step": 10750 }, { "distill_loss": 0.08748073875904083, "epoch": 3.5857238158772518, "step": 10750 }, { "epoch": 3.5857238158772518, "ref_ce_loss": 0.06737169623374939, "step": 10750 }, { "epoch": 3.5857238158772518, "loss": 0.22098787128925323, "step": 10750 }, { "ce_loss": 0.044790759682655334, "epoch": 3.5857238158772518, "step": 10750 }, { "distill_loss": 0.09421558678150177, "epoch": 3.5857238158772518, "step": 10750 }, { "epoch": 3.5857238158772518, "ref_ce_loss": 0.06399078667163849, "step": 10750 }, { "epoch": 3.5857238158772518, "loss": 0.4772864580154419, "step": 10750 }, { "ce_loss": 0.17643699049949646, "epoch": 3.5857238158772518, "step": 10750 }, { "distill_loss": 0.14804883301258087, "epoch": 3.5857238158772518, "step": 10750 }, { "epoch": 3.5857238158772518, "ref_ce_loss": 0.11784358322620392, "step": 10750 }, { "epoch": 3.5857238158772518, "loss": 0.3947041928768158, "step": 10750 }, { "ce_loss": 0.09214789420366287, "epoch": 3.5857238158772518, "step": 10750 }, { "distill_loss": 0.11591991782188416, "epoch": 3.5857238158772518, "step": 10750 }, { "epoch": 3.5857238158772518, "ref_ce_loss": 0.104644276201725, "step": 10750 }, { "epoch": 3.589059372915277, "loss": 0.4357, "step": 10760 }, { "epoch": 3.589059372915277, "grad_norm": 1.944047212600708, "step": 10760 }, { "epoch": 3.589059372915277, "learning_rate": 0.00018199994416795323, "step": 10760 }, { "epoch": 3.589059372915277, "loss": 0.47622978687286377, "step": 10760 }, { "ce_loss": 0.06587539613246918, "epoch": 3.589059372915277, "step": 10760 }, { "distill_loss": 0.1422610878944397, "epoch": 3.589059372915277, "step": 10760 }, { "epoch": 3.589059372915277, "ref_ce_loss": 0.07343024760484695, "step": 10760 }, { "epoch": 3.589059372915277, "loss": 0.8206377625465393, "step": 10760 }, { "ce_loss": 0.12380576133728027, "epoch": 3.589059372915277, "step": 10760 }, { "distill_loss": 0.12928557395935059, "epoch": 3.589059372915277, "step": 10760 }, { "epoch": 3.589059372915277, "ref_ce_loss": 0.10670095682144165, "step": 10760 }, { "epoch": 3.589059372915277, "loss": 0.5720410943031311, "step": 10760 }, { "ce_loss": 0.10976602137088776, "epoch": 3.589059372915277, "step": 10760 }, { "distill_loss": 0.10744272172451019, "epoch": 3.589059372915277, "step": 10760 }, { "epoch": 3.589059372915277, "ref_ce_loss": 0.0642593652009964, "step": 10760 }, { "epoch": 3.589059372915277, "loss": 0.4113171398639679, "step": 10760 }, { "ce_loss": 0.16085822880268097, "epoch": 3.589059372915277, "step": 10760 }, { "distill_loss": 0.11362794786691666, "epoch": 3.589059372915277, "step": 10760 }, { "epoch": 3.589059372915277, "ref_ce_loss": 0.0914236307144165, "step": 10760 }, { "epoch": 3.5923949299533025, "loss": 0.4972, "step": 10770 }, { "epoch": 3.5923949299533025, "grad_norm": 7.0427727699279785, "step": 10770 }, { "epoch": 3.5923949299533025, "learning_rate": 0.0001818020166803918, "step": 10770 }, { "epoch": 3.5923949299533025, "loss": 0.807584285736084, "step": 10770 }, { "ce_loss": 0.22549563646316528, "epoch": 3.5923949299533025, "step": 10770 }, { "distill_loss": 0.20602241158485413, "epoch": 3.5923949299533025, "step": 10770 }, { "epoch": 3.5923949299533025, "ref_ce_loss": 0.24110868573188782, "step": 10770 }, { "epoch": 3.5923949299533025, "loss": 0.4110437035560608, "step": 10770 }, { "ce_loss": 0.10992979258298874, "epoch": 3.5923949299533025, "step": 10770 }, { "distill_loss": 0.15975235402584076, "epoch": 3.5923949299533025, "step": 10770 }, { "epoch": 3.5923949299533025, "ref_ce_loss": 0.09724153578281403, "step": 10770 }, { "epoch": 3.5923949299533025, "loss": 0.44220978021621704, "step": 10770 }, { "ce_loss": 0.1018572449684143, "epoch": 3.5923949299533025, "step": 10770 }, { "distill_loss": 0.16083693504333496, "epoch": 3.5923949299533025, "step": 10770 }, { "epoch": 3.5923949299533025, "ref_ce_loss": 0.08170153200626373, "step": 10770 }, { "epoch": 3.5923949299533025, "loss": 0.46427327394485474, "step": 10770 }, { "ce_loss": 0.11812613904476166, "epoch": 3.5923949299533025, "step": 10770 }, { "distill_loss": 0.17142073810100555, "epoch": 3.5923949299533025, "step": 10770 }, { "epoch": 3.5923949299533025, "ref_ce_loss": 0.11512557417154312, "step": 10770 }, { "epoch": 3.595730486991328, "loss": 0.4756, "step": 10780 }, { "epoch": 3.595730486991328, "grad_norm": 1.7970086336135864, "step": 10780 }, { "epoch": 3.595730486991328, "learning_rate": 0.00018160403119849673, "step": 10780 }, { "epoch": 3.595730486991328, "loss": 0.3645995855331421, "step": 10780 }, { "ce_loss": 0.1401004046201706, "epoch": 3.595730486991328, "step": 10780 }, { "distill_loss": 0.11823718249797821, "epoch": 3.595730486991328, "step": 10780 }, { "epoch": 3.595730486991328, "ref_ce_loss": 0.07413559406995773, "step": 10780 }, { "epoch": 3.595730486991328, "loss": 0.5593521595001221, "step": 10780 }, { "ce_loss": 0.12035626918077469, "epoch": 3.595730486991328, "step": 10780 }, { "distill_loss": 0.18114838004112244, "epoch": 3.595730486991328, "step": 10780 }, { "epoch": 3.595730486991328, "ref_ce_loss": 0.1041061133146286, "step": 10780 }, { "epoch": 3.595730486991328, "loss": 0.5523062348365784, "step": 10780 }, { "ce_loss": 0.11224917322397232, "epoch": 3.595730486991328, "step": 10780 }, { "distill_loss": 0.17238086462020874, "epoch": 3.595730486991328, "step": 10780 }, { "epoch": 3.595730486991328, "ref_ce_loss": 0.10551968216896057, "step": 10780 }, { "epoch": 3.595730486991328, "loss": 0.4836883544921875, "step": 10780 }, { "ce_loss": 0.09018741548061371, "epoch": 3.595730486991328, "step": 10780 }, { "distill_loss": 0.1445465087890625, "epoch": 3.595730486991328, "step": 10780 }, { "epoch": 3.595730486991328, "ref_ce_loss": 0.10304789990186691, "step": 10780 }, { "epoch": 3.599066044029353, "loss": 0.5156, "step": 10790 }, { "epoch": 3.599066044029353, "grad_norm": 2.589780330657959, "step": 10790 }, { "epoch": 3.599066044029353, "learning_rate": 0.00018140598808331557, "step": 10790 }, { "epoch": 3.599066044029353, "loss": 0.3814545273780823, "step": 10790 }, { "ce_loss": 0.14167527854442596, "epoch": 3.599066044029353, "step": 10790 }, { "distill_loss": 0.14271581172943115, "epoch": 3.599066044029353, "step": 10790 }, { "epoch": 3.599066044029353, "ref_ce_loss": 0.06693271547555923, "step": 10790 }, { "epoch": 3.599066044029353, "loss": 0.6055205464363098, "step": 10790 }, { "ce_loss": 0.19398923218250275, "epoch": 3.599066044029353, "step": 10790 }, { "distill_loss": 0.1850125938653946, "epoch": 3.599066044029353, "step": 10790 }, { "epoch": 3.599066044029353, "ref_ce_loss": 0.12055008858442307, "step": 10790 }, { "epoch": 3.599066044029353, "loss": 0.4280034005641937, "step": 10790 }, { "ce_loss": 0.13872362673282623, "epoch": 3.599066044029353, "step": 10790 }, { "distill_loss": 0.15624187886714935, "epoch": 3.599066044029353, "step": 10790 }, { "epoch": 3.599066044029353, "ref_ce_loss": 0.09329959005117416, "step": 10790 }, { "epoch": 3.599066044029353, "loss": 0.5235930681228638, "step": 10790 }, { "ce_loss": 0.21662373840808868, "epoch": 3.599066044029353, "step": 10790 }, { "distill_loss": 0.09369595348834991, "epoch": 3.599066044029353, "step": 10790 }, { "epoch": 3.599066044029353, "ref_ce_loss": 0.09563537687063217, "step": 10790 }, { "epoch": 3.6024016010673785, "loss": 0.4679, "step": 10800 }, { "epoch": 3.6024016010673785, "grad_norm": 1.9319144487380981, "step": 10800 }, { "epoch": 3.6024016010673785, "learning_rate": 0.0001812078876960008, "step": 10800 }, { "epoch": 3.6024016010673785, "loss": 0.2767658829689026, "step": 10800 }, { "ce_loss": 0.07563504576683044, "epoch": 3.6024016010673785, "step": 10800 }, { "distill_loss": 0.13797806203365326, "epoch": 3.6024016010673785, "step": 10800 }, { "epoch": 3.6024016010673785, "ref_ce_loss": 0.06295233219861984, "step": 10800 }, { "epoch": 3.6024016010673785, "loss": 0.3499141037464142, "step": 10800 }, { "ce_loss": 0.10508345067501068, "epoch": 3.6024016010673785, "step": 10800 }, { "distill_loss": 0.1375848948955536, "epoch": 3.6024016010673785, "step": 10800 }, { "epoch": 3.6024016010673785, "ref_ce_loss": 0.0647381991147995, "step": 10800 }, { "epoch": 3.6024016010673785, "loss": 0.5485572814941406, "step": 10800 }, { "ce_loss": 0.1507655829191208, "epoch": 3.6024016010673785, "step": 10800 }, { "distill_loss": 0.1268126517534256, "epoch": 3.6024016010673785, "step": 10800 }, { "epoch": 3.6024016010673785, "ref_ce_loss": 0.12260568141937256, "step": 10800 }, { "epoch": 3.6024016010673785, "loss": 0.6075488328933716, "step": 10800 }, { "ce_loss": 0.16358868777751923, "epoch": 3.6024016010673785, "step": 10800 }, { "distill_loss": 0.1781436651945114, "epoch": 3.6024016010673785, "step": 10800 }, { "epoch": 3.6024016010673785, "ref_ce_loss": 0.13535261154174805, "step": 10800 }, { "epoch": 3.605737158105404, "loss": 0.4998, "step": 10810 }, { "epoch": 3.605737158105404, "grad_norm": 5.021475791931152, "step": 10810 }, { "epoch": 3.605737158105404, "learning_rate": 0.00018100973039780933, "step": 10810 }, { "epoch": 3.605737158105404, "loss": 0.5075588226318359, "step": 10810 }, { "ce_loss": 0.15543168783187866, "epoch": 3.605737158105404, "step": 10810 }, { "distill_loss": 0.14139851927757263, "epoch": 3.605737158105404, "step": 10810 }, { "epoch": 3.605737158105404, "ref_ce_loss": 0.12484012544155121, "step": 10810 }, { "epoch": 3.605737158105404, "loss": 0.26541557908058167, "step": 10810 }, { "ce_loss": 0.08082418888807297, "epoch": 3.605737158105404, "step": 10810 }, { "distill_loss": 0.13024282455444336, "epoch": 3.605737158105404, "step": 10810 }, { "epoch": 3.605737158105404, "ref_ce_loss": 0.054262448102235794, "step": 10810 }, { "epoch": 3.605737158105404, "loss": 0.4680088758468628, "step": 10810 }, { "ce_loss": 0.1070551946759224, "epoch": 3.605737158105404, "step": 10810 }, { "distill_loss": 0.15847979485988617, "epoch": 3.605737158105404, "step": 10810 }, { "epoch": 3.605737158105404, "ref_ce_loss": 0.12366623431444168, "step": 10810 }, { "epoch": 3.605737158105404, "loss": 0.5198716521263123, "step": 10810 }, { "ce_loss": 0.18975123763084412, "epoch": 3.605737158105404, "step": 10810 }, { "distill_loss": 0.1683826595544815, "epoch": 3.605737158105404, "step": 10810 }, { "epoch": 3.605737158105404, "ref_ce_loss": 0.1616154909133911, "step": 10810 }, { "epoch": 3.609072715143429, "loss": 0.4423, "step": 10820 }, { "epoch": 3.609072715143429, "grad_norm": 1.987995982170105, "step": 10820 }, { "epoch": 3.609072715143429, "learning_rate": 0.00018081151655010202, "step": 10820 }, { "epoch": 3.609072715143429, "loss": 0.31308069825172424, "step": 10820 }, { "ce_loss": 0.10505445301532745, "epoch": 3.609072715143429, "step": 10820 }, { "distill_loss": 0.11159270256757736, "epoch": 3.609072715143429, "step": 10820 }, { "epoch": 3.609072715143429, "ref_ce_loss": 0.09632076323032379, "step": 10820 }, { "epoch": 3.609072715143429, "loss": 0.47415441274642944, "step": 10820 }, { "ce_loss": 0.04985740780830383, "epoch": 3.609072715143429, "step": 10820 }, { "distill_loss": 0.12312208116054535, "epoch": 3.609072715143429, "step": 10820 }, { "epoch": 3.609072715143429, "ref_ce_loss": 0.0715782567858696, "step": 10820 }, { "epoch": 3.609072715143429, "loss": 0.4616354703903198, "step": 10820 }, { "ce_loss": 0.11249656975269318, "epoch": 3.609072715143429, "step": 10820 }, { "distill_loss": 0.12125895917415619, "epoch": 3.609072715143429, "step": 10820 }, { "epoch": 3.609072715143429, "ref_ce_loss": 0.09925705194473267, "step": 10820 }, { "epoch": 3.609072715143429, "loss": 0.34156709909439087, "step": 10820 }, { "ce_loss": 0.06891462951898575, "epoch": 3.609072715143429, "step": 10820 }, { "distill_loss": 0.13339316844940186, "epoch": 3.609072715143429, "step": 10820 }, { "epoch": 3.609072715143429, "ref_ce_loss": 0.07170667499303818, "step": 10820 }, { "epoch": 3.6124082721814545, "loss": 0.4969, "step": 10830 }, { "epoch": 3.6124082721814545, "grad_norm": 2.7403793334960938, "step": 10830 }, { "epoch": 3.6124082721814545, "learning_rate": 0.00018061324651434267, "step": 10830 }, { "epoch": 3.6124082721814545, "loss": 0.471358984708786, "step": 10830 }, { "ce_loss": 0.10615131258964539, "epoch": 3.6124082721814545, "step": 10830 }, { "distill_loss": 0.14862710237503052, "epoch": 3.6124082721814545, "step": 10830 }, { "epoch": 3.6124082721814545, "ref_ce_loss": 0.10827454179525375, "step": 10830 }, { "epoch": 3.6124082721814545, "loss": 0.3429005742073059, "step": 10830 }, { "ce_loss": 0.07791846245527267, "epoch": 3.6124082721814545, "step": 10830 }, { "distill_loss": 0.09560949355363846, "epoch": 3.6124082721814545, "step": 10830 }, { "epoch": 3.6124082721814545, "ref_ce_loss": 0.10049939155578613, "step": 10830 }, { "epoch": 3.6124082721814545, "loss": 0.3915432095527649, "step": 10830 }, { "ce_loss": 0.12821824848651886, "epoch": 3.6124082721814545, "step": 10830 }, { "distill_loss": 0.12204201519489288, "epoch": 3.6124082721814545, "step": 10830 }, { "epoch": 3.6124082721814545, "ref_ce_loss": 0.09869548678398132, "step": 10830 }, { "epoch": 3.6124082721814545, "loss": 0.320291131734848, "step": 10830 }, { "ce_loss": 0.08379454910755157, "epoch": 3.6124082721814545, "step": 10830 }, { "distill_loss": 0.14412151277065277, "epoch": 3.6124082721814545, "step": 10830 }, { "epoch": 3.6124082721814545, "ref_ce_loss": 0.09217362850904465, "step": 10830 }, { "epoch": 3.61574382921948, "loss": 0.4848, "step": 10840 }, { "epoch": 3.61574382921948, "grad_norm": 2.4884891510009766, "step": 10840 }, { "epoch": 3.61574382921948, "learning_rate": 0.00018041492065209755, "step": 10840 }, { "epoch": 3.61574382921948, "loss": 0.277363121509552, "step": 10840 }, { "ce_loss": 0.0627259761095047, "epoch": 3.61574382921948, "step": 10840 }, { "distill_loss": 0.12686988711357117, "epoch": 3.61574382921948, "step": 10840 }, { "epoch": 3.61574382921948, "ref_ce_loss": 0.054359905421733856, "step": 10840 }, { "epoch": 3.61574382921948, "loss": 0.41814225912094116, "step": 10840 }, { "ce_loss": 0.13269467651844025, "epoch": 3.61574382921948, "step": 10840 }, { "distill_loss": 0.14560770988464355, "epoch": 3.61574382921948, "step": 10840 }, { "epoch": 3.61574382921948, "ref_ce_loss": 0.0991872251033783, "step": 10840 }, { "epoch": 3.61574382921948, "loss": 0.3902513384819031, "step": 10840 }, { "ce_loss": 0.11417891830205917, "epoch": 3.61574382921948, "step": 10840 }, { "distill_loss": 0.1708763986825943, "epoch": 3.61574382921948, "step": 10840 }, { "epoch": 3.61574382921948, "ref_ce_loss": 0.10510201752185822, "step": 10840 }, { "epoch": 3.61574382921948, "loss": 0.4186701476573944, "step": 10840 }, { "ce_loss": 0.0700027123093605, "epoch": 3.61574382921948, "step": 10840 }, { "distill_loss": 0.1922881007194519, "epoch": 3.61574382921948, "step": 10840 }, { "epoch": 3.61574382921948, "ref_ce_loss": 0.0704408511519432, "step": 10840 }, { "epoch": 3.6190793862575052, "loss": 0.4628, "step": 10850 }, { "epoch": 3.6190793862575052, "grad_norm": 5.111220359802246, "step": 10850 }, { "epoch": 3.6190793862575052, "learning_rate": 0.00018021653932503493, "step": 10850 }, { "epoch": 3.6190793862575052, "loss": 0.4838990569114685, "step": 10850 }, { "ce_loss": 0.13407105207443237, "epoch": 3.6190793862575052, "step": 10850 }, { "distill_loss": 0.14027553796768188, "epoch": 3.6190793862575052, "step": 10850 }, { "epoch": 3.6190793862575052, "ref_ce_loss": 0.11734570562839508, "step": 10850 }, { "epoch": 3.6190793862575052, "loss": 0.6074638366699219, "step": 10850 }, { "ce_loss": 0.17922604084014893, "epoch": 3.6190793862575052, "step": 10850 }, { "distill_loss": 0.1384764015674591, "epoch": 3.6190793862575052, "step": 10850 }, { "epoch": 3.6190793862575052, "ref_ce_loss": 0.12209158390760422, "step": 10850 }, { "epoch": 3.6190793862575052, "loss": 0.5015256404876709, "step": 10850 }, { "ce_loss": 0.14106027781963348, "epoch": 3.6190793862575052, "step": 10850 }, { "distill_loss": 0.17080964148044586, "epoch": 3.6190793862575052, "step": 10850 }, { "epoch": 3.6190793862575052, "ref_ce_loss": 0.08879175037145615, "step": 10850 }, { "epoch": 3.6190793862575052, "loss": 0.42805197834968567, "step": 10850 }, { "ce_loss": 0.15597136318683624, "epoch": 3.6190793862575052, "step": 10850 }, { "distill_loss": 0.10954298824071884, "epoch": 3.6190793862575052, "step": 10850 }, { "epoch": 3.6190793862575052, "ref_ce_loss": 0.1303330808877945, "step": 10850 }, { "epoch": 3.6224149432955306, "loss": 0.4698, "step": 10860 }, { "epoch": 3.6224149432955306, "grad_norm": 2.6651175022125244, "step": 10860 }, { "epoch": 3.6224149432955306, "learning_rate": 0.00018001810289492405, "step": 10860 }, { "epoch": 3.6224149432955306, "loss": 0.534658670425415, "step": 10860 }, { "ce_loss": 0.044203534722328186, "epoch": 3.6224149432955306, "step": 10860 }, { "distill_loss": 0.14794570207595825, "epoch": 3.6224149432955306, "step": 10860 }, { "epoch": 3.6224149432955306, "ref_ce_loss": 0.08161613345146179, "step": 10860 }, { "epoch": 3.6224149432955306, "loss": 0.7004905939102173, "step": 10860 }, { "ce_loss": 0.11076775193214417, "epoch": 3.6224149432955306, "step": 10860 }, { "distill_loss": 0.17937146127223969, "epoch": 3.6224149432955306, "step": 10860 }, { "epoch": 3.6224149432955306, "ref_ce_loss": 0.12348173558712006, "step": 10860 }, { "epoch": 3.6224149432955306, "loss": 0.460467666387558, "step": 10860 }, { "ce_loss": 0.12757205963134766, "epoch": 3.6224149432955306, "step": 10860 }, { "distill_loss": 0.13403235375881195, "epoch": 3.6224149432955306, "step": 10860 }, { "epoch": 3.6224149432955306, "ref_ce_loss": 0.15560974180698395, "step": 10860 }, { "epoch": 3.6224149432955306, "loss": 0.401950478553772, "step": 10860 }, { "ce_loss": 0.09800982475280762, "epoch": 3.6224149432955306, "step": 10860 }, { "distill_loss": 0.15696462988853455, "epoch": 3.6224149432955306, "step": 10860 }, { "epoch": 3.6224149432955306, "ref_ce_loss": 0.07699155062437057, "step": 10860 }, { "epoch": 3.625750500333556, "loss": 0.457, "step": 10870 }, { "epoch": 3.625750500333556, "grad_norm": 2.0317394733428955, "step": 10870 }, { "epoch": 3.625750500333556, "learning_rate": 0.00017981961172363462, "step": 10870 }, { "epoch": 3.625750500333556, "loss": 0.3700401782989502, "step": 10870 }, { "ce_loss": 0.10517442226409912, "epoch": 3.625750500333556, "step": 10870 }, { "distill_loss": 0.12143470346927643, "epoch": 3.625750500333556, "step": 10870 }, { "epoch": 3.625750500333556, "ref_ce_loss": 0.09167401492595673, "step": 10870 }, { "epoch": 3.625750500333556, "loss": 0.4842488169670105, "step": 10870 }, { "ce_loss": 0.1177164688706398, "epoch": 3.625750500333556, "step": 10870 }, { "distill_loss": 0.1058540940284729, "epoch": 3.625750500333556, "step": 10870 }, { "epoch": 3.625750500333556, "ref_ce_loss": 0.08583217859268188, "step": 10870 }, { "epoch": 3.625750500333556, "loss": 0.5331366658210754, "step": 10870 }, { "ce_loss": 0.13608968257904053, "epoch": 3.625750500333556, "step": 10870 }, { "distill_loss": 0.19230999052524567, "epoch": 3.625750500333556, "step": 10870 }, { "epoch": 3.625750500333556, "ref_ce_loss": 0.09139268100261688, "step": 10870 }, { "epoch": 3.625750500333556, "loss": 1.190303921699524, "step": 10870 }, { "ce_loss": 0.25734570622444153, "epoch": 3.625750500333556, "step": 10870 }, { "distill_loss": 0.16222265362739563, "epoch": 3.625750500333556, "step": 10870 }, { "epoch": 3.625750500333556, "ref_ce_loss": 0.18444815278053284, "step": 10870 }, { "epoch": 3.6290860573715813, "loss": 0.4817, "step": 10880 }, { "epoch": 3.6290860573715813, "grad_norm": 2.306195020675659, "step": 10880 }, { "epoch": 3.6290860573715813, "learning_rate": 0.00017962106617313626, "step": 10880 }, { "epoch": 3.6290860573715813, "loss": 0.6413604617118835, "step": 10880 }, { "ce_loss": 0.22188203036785126, "epoch": 3.6290860573715813, "step": 10880 }, { "distill_loss": 0.21289722621440887, "epoch": 3.6290860573715813, "step": 10880 }, { "epoch": 3.6290860573715813, "ref_ce_loss": 0.20646364986896515, "step": 10880 }, { "epoch": 3.6290860573715813, "loss": 0.36448946595191956, "step": 10880 }, { "ce_loss": 0.08776570856571198, "epoch": 3.6290860573715813, "step": 10880 }, { "distill_loss": 0.12724003195762634, "epoch": 3.6290860573715813, "step": 10880 }, { "epoch": 3.6290860573715813, "ref_ce_loss": 0.10194741189479828, "step": 10880 }, { "epoch": 3.6290860573715813, "loss": 0.472312331199646, "step": 10880 }, { "ce_loss": 0.16327980160713196, "epoch": 3.6290860573715813, "step": 10880 }, { "distill_loss": 0.16509795188903809, "epoch": 3.6290860573715813, "step": 10880 }, { "epoch": 3.6290860573715813, "ref_ce_loss": 0.10424279421567917, "step": 10880 }, { "epoch": 3.6290860573715813, "loss": 0.40606415271759033, "step": 10880 }, { "ce_loss": 0.047251492738723755, "epoch": 3.6290860573715813, "step": 10880 }, { "distill_loss": 0.1290288269519806, "epoch": 3.6290860573715813, "step": 10880 }, { "epoch": 3.6290860573715813, "ref_ce_loss": 0.09942927211523056, "step": 10880 }, { "epoch": 3.6324216144096066, "loss": 0.4377, "step": 10890 }, { "epoch": 3.6324216144096066, "grad_norm": 2.150160789489746, "step": 10890 }, { "epoch": 3.6324216144096066, "learning_rate": 0.0001794224666054978, "step": 10890 }, { "epoch": 3.6324216144096066, "loss": 0.3624217212200165, "step": 10890 }, { "ce_loss": 0.11279918998479843, "epoch": 3.6324216144096066, "step": 10890 }, { "distill_loss": 0.12640734016895294, "epoch": 3.6324216144096066, "step": 10890 }, { "epoch": 3.6324216144096066, "ref_ce_loss": 0.12293494492769241, "step": 10890 }, { "epoch": 3.6324216144096066, "loss": 0.40093767642974854, "step": 10890 }, { "ce_loss": 0.1267414391040802, "epoch": 3.6324216144096066, "step": 10890 }, { "distill_loss": 0.10430145263671875, "epoch": 3.6324216144096066, "step": 10890 }, { "epoch": 3.6324216144096066, "ref_ce_loss": 0.07403694093227386, "step": 10890 }, { "epoch": 3.6324216144096066, "loss": 0.41296109557151794, "step": 10890 }, { "ce_loss": 0.1552402824163437, "epoch": 3.6324216144096066, "step": 10890 }, { "distill_loss": 0.089487724006176, "epoch": 3.6324216144096066, "step": 10890 }, { "epoch": 3.6324216144096066, "ref_ce_loss": 0.13549669086933136, "step": 10890 }, { "epoch": 3.6324216144096066, "loss": 0.5205192565917969, "step": 10890 }, { "ce_loss": 0.1464664340019226, "epoch": 3.6324216144096066, "step": 10890 }, { "distill_loss": 0.1508297473192215, "epoch": 3.6324216144096066, "step": 10890 }, { "epoch": 3.6324216144096066, "ref_ce_loss": 0.11153878271579742, "step": 10890 }, { "epoch": 3.635757171447632, "loss": 0.4605, "step": 10900 }, { "epoch": 3.635757171447632, "grad_norm": 6.012615203857422, "step": 10900 }, { "epoch": 3.635757171447632, "learning_rate": 0.00017922381338288646, "step": 10900 }, { "epoch": 3.635757171447632, "loss": 0.835310161113739, "step": 10900 }, { "ce_loss": 0.2168867439031601, "epoch": 3.635757171447632, "step": 10900 }, { "distill_loss": 0.21579332649707794, "epoch": 3.635757171447632, "step": 10900 }, { "epoch": 3.635757171447632, "ref_ce_loss": 0.15783701837062836, "step": 10900 }, { "epoch": 3.635757171447632, "loss": 0.4475416839122772, "step": 10900 }, { "ce_loss": 0.18404029309749603, "epoch": 3.635757171447632, "step": 10900 }, { "distill_loss": 0.13007982075214386, "epoch": 3.635757171447632, "step": 10900 }, { "epoch": 3.635757171447632, "ref_ce_loss": 0.10602632164955139, "step": 10900 }, { "epoch": 3.635757171447632, "loss": 0.3571226894855499, "step": 10900 }, { "ce_loss": 0.08436594903469086, "epoch": 3.635757171447632, "step": 10900 }, { "distill_loss": 0.12900224328041077, "epoch": 3.635757171447632, "step": 10900 }, { "epoch": 3.635757171447632, "ref_ce_loss": 0.09496305882930756, "step": 10900 }, { "epoch": 3.635757171447632, "loss": 0.5006234645843506, "step": 10900 }, { "ce_loss": 0.10648602992296219, "epoch": 3.635757171447632, "step": 10900 }, { "distill_loss": 0.17396169900894165, "epoch": 3.635757171447632, "step": 10900 }, { "epoch": 3.635757171447632, "ref_ce_loss": 0.14211241900920868, "step": 10900 }, { "epoch": 3.6390927284856573, "loss": 0.5126, "step": 10910 }, { "epoch": 3.6390927284856573, "grad_norm": 2.959005355834961, "step": 10910 }, { "epoch": 3.6390927284856573, "learning_rate": 0.00017902510686756737, "step": 10910 }, { "epoch": 3.6390927284856573, "loss": 0.6385618448257446, "step": 10910 }, { "ce_loss": 0.21390993893146515, "epoch": 3.6390927284856573, "step": 10910 }, { "distill_loss": 0.2231408804655075, "epoch": 3.6390927284856573, "step": 10910 }, { "epoch": 3.6390927284856573, "ref_ce_loss": 0.12439989298582077, "step": 10910 }, { "epoch": 3.6390927284856573, "loss": 0.21764978766441345, "step": 10910 }, { "ce_loss": 0.04720484837889671, "epoch": 3.6390927284856573, "step": 10910 }, { "distill_loss": 0.09097013622522354, "epoch": 3.6390927284856573, "step": 10910 }, { "epoch": 3.6390927284856573, "ref_ce_loss": 0.058279022574424744, "step": 10910 }, { "epoch": 3.6390927284856573, "loss": 0.4075714349746704, "step": 10910 }, { "ce_loss": 0.10580960661172867, "epoch": 3.6390927284856573, "step": 10910 }, { "distill_loss": 0.2019980251789093, "epoch": 3.6390927284856573, "step": 10910 }, { "epoch": 3.6390927284856573, "ref_ce_loss": 0.09952415525913239, "step": 10910 }, { "epoch": 3.6390927284856573, "loss": 0.4693651497364044, "step": 10910 }, { "ce_loss": 0.1692083179950714, "epoch": 3.6390927284856573, "step": 10910 }, { "distill_loss": 0.1516937017440796, "epoch": 3.6390927284856573, "step": 10910 }, { "epoch": 3.6390927284856573, "ref_ce_loss": 0.08030670136213303, "step": 10910 }, { "epoch": 3.6424282855236827, "loss": 0.5253, "step": 10920 }, { "epoch": 3.6424282855236827, "grad_norm": 2.7803750038146973, "step": 10920 }, { "epoch": 3.6424282855236827, "learning_rate": 0.00017882634742190278, "step": 10920 }, { "epoch": 3.6424282855236827, "loss": 0.6413553953170776, "step": 10920 }, { "ce_loss": 0.13203197717666626, "epoch": 3.6424282855236827, "step": 10920 }, { "distill_loss": 0.1417599618434906, "epoch": 3.6424282855236827, "step": 10920 }, { "epoch": 3.6424282855236827, "ref_ce_loss": 0.09080624580383301, "step": 10920 }, { "epoch": 3.6424282855236827, "loss": 0.4760216474533081, "step": 10920 }, { "ce_loss": 0.09282150864601135, "epoch": 3.6424282855236827, "step": 10920 }, { "distill_loss": 0.14410977065563202, "epoch": 3.6424282855236827, "step": 10920 }, { "epoch": 3.6424282855236827, "ref_ce_loss": 0.07130226492881775, "step": 10920 }, { "epoch": 3.6424282855236827, "loss": 0.38041651248931885, "step": 10920 }, { "ce_loss": 0.04239816591143608, "epoch": 3.6424282855236827, "step": 10920 }, { "distill_loss": 0.15716636180877686, "epoch": 3.6424282855236827, "step": 10920 }, { "epoch": 3.6424282855236827, "ref_ce_loss": 0.06270455569028854, "step": 10920 }, { "epoch": 3.6424282855236827, "loss": 0.5958917140960693, "step": 10920 }, { "ce_loss": 0.12721644341945648, "epoch": 3.6424282855236827, "step": 10920 }, { "distill_loss": 0.19002792239189148, "epoch": 3.6424282855236827, "step": 10920 }, { "epoch": 3.6424282855236827, "ref_ce_loss": 0.13948236405849457, "step": 10920 }, { "epoch": 3.645763842561708, "loss": 0.4639, "step": 10930 }, { "epoch": 3.645763842561708, "grad_norm": 2.47678804397583, "step": 10930 }, { "epoch": 3.645763842561708, "learning_rate": 0.0001786275354083516, "step": 10930 }, { "epoch": 3.645763842561708, "loss": 0.567202627658844, "step": 10930 }, { "ce_loss": 0.11808883398771286, "epoch": 3.645763842561708, "step": 10930 }, { "distill_loss": 0.15721075236797333, "epoch": 3.645763842561708, "step": 10930 }, { "epoch": 3.645763842561708, "ref_ce_loss": 0.06056002900004387, "step": 10930 }, { "epoch": 3.645763842561708, "loss": 0.49473297595977783, "step": 10930 }, { "ce_loss": 0.15821705758571625, "epoch": 3.645763842561708, "step": 10930 }, { "distill_loss": 0.1436956524848938, "epoch": 3.645763842561708, "step": 10930 }, { "epoch": 3.645763842561708, "ref_ce_loss": 0.10956276953220367, "step": 10930 }, { "epoch": 3.645763842561708, "loss": 0.4342118203639984, "step": 10930 }, { "ce_loss": 0.14422941207885742, "epoch": 3.645763842561708, "step": 10930 }, { "distill_loss": 0.17922310531139374, "epoch": 3.645763842561708, "step": 10930 }, { "epoch": 3.645763842561708, "ref_ce_loss": 0.08042961359024048, "step": 10930 }, { "epoch": 3.645763842561708, "loss": 0.49232393503189087, "step": 10930 }, { "ce_loss": 0.14703483879566193, "epoch": 3.645763842561708, "step": 10930 }, { "distill_loss": 0.15014778077602386, "epoch": 3.645763842561708, "step": 10930 }, { "epoch": 3.645763842561708, "ref_ce_loss": 0.10249628871679306, "step": 10930 }, { "epoch": 3.6490993995997334, "loss": 0.4804, "step": 10940 }, { "epoch": 3.6490993995997334, "grad_norm": 3.3948302268981934, "step": 10940 }, { "epoch": 3.6490993995997334, "learning_rate": 0.0001784286711894685, "step": 10940 }, { "epoch": 3.6490993995997334, "loss": 0.5856025218963623, "step": 10940 }, { "ce_loss": 0.15770310163497925, "epoch": 3.6490993995997334, "step": 10940 }, { "distill_loss": 0.16146592795848846, "epoch": 3.6490993995997334, "step": 10940 }, { "epoch": 3.6490993995997334, "ref_ce_loss": 0.13260933756828308, "step": 10940 }, { "epoch": 3.6490993995997334, "loss": 0.3507378399372101, "step": 10940 }, { "ce_loss": 0.07525129616260529, "epoch": 3.6490993995997334, "step": 10940 }, { "distill_loss": 0.1550418585538864, "epoch": 3.6490993995997334, "step": 10940 }, { "epoch": 3.6490993995997334, "ref_ce_loss": 0.09258686006069183, "step": 10940 }, { "epoch": 3.6490993995997334, "loss": 0.2977530360221863, "step": 10940 }, { "ce_loss": 0.06264433264732361, "epoch": 3.6490993995997334, "step": 10940 }, { "distill_loss": 0.1284417361021042, "epoch": 3.6490993995997334, "step": 10940 }, { "epoch": 3.6490993995997334, "ref_ce_loss": 0.06895040720701218, "step": 10940 }, { "epoch": 3.6490993995997334, "loss": 0.28779518604278564, "step": 10940 }, { "ce_loss": 0.05994941294193268, "epoch": 3.6490993995997334, "step": 10940 }, { "distill_loss": 0.1388159692287445, "epoch": 3.6490993995997334, "step": 10940 }, { "epoch": 3.6490993995997334, "ref_ce_loss": 0.060144439339637756, "step": 10940 }, { "epoch": 3.6524349566377587, "loss": 0.5031, "step": 10950 }, { "epoch": 3.6524349566377587, "grad_norm": 5.514958381652832, "step": 10950 }, { "epoch": 3.6524349566377587, "learning_rate": 0.0001782297551279033, "step": 10950 }, { "epoch": 3.6524349566377587, "loss": 0.49093204736709595, "step": 10950 }, { "ce_loss": 0.15218622982501984, "epoch": 3.6524349566377587, "step": 10950 }, { "distill_loss": 0.11811237782239914, "epoch": 3.6524349566377587, "step": 10950 }, { "epoch": 3.6524349566377587, "ref_ce_loss": 0.0893976241350174, "step": 10950 }, { "epoch": 3.6524349566377587, "loss": 0.3062590956687927, "step": 10950 }, { "ce_loss": 0.04653836041688919, "epoch": 3.6524349566377587, "step": 10950 }, { "distill_loss": 0.10441438853740692, "epoch": 3.6524349566377587, "step": 10950 }, { "epoch": 3.6524349566377587, "ref_ce_loss": 0.08877456188201904, "step": 10950 }, { "epoch": 3.6524349566377587, "loss": 0.6125166416168213, "step": 10950 }, { "ce_loss": 0.117793507874012, "epoch": 3.6524349566377587, "step": 10950 }, { "distill_loss": 0.1392459124326706, "epoch": 3.6524349566377587, "step": 10950 }, { "epoch": 3.6524349566377587, "ref_ce_loss": 0.12085837125778198, "step": 10950 }, { "epoch": 3.6524349566377587, "loss": 0.5856779217720032, "step": 10950 }, { "ce_loss": 0.17838098108768463, "epoch": 3.6524349566377587, "step": 10950 }, { "distill_loss": 0.13888207077980042, "epoch": 3.6524349566377587, "step": 10950 }, { "epoch": 3.6524349566377587, "ref_ce_loss": 0.09558483958244324, "step": 10950 }, { "epoch": 3.655770513675784, "loss": 0.4924, "step": 10960 }, { "epoch": 3.655770513675784, "grad_norm": 3.244112730026245, "step": 10960 }, { "epoch": 3.655770513675784, "learning_rate": 0.00017803078758640053, "step": 10960 }, { "epoch": 3.655770513675784, "loss": 0.37806224822998047, "step": 10960 }, { "ce_loss": 0.1090494692325592, "epoch": 3.655770513675784, "step": 10960 }, { "distill_loss": 0.13142424821853638, "epoch": 3.655770513675784, "step": 10960 }, { "epoch": 3.655770513675784, "ref_ce_loss": 0.09739561378955841, "step": 10960 }, { "epoch": 3.655770513675784, "loss": 0.594853937625885, "step": 10960 }, { "ce_loss": 0.09211955219507217, "epoch": 3.655770513675784, "step": 10960 }, { "distill_loss": 0.12220696359872818, "epoch": 3.655770513675784, "step": 10960 }, { "epoch": 3.655770513675784, "ref_ce_loss": 0.0728682428598404, "step": 10960 }, { "epoch": 3.655770513675784, "loss": 0.22912947833538055, "step": 10960 }, { "ce_loss": 0.05628032609820366, "epoch": 3.655770513675784, "step": 10960 }, { "distill_loss": 0.11781372129917145, "epoch": 3.655770513675784, "step": 10960 }, { "epoch": 3.655770513675784, "ref_ce_loss": 0.054764240980148315, "step": 10960 }, { "epoch": 3.655770513675784, "loss": 0.2798725962638855, "step": 10960 }, { "ce_loss": 0.09618540853261948, "epoch": 3.655770513675784, "step": 10960 }, { "distill_loss": 0.11366327106952667, "epoch": 3.655770513675784, "step": 10960 }, { "epoch": 3.655770513675784, "ref_ce_loss": 0.05259474366903305, "step": 10960 }, { "epoch": 3.6591060707138094, "loss": 0.4712, "step": 10970 }, { "epoch": 3.6591060707138094, "grad_norm": 3.9033915996551514, "step": 10970 }, { "epoch": 3.6591060707138094, "learning_rate": 0.00017783176892779834, "step": 10970 }, { "epoch": 3.6591060707138094, "loss": 0.506156861782074, "step": 10970 }, { "ce_loss": 0.09005085378885269, "epoch": 3.6591060707138094, "step": 10970 }, { "distill_loss": 0.10344702750444412, "epoch": 3.6591060707138094, "step": 10970 }, { "epoch": 3.6591060707138094, "ref_ce_loss": 0.09592948108911514, "step": 10970 }, { "epoch": 3.6591060707138094, "loss": 0.4516763985157013, "step": 10970 }, { "ce_loss": 0.1697418987751007, "epoch": 3.6591060707138094, "step": 10970 }, { "distill_loss": 0.15038640797138214, "epoch": 3.6591060707138094, "step": 10970 }, { "epoch": 3.6591060707138094, "ref_ce_loss": 0.13058653473854065, "step": 10970 }, { "epoch": 3.6591060707138094, "loss": 0.3005906939506531, "step": 10970 }, { "ce_loss": 0.07280378043651581, "epoch": 3.6591060707138094, "step": 10970 }, { "distill_loss": 0.13411296904087067, "epoch": 3.6591060707138094, "step": 10970 }, { "epoch": 3.6591060707138094, "ref_ce_loss": 0.09324061870574951, "step": 10970 }, { "epoch": 3.6591060707138094, "loss": 0.4326677918434143, "step": 10970 }, { "ce_loss": 0.13207785785198212, "epoch": 3.6591060707138094, "step": 10970 }, { "distill_loss": 0.1927163451910019, "epoch": 3.6591060707138094, "step": 10970 }, { "epoch": 3.6591060707138094, "ref_ce_loss": 0.10759655386209488, "step": 10970 }, { "epoch": 3.662441627751835, "loss": 0.556, "step": 10980 }, { "epoch": 3.662441627751835, "grad_norm": 2.6602776050567627, "step": 10980 }, { "epoch": 3.662441627751835, "learning_rate": 0.00017763269951502844, "step": 10980 }, { "epoch": 3.662441627751835, "loss": 0.4683450162410736, "step": 10980 }, { "ce_loss": 0.14464636147022247, "epoch": 3.662441627751835, "step": 10980 }, { "distill_loss": 0.16256146132946014, "epoch": 3.662441627751835, "step": 10980 }, { "epoch": 3.662441627751835, "ref_ce_loss": 0.1233372688293457, "step": 10980 }, { "epoch": 3.662441627751835, "loss": 0.4439935088157654, "step": 10980 }, { "ce_loss": 0.15323685109615326, "epoch": 3.662441627751835, "step": 10980 }, { "distill_loss": 0.16922098398208618, "epoch": 3.662441627751835, "step": 10980 }, { "epoch": 3.662441627751835, "ref_ce_loss": 0.09869551658630371, "step": 10980 }, { "epoch": 3.662441627751835, "loss": 0.4825940728187561, "step": 10980 }, { "ce_loss": 0.14853030443191528, "epoch": 3.662441627751835, "step": 10980 }, { "distill_loss": 0.2026570588350296, "epoch": 3.662441627751835, "step": 10980 }, { "epoch": 3.662441627751835, "ref_ce_loss": 0.08260758221149445, "step": 10980 }, { "epoch": 3.662441627751835, "loss": 0.3823135793209076, "step": 10980 }, { "ce_loss": 0.09527653455734253, "epoch": 3.662441627751835, "step": 10980 }, { "distill_loss": 0.1679757833480835, "epoch": 3.662441627751835, "step": 10980 }, { "epoch": 3.662441627751835, "ref_ce_loss": 0.0711362436413765, "step": 10980 }, { "epoch": 3.66577718478986, "loss": 0.5047, "step": 10990 }, { "epoch": 3.66577718478986, "grad_norm": 4.2559494972229, "step": 10990 }, { "epoch": 3.66577718478986, "learning_rate": 0.00017743357971111487, "step": 10990 }, { "epoch": 3.66577718478986, "loss": 0.34644657373428345, "step": 10990 }, { "ce_loss": 0.12960556149482727, "epoch": 3.66577718478986, "step": 10990 }, { "distill_loss": 0.12763711810112, "epoch": 3.66577718478986, "step": 10990 }, { "epoch": 3.66577718478986, "ref_ce_loss": 0.08901886641979218, "step": 10990 }, { "epoch": 3.66577718478986, "loss": 0.41295936703681946, "step": 10990 }, { "ce_loss": 0.10422497987747192, "epoch": 3.66577718478986, "step": 10990 }, { "distill_loss": 0.10833515971899033, "epoch": 3.66577718478986, "step": 10990 }, { "epoch": 3.66577718478986, "ref_ce_loss": 0.12332861870527267, "step": 10990 }, { "epoch": 3.66577718478986, "loss": 0.6116182804107666, "step": 10990 }, { "ce_loss": 0.13335031270980835, "epoch": 3.66577718478986, "step": 10990 }, { "distill_loss": 0.20115993916988373, "epoch": 3.66577718478986, "step": 10990 }, { "epoch": 3.66577718478986, "ref_ce_loss": 0.11726392805576324, "step": 10990 }, { "epoch": 3.66577718478986, "loss": 1.1295512914657593, "step": 10990 }, { "ce_loss": 0.18608838319778442, "epoch": 3.66577718478986, "step": 10990 }, { "distill_loss": 0.2441026270389557, "epoch": 3.66577718478986, "step": 10990 }, { "epoch": 3.66577718478986, "ref_ce_loss": 0.11386443674564362, "step": 10990 }, { "epoch": 3.6691127418278855, "loss": 0.4915, "step": 11000 }, { "epoch": 3.6691127418278855, "grad_norm": 2.3816754817962646, "step": 11000 }, { "epoch": 3.6691127418278855, "learning_rate": 0.00017723440987917353, "step": 11000 }, { "epoch": 3.6691127418278855, "loss": 0.4174056053161621, "step": 11000 }, { "ce_loss": 0.12153197824954987, "epoch": 3.6691127418278855, "step": 11000 }, { "distill_loss": 0.12821528315544128, "epoch": 3.6691127418278855, "step": 11000 }, { "epoch": 3.6691127418278855, "ref_ce_loss": 0.08482295274734497, "step": 11000 }, { "epoch": 3.6691127418278855, "loss": 0.45367807149887085, "step": 11000 }, { "ce_loss": 0.1108308881521225, "epoch": 3.6691127418278855, "step": 11000 }, { "distill_loss": 0.14451178908348083, "epoch": 3.6691127418278855, "step": 11000 }, { "epoch": 3.6691127418278855, "ref_ce_loss": 0.1315654218196869, "step": 11000 }, { "epoch": 3.6691127418278855, "loss": 0.4495985507965088, "step": 11000 }, { "ce_loss": 0.09988871216773987, "epoch": 3.6691127418278855, "step": 11000 }, { "distill_loss": 0.13498930633068085, "epoch": 3.6691127418278855, "step": 11000 }, { "epoch": 3.6691127418278855, "ref_ce_loss": 0.09648939967155457, "step": 11000 }, { "epoch": 3.6691127418278855, "loss": 0.48693129420280457, "step": 11000 }, { "ce_loss": 0.19086134433746338, "epoch": 3.6691127418278855, "step": 11000 }, { "distill_loss": 0.1703328639268875, "epoch": 3.6691127418278855, "step": 11000 }, { "epoch": 3.6691127418278855, "ref_ce_loss": 0.12523435056209564, "step": 11000 }, { "epoch": 3.672448298865911, "loss": 0.5174, "step": 11010 }, { "epoch": 3.672448298865911, "grad_norm": 2.777346611022949, "step": 11010 }, { "epoch": 3.672448298865911, "learning_rate": 0.0001770351903824116, "step": 11010 }, { "epoch": 3.672448298865911, "loss": 0.39073672890663147, "step": 11010 }, { "ce_loss": 0.1334664672613144, "epoch": 3.672448298865911, "step": 11010 }, { "distill_loss": 0.14954569935798645, "epoch": 3.672448298865911, "step": 11010 }, { "epoch": 3.672448298865911, "ref_ce_loss": 0.10750074684619904, "step": 11010 }, { "epoch": 3.672448298865911, "loss": 0.4113316833972931, "step": 11010 }, { "ce_loss": 0.06988398730754852, "epoch": 3.672448298865911, "step": 11010 }, { "distill_loss": 0.16288743913173676, "epoch": 3.672448298865911, "step": 11010 }, { "epoch": 3.672448298865911, "ref_ce_loss": 0.08769288659095764, "step": 11010 }, { "epoch": 3.672448298865911, "loss": 0.6188124418258667, "step": 11010 }, { "ce_loss": 0.1457642912864685, "epoch": 3.672448298865911, "step": 11010 }, { "distill_loss": 0.15776768326759338, "epoch": 3.672448298865911, "step": 11010 }, { "epoch": 3.672448298865911, "ref_ce_loss": 0.11883719265460968, "step": 11010 }, { "epoch": 3.672448298865911, "loss": 0.531037449836731, "step": 11010 }, { "ce_loss": 0.14301258325576782, "epoch": 3.672448298865911, "step": 11010 }, { "distill_loss": 0.1943218857049942, "epoch": 3.672448298865911, "step": 11010 }, { "epoch": 3.672448298865911, "ref_ce_loss": 0.08588794618844986, "step": 11010 }, { "epoch": 3.675783855903936, "loss": 0.4489, "step": 11020 }, { "epoch": 3.675783855903936, "grad_norm": 2.049055814743042, "step": 11020 }, { "epoch": 3.675783855903936, "learning_rate": 0.00017683592158412704, "step": 11020 }, { "epoch": 3.675783855903936, "loss": 0.5385304689407349, "step": 11020 }, { "ce_loss": 0.08100423961877823, "epoch": 3.675783855903936, "step": 11020 }, { "distill_loss": 0.1679883748292923, "epoch": 3.675783855903936, "step": 11020 }, { "epoch": 3.675783855903936, "ref_ce_loss": 0.10024969279766083, "step": 11020 }, { "epoch": 3.675783855903936, "loss": 0.7263086438179016, "step": 11020 }, { "ce_loss": 0.09569170325994492, "epoch": 3.675783855903936, "step": 11020 }, { "distill_loss": 0.18969206511974335, "epoch": 3.675783855903936, "step": 11020 }, { "epoch": 3.675783855903936, "ref_ce_loss": 0.11134276539087296, "step": 11020 }, { "epoch": 3.675783855903936, "loss": 0.8118753433227539, "step": 11020 }, { "ce_loss": 0.200321227312088, "epoch": 3.675783855903936, "step": 11020 }, { "distill_loss": 0.20911133289337158, "epoch": 3.675783855903936, "step": 11020 }, { "epoch": 3.675783855903936, "ref_ce_loss": 0.11153283715248108, "step": 11020 }, { "epoch": 3.675783855903936, "loss": 0.4810273051261902, "step": 11020 }, { "ce_loss": 0.08565720915794373, "epoch": 3.675783855903936, "step": 11020 }, { "distill_loss": 0.13249194622039795, "epoch": 3.675783855903936, "step": 11020 }, { "epoch": 3.675783855903936, "ref_ce_loss": 0.11665981262922287, "step": 11020 }, { "epoch": 3.6791194129419615, "loss": 0.5263, "step": 11030 }, { "epoch": 3.6791194129419615, "grad_norm": 12.643746376037598, "step": 11030 }, { "epoch": 3.6791194129419615, "learning_rate": 0.00017663660384770739, "step": 11030 }, { "epoch": 3.6791194129419615, "loss": 0.8190609216690063, "step": 11030 }, { "ce_loss": 0.1600896418094635, "epoch": 3.6791194129419615, "step": 11030 }, { "distill_loss": 0.19733691215515137, "epoch": 3.6791194129419615, "step": 11030 }, { "epoch": 3.6791194129419615, "ref_ce_loss": 0.14073491096496582, "step": 11030 }, { "epoch": 3.6791194129419615, "loss": 1.0265297889709473, "step": 11030 }, { "ce_loss": 0.22034034132957458, "epoch": 3.6791194129419615, "step": 11030 }, { "distill_loss": 0.19493655860424042, "epoch": 3.6791194129419615, "step": 11030 }, { "epoch": 3.6791194129419615, "ref_ce_loss": 0.18260297179222107, "step": 11030 }, { "epoch": 3.6791194129419615, "loss": 0.36415034532546997, "step": 11030 }, { "ce_loss": 0.07882717996835709, "epoch": 3.6791194129419615, "step": 11030 }, { "distill_loss": 0.17585335671901703, "epoch": 3.6791194129419615, "step": 11030 }, { "epoch": 3.6791194129419615, "ref_ce_loss": 0.06001151353120804, "step": 11030 }, { "epoch": 3.6791194129419615, "loss": 0.47990405559539795, "step": 11030 }, { "ce_loss": 0.1564231514930725, "epoch": 3.6791194129419615, "step": 11030 }, { "distill_loss": 0.173221617937088, "epoch": 3.6791194129419615, "step": 11030 }, { "epoch": 3.6791194129419615, "ref_ce_loss": 0.07495393604040146, "step": 11030 }, { "epoch": 3.682454969979987, "loss": 0.5324, "step": 11040 }, { "epoch": 3.682454969979987, "grad_norm": 3.118147134780884, "step": 11040 }, { "epoch": 3.682454969979987, "learning_rate": 0.00017643723753662954, "step": 11040 }, { "epoch": 3.682454969979987, "loss": 0.5376815795898438, "step": 11040 }, { "ce_loss": 0.16959786415100098, "epoch": 3.682454969979987, "step": 11040 }, { "distill_loss": 0.16002260148525238, "epoch": 3.682454969979987, "step": 11040 }, { "epoch": 3.682454969979987, "ref_ce_loss": 0.1409972459077835, "step": 11040 }, { "epoch": 3.682454969979987, "loss": 0.5773664116859436, "step": 11040 }, { "ce_loss": 0.1668001413345337, "epoch": 3.682454969979987, "step": 11040 }, { "distill_loss": 0.15590955317020416, "epoch": 3.682454969979987, "step": 11040 }, { "epoch": 3.682454969979987, "ref_ce_loss": 0.11919999867677689, "step": 11040 }, { "epoch": 3.682454969979987, "loss": 0.45798033475875854, "step": 11040 }, { "ce_loss": 0.13292302191257477, "epoch": 3.682454969979987, "step": 11040 }, { "distill_loss": 0.16225001215934753, "epoch": 3.682454969979987, "step": 11040 }, { "epoch": 3.682454969979987, "ref_ce_loss": 0.08494387567043304, "step": 11040 }, { "epoch": 3.682454969979987, "loss": 0.5558249950408936, "step": 11040 }, { "ce_loss": 0.12711194157600403, "epoch": 3.682454969979987, "step": 11040 }, { "distill_loss": 0.21948517858982086, "epoch": 3.682454969979987, "step": 11040 }, { "epoch": 3.682454969979987, "ref_ce_loss": 0.0999494194984436, "step": 11040 }, { "epoch": 3.6857905270180122, "loss": 0.5009, "step": 11050 }, { "epoch": 3.6857905270180122, "grad_norm": 4.109448432922363, "step": 11050 }, { "epoch": 3.6857905270180122, "learning_rate": 0.00017623782301445917, "step": 11050 }, { "epoch": 3.6857905270180122, "loss": 0.43522462248802185, "step": 11050 }, { "ce_loss": 0.11968658119440079, "epoch": 3.6857905270180122, "step": 11050 }, { "distill_loss": 0.13889183104038239, "epoch": 3.6857905270180122, "step": 11050 }, { "epoch": 3.6857905270180122, "ref_ce_loss": 0.08983059227466583, "step": 11050 }, { "epoch": 3.6857905270180122, "loss": 0.6420830488204956, "step": 11050 }, { "ce_loss": 0.15069016814231873, "epoch": 3.6857905270180122, "step": 11050 }, { "distill_loss": 0.21941916644573212, "epoch": 3.6857905270180122, "step": 11050 }, { "epoch": 3.6857905270180122, "ref_ce_loss": 0.1224115639925003, "step": 11050 }, { "epoch": 3.6857905270180122, "loss": 0.3963969647884369, "step": 11050 }, { "ce_loss": 0.09678657352924347, "epoch": 3.6857905270180122, "step": 11050 }, { "distill_loss": 0.09813752770423889, "epoch": 3.6857905270180122, "step": 11050 }, { "epoch": 3.6857905270180122, "ref_ce_loss": 0.08016207069158554, "step": 11050 }, { "epoch": 3.6857905270180122, "loss": 0.21546831727027893, "step": 11050 }, { "ce_loss": 0.027873264625668526, "epoch": 3.6857905270180122, "step": 11050 }, { "distill_loss": 0.10052196681499481, "epoch": 3.6857905270180122, "step": 11050 }, { "epoch": 3.6857905270180122, "ref_ce_loss": 0.05747190862894058, "step": 11050 }, { "epoch": 3.6891260840560376, "loss": 0.4898, "step": 11060 }, { "epoch": 3.6891260840560376, "grad_norm": 2.2396340370178223, "step": 11060 }, { "epoch": 3.6891260840560376, "learning_rate": 0.00017603836064484949, "step": 11060 }, { "epoch": 3.6891260840560376, "loss": 0.16668157279491425, "step": 11060 }, { "ce_loss": 0.011731747537851334, "epoch": 3.6891260840560376, "step": 11060 }, { "distill_loss": 0.09391668438911438, "epoch": 3.6891260840560376, "step": 11060 }, { "epoch": 3.6891260840560376, "ref_ce_loss": 0.06087285652756691, "step": 11060 }, { "epoch": 3.6891260840560376, "loss": 0.36854448914527893, "step": 11060 }, { "ce_loss": 0.108416847884655, "epoch": 3.6891260840560376, "step": 11060 }, { "distill_loss": 0.13391919434070587, "epoch": 3.6891260840560376, "step": 11060 }, { "epoch": 3.6891260840560376, "ref_ce_loss": 0.08919122070074081, "step": 11060 }, { "epoch": 3.6891260840560376, "loss": 0.28041553497314453, "step": 11060 }, { "ce_loss": 0.05195368081331253, "epoch": 3.6891260840560376, "step": 11060 }, { "distill_loss": 0.07924183458089828, "epoch": 3.6891260840560376, "step": 11060 }, { "epoch": 3.6891260840560376, "ref_ce_loss": 0.09253819286823273, "step": 11060 }, { "epoch": 3.6891260840560376, "loss": 0.3154188096523285, "step": 11060 }, { "ce_loss": 0.10825911909341812, "epoch": 3.6891260840560376, "step": 11060 }, { "distill_loss": 0.11209943145513535, "epoch": 3.6891260840560376, "step": 11060 }, { "epoch": 3.6891260840560376, "ref_ce_loss": 0.09472549706697464, "step": 11060 }, { "epoch": 3.692461641094063, "loss": 0.5081, "step": 11070 }, { "epoch": 3.692461641094063, "grad_norm": 4.824976921081543, "step": 11070 }, { "epoch": 3.692461641094063, "learning_rate": 0.0001758388507915413, "step": 11070 }, { "epoch": 3.692461641094063, "loss": 0.5516034364700317, "step": 11070 }, { "ce_loss": 0.21088926494121552, "epoch": 3.692461641094063, "step": 11070 }, { "distill_loss": 0.1855248510837555, "epoch": 3.692461641094063, "step": 11070 }, { "epoch": 3.692461641094063, "ref_ce_loss": 0.11560109257698059, "step": 11070 }, { "epoch": 3.692461641094063, "loss": 0.32801610231399536, "step": 11070 }, { "ce_loss": 0.04863592982292175, "epoch": 3.692461641094063, "step": 11070 }, { "distill_loss": 0.14596258103847504, "epoch": 3.692461641094063, "step": 11070 }, { "epoch": 3.692461641094063, "ref_ce_loss": 0.06314782798290253, "step": 11070 }, { "epoch": 3.692461641094063, "loss": 0.583217203617096, "step": 11070 }, { "ce_loss": 0.15474650263786316, "epoch": 3.692461641094063, "step": 11070 }, { "distill_loss": 0.19245094060897827, "epoch": 3.692461641094063, "step": 11070 }, { "epoch": 3.692461641094063, "ref_ce_loss": 0.09346684068441391, "step": 11070 }, { "epoch": 3.692461641094063, "loss": 0.4624575972557068, "step": 11070 }, { "ce_loss": 0.1203080266714096, "epoch": 3.692461641094063, "step": 11070 }, { "distill_loss": 0.12611785531044006, "epoch": 3.692461641094063, "step": 11070 }, { "epoch": 3.692461641094063, "ref_ce_loss": 0.13801667094230652, "step": 11070 }, { "epoch": 3.6957971981320883, "loss": 0.4731, "step": 11080 }, { "epoch": 3.6957971981320883, "grad_norm": 3.346745014190674, "step": 11080 }, { "epoch": 3.6957971981320883, "learning_rate": 0.00017563929381836192, "step": 11080 }, { "epoch": 3.6957971981320883, "loss": 0.3251184821128845, "step": 11080 }, { "ce_loss": 0.1268000304698944, "epoch": 3.6957971981320883, "step": 11080 }, { "distill_loss": 0.10941871255636215, "epoch": 3.6957971981320883, "step": 11080 }, { "epoch": 3.6957971981320883, "ref_ce_loss": 0.0887828916311264, "step": 11080 }, { "epoch": 3.6957971981320883, "loss": 0.41327232122421265, "step": 11080 }, { "ce_loss": 0.1111992746591568, "epoch": 3.6957971981320883, "step": 11080 }, { "distill_loss": 0.11428511887788773, "epoch": 3.6957971981320883, "step": 11080 }, { "epoch": 3.6957971981320883, "ref_ce_loss": 0.08257275819778442, "step": 11080 }, { "epoch": 3.6957971981320883, "loss": 0.5031070113182068, "step": 11080 }, { "ce_loss": 0.16362826526165009, "epoch": 3.6957971981320883, "step": 11080 }, { "distill_loss": 0.1515190601348877, "epoch": 3.6957971981320883, "step": 11080 }, { "epoch": 3.6957971981320883, "ref_ce_loss": 0.15207554399967194, "step": 11080 }, { "epoch": 3.6957971981320883, "loss": 0.7305903434753418, "step": 11080 }, { "ce_loss": 0.13129164278507233, "epoch": 3.6957971981320883, "step": 11080 }, { "distill_loss": 0.181007519364357, "epoch": 3.6957971981320883, "step": 11080 }, { "epoch": 3.6957971981320883, "ref_ce_loss": 0.08313669264316559, "step": 11080 }, { "epoch": 3.6991327551701136, "loss": 0.5314, "step": 11090 }, { "epoch": 3.6991327551701136, "grad_norm": 4.584631443023682, "step": 11090 }, { "epoch": 3.6991327551701136, "learning_rate": 0.00017543969008922448, "step": 11090 }, { "epoch": 3.6991327551701136, "loss": 0.6552540063858032, "step": 11090 }, { "ce_loss": 0.1395750343799591, "epoch": 3.6991327551701136, "step": 11090 }, { "distill_loss": 0.22694694995880127, "epoch": 3.6991327551701136, "step": 11090 }, { "epoch": 3.6991327551701136, "ref_ce_loss": 0.12240070104598999, "step": 11090 }, { "epoch": 3.6991327551701136, "loss": 0.37868496775627136, "step": 11090 }, { "ce_loss": 0.13656875491142273, "epoch": 3.6991327551701136, "step": 11090 }, { "distill_loss": 0.1315259039402008, "epoch": 3.6991327551701136, "step": 11090 }, { "epoch": 3.6991327551701136, "ref_ce_loss": 0.0780695304274559, "step": 11090 }, { "epoch": 3.6991327551701136, "loss": 0.7789294719696045, "step": 11090 }, { "ce_loss": 0.15391351282596588, "epoch": 3.6991327551701136, "step": 11090 }, { "distill_loss": 0.2258004993200302, "epoch": 3.6991327551701136, "step": 11090 }, { "epoch": 3.6991327551701136, "ref_ce_loss": 0.1496977061033249, "step": 11090 }, { "epoch": 3.6991327551701136, "loss": 0.5458284020423889, "step": 11090 }, { "ce_loss": 0.08628889918327332, "epoch": 3.6991327551701136, "step": 11090 }, { "distill_loss": 0.11481862515211105, "epoch": 3.6991327551701136, "step": 11090 }, { "epoch": 3.6991327551701136, "ref_ce_loss": 0.08423425257205963, "step": 11090 }, { "epoch": 3.702468312208139, "loss": 0.4868, "step": 11100 }, { "epoch": 3.702468312208139, "grad_norm": 2.533743381500244, "step": 11100 }, { "epoch": 3.702468312208139, "learning_rate": 0.00017524003996812742, "step": 11100 }, { "epoch": 3.702468312208139, "loss": 0.7007694840431213, "step": 11100 }, { "ce_loss": 0.29086416959762573, "epoch": 3.702468312208139, "step": 11100 }, { "distill_loss": 0.20586749911308289, "epoch": 3.702468312208139, "step": 11100 }, { "epoch": 3.702468312208139, "ref_ce_loss": 0.0947490781545639, "step": 11100 }, { "epoch": 3.702468312208139, "loss": 0.8538395166397095, "step": 11100 }, { "ce_loss": 0.1478804051876068, "epoch": 3.702468312208139, "step": 11100 }, { "distill_loss": 0.14652153849601746, "epoch": 3.702468312208139, "step": 11100 }, { "epoch": 3.702468312208139, "ref_ce_loss": 0.11311804503202438, "step": 11100 }, { "epoch": 3.702468312208139, "loss": 0.3475764989852905, "step": 11100 }, { "ce_loss": 0.054964080452919006, "epoch": 3.702468312208139, "step": 11100 }, { "distill_loss": 0.1031276285648346, "epoch": 3.702468312208139, "step": 11100 }, { "epoch": 3.702468312208139, "ref_ce_loss": 0.0805249959230423, "step": 11100 }, { "epoch": 3.702468312208139, "loss": 0.28178277611732483, "step": 11100 }, { "ce_loss": 0.040550194680690765, "epoch": 3.702468312208139, "step": 11100 }, { "distill_loss": 0.12443134188652039, "epoch": 3.702468312208139, "step": 11100 }, { "epoch": 3.702468312208139, "ref_ce_loss": 0.08089882880449295, "step": 11100 }, { "epoch": 3.7058038692461643, "loss": 0.4953, "step": 11110 }, { "epoch": 3.7058038692461643, "grad_norm": 4.474017143249512, "step": 11110 }, { "epoch": 3.7058038692461643, "learning_rate": 0.00017504034381915387, "step": 11110 }, { "epoch": 3.7058038692461643, "loss": 0.2509787678718567, "step": 11110 }, { "ce_loss": 0.06018523499369621, "epoch": 3.7058038692461643, "step": 11110 }, { "distill_loss": 0.08989400416612625, "epoch": 3.7058038692461643, "step": 11110 }, { "epoch": 3.7058038692461643, "ref_ce_loss": 0.06109092757105827, "step": 11110 }, { "epoch": 3.7058038692461643, "loss": 0.6448087096214294, "step": 11110 }, { "ce_loss": 0.10397692024707794, "epoch": 3.7058038692461643, "step": 11110 }, { "distill_loss": 0.1309659481048584, "epoch": 3.7058038692461643, "step": 11110 }, { "epoch": 3.7058038692461643, "ref_ce_loss": 0.08222094178199768, "step": 11110 }, { "epoch": 3.7058038692461643, "loss": 0.4834359288215637, "step": 11110 }, { "ce_loss": 0.1207953616976738, "epoch": 3.7058038692461643, "step": 11110 }, { "distill_loss": 0.1629929095506668, "epoch": 3.7058038692461643, "step": 11110 }, { "epoch": 3.7058038692461643, "ref_ce_loss": 0.11464487761259079, "step": 11110 }, { "epoch": 3.7058038692461643, "loss": 0.2799364924430847, "step": 11110 }, { "ce_loss": 0.04275348037481308, "epoch": 3.7058038692461643, "step": 11110 }, { "distill_loss": 0.12101711332798004, "epoch": 3.7058038692461643, "step": 11110 }, { "epoch": 3.7058038692461643, "ref_ce_loss": 0.07686648517847061, "step": 11110 }, { "epoch": 3.7091394262841897, "loss": 0.4836, "step": 11120 }, { "epoch": 3.7091394262841897, "grad_norm": 2.729823589324951, "step": 11120 }, { "epoch": 3.7091394262841897, "learning_rate": 0.0001748406020064708, "step": 11120 }, { "epoch": 3.7091394262841897, "loss": 0.4658726155757904, "step": 11120 }, { "ce_loss": 0.1476747840642929, "epoch": 3.7091394262841897, "step": 11120 }, { "distill_loss": 0.21464860439300537, "epoch": 3.7091394262841897, "step": 11120 }, { "epoch": 3.7091394262841897, "ref_ce_loss": 0.08882784843444824, "step": 11120 }, { "epoch": 3.7091394262841897, "loss": 0.37286797165870667, "step": 11120 }, { "ce_loss": 0.05277108773589134, "epoch": 3.7091394262841897, "step": 11120 }, { "distill_loss": 0.21334585547447205, "epoch": 3.7091394262841897, "step": 11120 }, { "epoch": 3.7091394262841897, "ref_ce_loss": 0.07608553767204285, "step": 11120 }, { "epoch": 3.7091394262841897, "loss": 0.32241755723953247, "step": 11120 }, { "ce_loss": 0.07858740538358688, "epoch": 3.7091394262841897, "step": 11120 }, { "distill_loss": 0.10054327547550201, "epoch": 3.7091394262841897, "step": 11120 }, { "epoch": 3.7091394262841897, "ref_ce_loss": 0.09804639965295792, "step": 11120 }, { "epoch": 3.7091394262841897, "loss": 0.35880106687545776, "step": 11120 }, { "ce_loss": 0.08212389796972275, "epoch": 3.7091394262841897, "step": 11120 }, { "distill_loss": 0.12222301214933395, "epoch": 3.7091394262841897, "step": 11120 }, { "epoch": 3.7091394262841897, "ref_ce_loss": 0.11375194042921066, "step": 11120 }, { "epoch": 3.712474983322215, "loss": 0.4209, "step": 11130 }, { "epoch": 3.712474983322215, "grad_norm": 2.9692301750183105, "step": 11130 }, { "epoch": 3.712474983322215, "learning_rate": 0.0001746408148943285, "step": 11130 }, { "epoch": 3.712474983322215, "loss": 0.53624027967453, "step": 11130 }, { "ce_loss": 0.13671307265758514, "epoch": 3.712474983322215, "step": 11130 }, { "distill_loss": 0.11101743578910828, "epoch": 3.712474983322215, "step": 11130 }, { "epoch": 3.712474983322215, "ref_ce_loss": 0.11485659331083298, "step": 11130 }, { "epoch": 3.712474983322215, "loss": 0.5098305940628052, "step": 11130 }, { "ce_loss": 0.17487110197544098, "epoch": 3.712474983322215, "step": 11130 }, { "distill_loss": 0.1443914771080017, "epoch": 3.712474983322215, "step": 11130 }, { "epoch": 3.712474983322215, "ref_ce_loss": 0.12564757466316223, "step": 11130 }, { "epoch": 3.712474983322215, "loss": 0.38041433691978455, "step": 11130 }, { "ce_loss": 0.11597348749637604, "epoch": 3.712474983322215, "step": 11130 }, { "distill_loss": 0.09810422360897064, "epoch": 3.712474983322215, "step": 11130 }, { "epoch": 3.712474983322215, "ref_ce_loss": 0.10537905246019363, "step": 11130 }, { "epoch": 3.712474983322215, "loss": 0.4296853244304657, "step": 11130 }, { "ce_loss": 0.15972864627838135, "epoch": 3.712474983322215, "step": 11130 }, { "distill_loss": 0.13555540144443512, "epoch": 3.712474983322215, "step": 11130 }, { "epoch": 3.712474983322215, "ref_ce_loss": 0.13421572744846344, "step": 11130 }, { "epoch": 3.7158105403602404, "loss": 0.5083, "step": 11140 }, { "epoch": 3.7158105403602404, "grad_norm": 2.798766851425171, "step": 11140 }, { "epoch": 3.7158105403602404, "learning_rate": 0.00017444098284705983, "step": 11140 }, { "epoch": 3.7158105403602404, "loss": 0.32998281717300415, "step": 11140 }, { "ce_loss": 0.0770777240395546, "epoch": 3.7158105403602404, "step": 11140 }, { "distill_loss": 0.14420868456363678, "epoch": 3.7158105403602404, "step": 11140 }, { "epoch": 3.7158105403602404, "ref_ce_loss": 0.06987477093935013, "step": 11140 }, { "epoch": 3.7158105403602404, "loss": 0.30745163559913635, "step": 11140 }, { "ce_loss": 0.10314347594976425, "epoch": 3.7158105403602404, "step": 11140 }, { "distill_loss": 0.12897643446922302, "epoch": 3.7158105403602404, "step": 11140 }, { "epoch": 3.7158105403602404, "ref_ce_loss": 0.07500158250331879, "step": 11140 }, { "epoch": 3.7158105403602404, "loss": 0.5746026635169983, "step": 11140 }, { "ce_loss": 0.20710504055023193, "epoch": 3.7158105403602404, "step": 11140 }, { "distill_loss": 0.14439168572425842, "epoch": 3.7158105403602404, "step": 11140 }, { "epoch": 3.7158105403602404, "ref_ce_loss": 0.15772093832492828, "step": 11140 }, { "epoch": 3.7158105403602404, "loss": 0.4745178520679474, "step": 11140 }, { "ce_loss": 0.10678113996982574, "epoch": 3.7158105403602404, "step": 11140 }, { "distill_loss": 0.1293291449546814, "epoch": 3.7158105403602404, "step": 11140 }, { "epoch": 3.7158105403602404, "ref_ce_loss": 0.10236824303865433, "step": 11140 }, { "epoch": 3.7191460973982657, "loss": 0.4361, "step": 11150 }, { "epoch": 3.7191460973982657, "grad_norm": 3.206498861312866, "step": 11150 }, { "epoch": 3.7191460973982657, "learning_rate": 0.0001742411062290796, "step": 11150 }, { "epoch": 3.7191460973982657, "loss": 0.4355878531932831, "step": 11150 }, { "ce_loss": 0.12951213121414185, "epoch": 3.7191460973982657, "step": 11150 }, { "distill_loss": 0.1468600034713745, "epoch": 3.7191460973982657, "step": 11150 }, { "epoch": 3.7191460973982657, "ref_ce_loss": 0.12128368765115738, "step": 11150 }, { "epoch": 3.7191460973982657, "loss": 0.5208489298820496, "step": 11150 }, { "ce_loss": 0.18008677661418915, "epoch": 3.7191460973982657, "step": 11150 }, { "distill_loss": 0.1475437879562378, "epoch": 3.7191460973982657, "step": 11150 }, { "epoch": 3.7191460973982657, "ref_ce_loss": 0.12079443037509918, "step": 11150 }, { "epoch": 3.7191460973982657, "loss": 0.18894127011299133, "step": 11150 }, { "ce_loss": 0.04160792753100395, "epoch": 3.7191460973982657, "step": 11150 }, { "distill_loss": 0.061185117810964584, "epoch": 3.7191460973982657, "step": 11150 }, { "epoch": 3.7191460973982657, "ref_ce_loss": 0.08577242493629456, "step": 11150 }, { "epoch": 3.7191460973982657, "loss": 0.3723412752151489, "step": 11150 }, { "ce_loss": 0.10669536143541336, "epoch": 3.7191460973982657, "step": 11150 }, { "distill_loss": 0.09243173897266388, "epoch": 3.7191460973982657, "step": 11150 }, { "epoch": 3.7191460973982657, "ref_ce_loss": 0.11789239197969437, "step": 11150 }, { "epoch": 3.722481654436291, "loss": 0.4262, "step": 11160 }, { "epoch": 3.722481654436291, "grad_norm": 2.0635910034179688, "step": 11160 }, { "epoch": 3.722481654436291, "learning_rate": 0.00017404118540488396, "step": 11160 }, { "epoch": 3.722481654436291, "loss": 0.4313686788082123, "step": 11160 }, { "ce_loss": 0.12093643099069595, "epoch": 3.722481654436291, "step": 11160 }, { "distill_loss": 0.1406337320804596, "epoch": 3.722481654436291, "step": 11160 }, { "epoch": 3.722481654436291, "ref_ce_loss": 0.10394150018692017, "step": 11160 }, { "epoch": 3.722481654436291, "loss": 0.5439311265945435, "step": 11160 }, { "ce_loss": 0.14441914856433868, "epoch": 3.722481654436291, "step": 11160 }, { "distill_loss": 0.18547749519348145, "epoch": 3.722481654436291, "step": 11160 }, { "epoch": 3.722481654436291, "ref_ce_loss": 0.16917400062084198, "step": 11160 }, { "epoch": 3.722481654436291, "loss": 0.4756580591201782, "step": 11160 }, { "ce_loss": 0.0756123811006546, "epoch": 3.722481654436291, "step": 11160 }, { "distill_loss": 0.17472757399082184, "epoch": 3.722481654436291, "step": 11160 }, { "epoch": 3.722481654436291, "ref_ce_loss": 0.10686322301626205, "step": 11160 }, { "epoch": 3.722481654436291, "loss": 0.4173581600189209, "step": 11160 }, { "ce_loss": 0.10815001279115677, "epoch": 3.722481654436291, "step": 11160 }, { "distill_loss": 0.16694800555706024, "epoch": 3.722481654436291, "step": 11160 }, { "epoch": 3.722481654436291, "ref_ce_loss": 0.10757031291723251, "step": 11160 }, { "epoch": 3.7258172114743164, "loss": 0.5187, "step": 11170 }, { "epoch": 3.7258172114743164, "grad_norm": 2.583024501800537, "step": 11170 }, { "epoch": 3.7258172114743164, "learning_rate": 0.00017384122073904964, "step": 11170 }, { "epoch": 3.7258172114743164, "loss": 0.4847860038280487, "step": 11170 }, { "ce_loss": 0.1676764339208603, "epoch": 3.7258172114743164, "step": 11170 }, { "distill_loss": 0.14405414462089539, "epoch": 3.7258172114743164, "step": 11170 }, { "epoch": 3.7258172114743164, "ref_ce_loss": 0.12641099095344543, "step": 11170 }, { "epoch": 3.7258172114743164, "loss": 0.3804580867290497, "step": 11170 }, { "ce_loss": 0.14094428718090057, "epoch": 3.7258172114743164, "step": 11170 }, { "distill_loss": 0.13992449641227722, "epoch": 3.7258172114743164, "step": 11170 }, { "epoch": 3.7258172114743164, "ref_ce_loss": 0.06798779219388962, "step": 11170 }, { "epoch": 3.7258172114743164, "loss": 0.3895184099674225, "step": 11170 }, { "ce_loss": 0.12548105418682098, "epoch": 3.7258172114743164, "step": 11170 }, { "distill_loss": 0.16592243313789368, "epoch": 3.7258172114743164, "step": 11170 }, { "epoch": 3.7258172114743164, "ref_ce_loss": 0.09731275588274002, "step": 11170 }, { "epoch": 3.7258172114743164, "loss": 0.47543710470199585, "step": 11170 }, { "ce_loss": 0.15646733343601227, "epoch": 3.7258172114743164, "step": 11170 }, { "distill_loss": 0.12935645878314972, "epoch": 3.7258172114743164, "step": 11170 }, { "epoch": 3.7258172114743164, "ref_ce_loss": 0.11045189946889877, "step": 11170 }, { "epoch": 3.729152768512342, "loss": 0.4764, "step": 11180 }, { "epoch": 3.729152768512342, "grad_norm": 3.158120632171631, "step": 11180 }, { "epoch": 3.729152768512342, "learning_rate": 0.00017364121259623327, "step": 11180 }, { "epoch": 3.729152768512342, "loss": 0.3678155243396759, "step": 11180 }, { "ce_loss": 0.11589225381612778, "epoch": 3.729152768512342, "step": 11180 }, { "distill_loss": 0.12303879857063293, "epoch": 3.729152768512342, "step": 11180 }, { "epoch": 3.729152768512342, "ref_ce_loss": 0.12825645506381989, "step": 11180 }, { "epoch": 3.729152768512342, "loss": 0.5115286111831665, "step": 11180 }, { "ce_loss": 0.13510024547576904, "epoch": 3.729152768512342, "step": 11180 }, { "distill_loss": 0.22685614228248596, "epoch": 3.729152768512342, "step": 11180 }, { "epoch": 3.729152768512342, "ref_ce_loss": 0.11010359972715378, "step": 11180 }, { "epoch": 3.729152768512342, "loss": 0.4209415316581726, "step": 11180 }, { "ce_loss": 0.17654968798160553, "epoch": 3.729152768512342, "step": 11180 }, { "distill_loss": 0.12133859843015671, "epoch": 3.729152768512342, "step": 11180 }, { "epoch": 3.729152768512342, "ref_ce_loss": 0.09999922662973404, "step": 11180 }, { "epoch": 3.729152768512342, "loss": 0.46903812885284424, "step": 11180 }, { "ce_loss": 0.15544381737709045, "epoch": 3.729152768512342, "step": 11180 }, { "distill_loss": 0.1415010392665863, "epoch": 3.729152768512342, "step": 11180 }, { "epoch": 3.729152768512342, "ref_ce_loss": 0.108729287981987, "step": 11180 }, { "epoch": 3.732488325550367, "loss": 0.4928, "step": 11190 }, { "epoch": 3.732488325550367, "grad_norm": 3.1810953617095947, "step": 11190 }, { "epoch": 3.732488325550367, "learning_rate": 0.0001734411613411708, "step": 11190 }, { "epoch": 3.732488325550367, "loss": 0.46836310625076294, "step": 11190 }, { "ce_loss": 0.10594028979539871, "epoch": 3.732488325550367, "step": 11190 }, { "distill_loss": 0.13372491300106049, "epoch": 3.732488325550367, "step": 11190 }, { "epoch": 3.732488325550367, "ref_ce_loss": 0.08447429537773132, "step": 11190 }, { "epoch": 3.732488325550367, "loss": 0.4483862519264221, "step": 11190 }, { "ce_loss": 0.09653332829475403, "epoch": 3.732488325550367, "step": 11190 }, { "distill_loss": 0.13661277294158936, "epoch": 3.732488325550367, "step": 11190 }, { "epoch": 3.732488325550367, "ref_ce_loss": 0.10244203358888626, "step": 11190 }, { "epoch": 3.732488325550367, "loss": 0.5003663897514343, "step": 11190 }, { "ce_loss": 0.14703114330768585, "epoch": 3.732488325550367, "step": 11190 }, { "distill_loss": 0.18907728791236877, "epoch": 3.732488325550367, "step": 11190 }, { "epoch": 3.732488325550367, "ref_ce_loss": 0.12094144523143768, "step": 11190 }, { "epoch": 3.732488325550367, "loss": 0.7119038105010986, "step": 11190 }, { "ce_loss": 0.1835232377052307, "epoch": 3.732488325550367, "step": 11190 }, { "distill_loss": 0.20460136234760284, "epoch": 3.732488325550367, "step": 11190 }, { "epoch": 3.732488325550367, "ref_ce_loss": 0.10444125533103943, "step": 11190 }, { "epoch": 3.7358238825883925, "loss": 0.4707, "step": 11200 }, { "epoch": 3.7358238825883925, "grad_norm": 2.984477996826172, "step": 11200 }, { "epoch": 3.7358238825883925, "learning_rate": 0.00017324106733867687, "step": 11200 }, { "epoch": 3.7358238825883925, "loss": 0.351445734500885, "step": 11200 }, { "ce_loss": 0.09723985940217972, "epoch": 3.7358238825883925, "step": 11200 }, { "distill_loss": 0.11793617159128189, "epoch": 3.7358238825883925, "step": 11200 }, { "epoch": 3.7358238825883925, "ref_ce_loss": 0.07710584253072739, "step": 11200 }, { "epoch": 3.7358238825883925, "loss": 0.6089062690734863, "step": 11200 }, { "ce_loss": 0.16752246022224426, "epoch": 3.7358238825883925, "step": 11200 }, { "distill_loss": 0.12733298540115356, "epoch": 3.7358238825883925, "step": 11200 }, { "epoch": 3.7358238825883925, "ref_ce_loss": 0.1492699831724167, "step": 11200 }, { "epoch": 3.7358238825883925, "loss": 0.38957151770591736, "step": 11200 }, { "ce_loss": 0.13466042280197144, "epoch": 3.7358238825883925, "step": 11200 }, { "distill_loss": 0.12702538073062897, "epoch": 3.7358238825883925, "step": 11200 }, { "epoch": 3.7358238825883925, "ref_ce_loss": 0.08084917068481445, "step": 11200 }, { "epoch": 3.7358238825883925, "loss": 0.48998987674713135, "step": 11200 }, { "ce_loss": 0.1286323070526123, "epoch": 3.7358238825883925, "step": 11200 }, { "distill_loss": 0.11189994215965271, "epoch": 3.7358238825883925, "step": 11200 }, { "epoch": 3.7358238825883925, "ref_ce_loss": 0.11733870953321457, "step": 11200 }, { "epoch": 3.739159439626418, "loss": 0.4959, "step": 11210 }, { "epoch": 3.739159439626418, "grad_norm": 2.2686338424682617, "step": 11210 }, { "epoch": 3.739159439626418, "learning_rate": 0.0001730409309536439, "step": 11210 }, { "epoch": 3.739159439626418, "loss": 0.7203350067138672, "step": 11210 }, { "ce_loss": 0.12428060919046402, "epoch": 3.739159439626418, "step": 11210 }, { "distill_loss": 0.13582146167755127, "epoch": 3.739159439626418, "step": 11210 }, { "epoch": 3.739159439626418, "ref_ce_loss": 0.10568521171808243, "step": 11210 }, { "epoch": 3.739159439626418, "loss": 0.6753644943237305, "step": 11210 }, { "ce_loss": 0.12828564643859863, "epoch": 3.739159439626418, "step": 11210 }, { "distill_loss": 0.20919077098369598, "epoch": 3.739159439626418, "step": 11210 }, { "epoch": 3.739159439626418, "ref_ce_loss": 0.11066607385873795, "step": 11210 }, { "epoch": 3.739159439626418, "loss": 1.4700936079025269, "step": 11210 }, { "ce_loss": 0.07830823957920074, "epoch": 3.739159439626418, "step": 11210 }, { "distill_loss": 0.1397976577281952, "epoch": 3.739159439626418, "step": 11210 }, { "epoch": 3.739159439626418, "ref_ce_loss": 0.06886852532625198, "step": 11210 }, { "epoch": 3.739159439626418, "loss": 0.6401880979537964, "step": 11210 }, { "ce_loss": 0.07730946689844131, "epoch": 3.739159439626418, "step": 11210 }, { "distill_loss": 0.11617650091648102, "epoch": 3.739159439626418, "step": 11210 }, { "epoch": 3.739159439626418, "ref_ce_loss": 0.0711132138967514, "step": 11210 }, { "epoch": 3.742494996664443, "loss": 0.5192, "step": 11220 }, { "epoch": 3.742494996664443, "grad_norm": 3.824856758117676, "step": 11220 }, { "epoch": 3.742494996664443, "learning_rate": 0.00017284075255104186, "step": 11220 }, { "epoch": 3.742494996664443, "loss": 0.433307409286499, "step": 11220 }, { "ce_loss": 0.12180997431278229, "epoch": 3.742494996664443, "step": 11220 }, { "distill_loss": 0.1235133558511734, "epoch": 3.742494996664443, "step": 11220 }, { "epoch": 3.742494996664443, "ref_ce_loss": 0.09671497344970703, "step": 11220 }, { "epoch": 3.742494996664443, "loss": 0.42754268646240234, "step": 11220 }, { "ce_loss": 0.15168742835521698, "epoch": 3.742494996664443, "step": 11220 }, { "distill_loss": 0.1420907974243164, "epoch": 3.742494996664443, "step": 11220 }, { "epoch": 3.742494996664443, "ref_ce_loss": 0.09833323955535889, "step": 11220 }, { "epoch": 3.742494996664443, "loss": 0.4262707531452179, "step": 11220 }, { "ce_loss": 0.16959330439567566, "epoch": 3.742494996664443, "step": 11220 }, { "distill_loss": 0.1454181671142578, "epoch": 3.742494996664443, "step": 11220 }, { "epoch": 3.742494996664443, "ref_ce_loss": 0.08515819907188416, "step": 11220 }, { "epoch": 3.742494996664443, "loss": 0.34843114018440247, "step": 11220 }, { "ce_loss": 0.0687236413359642, "epoch": 3.742494996664443, "step": 11220 }, { "distill_loss": 0.14176149666309357, "epoch": 3.742494996664443, "step": 11220 }, { "epoch": 3.742494996664443, "ref_ce_loss": 0.09360263496637344, "step": 11220 }, { "epoch": 3.7458305537024685, "loss": 0.4694, "step": 11230 }, { "epoch": 3.7458305537024685, "grad_norm": 4.92000675201416, "step": 11230 }, { "epoch": 3.7458305537024685, "learning_rate": 0.00017264053249591704, "step": 11230 }, { "epoch": 3.7458305537024685, "loss": 0.609779953956604, "step": 11230 }, { "ce_loss": 0.1590002477169037, "epoch": 3.7458305537024685, "step": 11230 }, { "distill_loss": 0.14490337669849396, "epoch": 3.7458305537024685, "step": 11230 }, { "epoch": 3.7458305537024685, "ref_ce_loss": 0.14144301414489746, "step": 11230 }, { "epoch": 3.7458305537024685, "loss": 0.47014445066452026, "step": 11230 }, { "ce_loss": 0.1638866811990738, "epoch": 3.7458305537024685, "step": 11230 }, { "distill_loss": 0.14338326454162598, "epoch": 3.7458305537024685, "step": 11230 }, { "epoch": 3.7458305537024685, "ref_ce_loss": 0.12581142783164978, "step": 11230 }, { "epoch": 3.7458305537024685, "loss": 0.3591836094856262, "step": 11230 }, { "ce_loss": 0.11410403251647949, "epoch": 3.7458305537024685, "step": 11230 }, { "distill_loss": 0.13631364703178406, "epoch": 3.7458305537024685, "step": 11230 }, { "epoch": 3.7458305537024685, "ref_ce_loss": 0.0813060849905014, "step": 11230 }, { "epoch": 3.7458305537024685, "loss": 0.5285924077033997, "step": 11230 }, { "ce_loss": 0.2021360844373703, "epoch": 3.7458305537024685, "step": 11230 }, { "distill_loss": 0.19308476150035858, "epoch": 3.7458305537024685, "step": 11230 }, { "epoch": 3.7458305537024685, "ref_ce_loss": 0.10977895557880402, "step": 11230 }, { "epoch": 3.749166110740494, "loss": 0.4772, "step": 11240 }, { "epoch": 3.749166110740494, "grad_norm": 3.521881103515625, "step": 11240 }, { "epoch": 3.749166110740494, "learning_rate": 0.00017244027115339192, "step": 11240 }, { "epoch": 3.749166110740494, "loss": 0.37861761450767517, "step": 11240 }, { "ce_loss": 0.11489034444093704, "epoch": 3.749166110740494, "step": 11240 }, { "distill_loss": 0.12115241587162018, "epoch": 3.749166110740494, "step": 11240 }, { "epoch": 3.749166110740494, "ref_ce_loss": 0.10814513266086578, "step": 11240 }, { "epoch": 3.749166110740494, "loss": 0.42796438932418823, "step": 11240 }, { "ce_loss": 0.13979476690292358, "epoch": 3.749166110740494, "step": 11240 }, { "distill_loss": 0.12097348272800446, "epoch": 3.749166110740494, "step": 11240 }, { "epoch": 3.749166110740494, "ref_ce_loss": 0.0843663215637207, "step": 11240 }, { "epoch": 3.749166110740494, "loss": 0.3992958068847656, "step": 11240 }, { "ce_loss": 0.10800452530384064, "epoch": 3.749166110740494, "step": 11240 }, { "distill_loss": 0.17938587069511414, "epoch": 3.749166110740494, "step": 11240 }, { "epoch": 3.749166110740494, "ref_ce_loss": 0.07487044483423233, "step": 11240 }, { "epoch": 3.749166110740494, "loss": 0.808132529258728, "step": 11240 }, { "ce_loss": 0.1502559930086136, "epoch": 3.749166110740494, "step": 11240 }, { "distill_loss": 0.1359129697084427, "epoch": 3.749166110740494, "step": 11240 }, { "epoch": 3.749166110740494, "ref_ce_loss": 0.11720101535320282, "step": 11240 }, { "epoch": 3.7525016677785192, "loss": 0.4801, "step": 11250 }, { "epoch": 3.7525016677785192, "grad_norm": 3.7433605194091797, "step": 11250 }, { "epoch": 3.7525016677785192, "learning_rate": 0.00017223996888866423, "step": 11250 }, { "epoch": 3.7525016677785192, "loss": 0.4895704984664917, "step": 11250 }, { "ce_loss": 0.186688631772995, "epoch": 3.7525016677785192, "step": 11250 }, { "distill_loss": 0.12781491875648499, "epoch": 3.7525016677785192, "step": 11250 }, { "epoch": 3.7525016677785192, "ref_ce_loss": 0.12212307751178741, "step": 11250 }, { "epoch": 3.7525016677785192, "loss": 0.4208083152770996, "step": 11250 }, { "ce_loss": 0.10923559218645096, "epoch": 3.7525016677785192, "step": 11250 }, { "distill_loss": 0.134412482380867, "epoch": 3.7525016677785192, "step": 11250 }, { "epoch": 3.7525016677785192, "ref_ce_loss": 0.14637266099452972, "step": 11250 }, { "epoch": 3.7525016677785192, "loss": 0.48236986994743347, "step": 11250 }, { "ce_loss": 0.13481567800045013, "epoch": 3.7525016677785192, "step": 11250 }, { "distill_loss": 0.12056757509708405, "epoch": 3.7525016677785192, "step": 11250 }, { "epoch": 3.7525016677785192, "ref_ce_loss": 0.11300322413444519, "step": 11250 }, { "epoch": 3.7525016677785192, "loss": 0.33931252360343933, "step": 11250 }, { "ce_loss": 0.09116953611373901, "epoch": 3.7525016677785192, "step": 11250 }, { "distill_loss": 0.15315291285514832, "epoch": 3.7525016677785192, "step": 11250 }, { "epoch": 3.7525016677785192, "ref_ce_loss": 0.09439779818058014, "step": 11250 }, { "epoch": 3.7558372248165446, "loss": 0.4252, "step": 11260 }, { "epoch": 3.7558372248165446, "grad_norm": 2.2029407024383545, "step": 11260 }, { "epoch": 3.7558372248165446, "learning_rate": 0.00017203962606700618, "step": 11260 }, { "epoch": 3.7558372248165446, "loss": 0.7249472141265869, "step": 11260 }, { "ce_loss": 0.09998872131109238, "epoch": 3.7558372248165446, "step": 11260 }, { "distill_loss": 0.1515664905309677, "epoch": 3.7558372248165446, "step": 11260 }, { "epoch": 3.7558372248165446, "ref_ce_loss": 0.12862011790275574, "step": 11260 }, { "epoch": 3.7558372248165446, "loss": 0.5033693909645081, "step": 11260 }, { "ce_loss": 0.20025144517421722, "epoch": 3.7558372248165446, "step": 11260 }, { "distill_loss": 0.13580387830734253, "epoch": 3.7558372248165446, "step": 11260 }, { "epoch": 3.7558372248165446, "ref_ce_loss": 0.13315941393375397, "step": 11260 }, { "epoch": 3.7558372248165446, "loss": 0.3584093451499939, "step": 11260 }, { "ce_loss": 0.12843570113182068, "epoch": 3.7558372248165446, "step": 11260 }, { "distill_loss": 0.12751731276512146, "epoch": 3.7558372248165446, "step": 11260 }, { "epoch": 3.7558372248165446, "ref_ce_loss": 0.10165061056613922, "step": 11260 }, { "epoch": 3.7558372248165446, "loss": 0.416264146566391, "step": 11260 }, { "ce_loss": 0.15564116835594177, "epoch": 3.7558372248165446, "step": 11260 }, { "distill_loss": 0.11993543803691864, "epoch": 3.7558372248165446, "step": 11260 }, { "epoch": 3.7558372248165446, "ref_ce_loss": 0.1401854306459427, "step": 11260 }, { "epoch": 3.75917278185457, "loss": 0.4981, "step": 11270 }, { "epoch": 3.75917278185457, "grad_norm": 4.698095321655273, "step": 11270 }, { "epoch": 3.75917278185457, "learning_rate": 0.00017183924305376415, "step": 11270 }, { "epoch": 3.75917278185457, "loss": 0.7320033311843872, "step": 11270 }, { "ce_loss": 0.2233872264623642, "epoch": 3.75917278185457, "step": 11270 }, { "distill_loss": 0.14781275391578674, "epoch": 3.75917278185457, "step": 11270 }, { "epoch": 3.75917278185457, "ref_ce_loss": 0.13521909713745117, "step": 11270 }, { "epoch": 3.75917278185457, "loss": 0.3893508017063141, "step": 11270 }, { "ce_loss": 0.1054534986615181, "epoch": 3.75917278185457, "step": 11270 }, { "distill_loss": 0.11295004189014435, "epoch": 3.75917278185457, "step": 11270 }, { "epoch": 3.75917278185457, "ref_ce_loss": 0.08586245775222778, "step": 11270 }, { "epoch": 3.75917278185457, "loss": 0.4391804039478302, "step": 11270 }, { "ce_loss": 0.08601868897676468, "epoch": 3.75917278185457, "step": 11270 }, { "distill_loss": 0.13763193786144257, "epoch": 3.75917278185457, "step": 11270 }, { "epoch": 3.75917278185457, "ref_ce_loss": 0.0853324756026268, "step": 11270 }, { "epoch": 3.75917278185457, "loss": 0.33526211977005005, "step": 11270 }, { "ce_loss": 0.03173764422535896, "epoch": 3.75917278185457, "step": 11270 }, { "distill_loss": 0.09609787166118622, "epoch": 3.75917278185457, "step": 11270 }, { "epoch": 3.75917278185457, "ref_ce_loss": 0.0757674053311348, "step": 11270 }, { "epoch": 3.7625083388925953, "loss": 0.4434, "step": 11280 }, { "epoch": 3.7625083388925953, "grad_norm": 2.5927469730377197, "step": 11280 }, { "epoch": 3.7625083388925953, "learning_rate": 0.00017163882021435764, "step": 11280 }, { "epoch": 3.7625083388925953, "loss": 0.4663512706756592, "step": 11280 }, { "ce_loss": 0.15741774439811707, "epoch": 3.7625083388925953, "step": 11280 }, { "distill_loss": 0.14555154740810394, "epoch": 3.7625083388925953, "step": 11280 }, { "epoch": 3.7625083388925953, "ref_ce_loss": 0.10073420405387878, "step": 11280 }, { "epoch": 3.7625083388925953, "loss": 0.36396872997283936, "step": 11280 }, { "ce_loss": 0.11238844692707062, "epoch": 3.7625083388925953, "step": 11280 }, { "distill_loss": 0.1512582004070282, "epoch": 3.7625083388925953, "step": 11280 }, { "epoch": 3.7625083388925953, "ref_ce_loss": 0.10009444504976273, "step": 11280 }, { "epoch": 3.7625083388925953, "loss": 0.3391995131969452, "step": 11280 }, { "ce_loss": 0.11951491981744766, "epoch": 3.7625083388925953, "step": 11280 }, { "distill_loss": 0.10620466619729996, "epoch": 3.7625083388925953, "step": 11280 }, { "epoch": 3.7625083388925953, "ref_ce_loss": 0.07334575802087784, "step": 11280 }, { "epoch": 3.7625083388925953, "loss": 0.4804520606994629, "step": 11280 }, { "ce_loss": 0.07516561448574066, "epoch": 3.7625083388925953, "step": 11280 }, { "distill_loss": 0.1316811740398407, "epoch": 3.7625083388925953, "step": 11280 }, { "epoch": 3.7625083388925953, "ref_ce_loss": 0.08692770451307297, "step": 11280 }, { "epoch": 3.7658438959306206, "loss": 0.5076, "step": 11290 }, { "epoch": 3.7658438959306206, "grad_norm": 4.046371936798096, "step": 11290 }, { "epoch": 3.7658438959306206, "learning_rate": 0.00017143835791427888, "step": 11290 }, { "epoch": 3.7658438959306206, "loss": 0.6690834164619446, "step": 11290 }, { "ce_loss": 0.15648917853832245, "epoch": 3.7658438959306206, "step": 11290 }, { "distill_loss": 0.188400000333786, "epoch": 3.7658438959306206, "step": 11290 }, { "epoch": 3.7658438959306206, "ref_ce_loss": 0.10655590146780014, "step": 11290 }, { "epoch": 3.7658438959306206, "loss": 0.24589793384075165, "step": 11290 }, { "ce_loss": 0.04482785612344742, "epoch": 3.7658438959306206, "step": 11290 }, { "distill_loss": 0.09695935249328613, "epoch": 3.7658438959306206, "step": 11290 }, { "epoch": 3.7658438959306206, "ref_ce_loss": 0.06211121380329132, "step": 11290 }, { "epoch": 3.7658438959306206, "loss": 0.24969103932380676, "step": 11290 }, { "ce_loss": 0.061918340623378754, "epoch": 3.7658438959306206, "step": 11290 }, { "distill_loss": 0.09989786893129349, "epoch": 3.7658438959306206, "step": 11290 }, { "epoch": 3.7658438959306206, "ref_ce_loss": 0.08783142268657684, "step": 11290 }, { "epoch": 3.7658438959306206, "loss": 0.3797447085380554, "step": 11290 }, { "ce_loss": 0.09772893786430359, "epoch": 3.7658438959306206, "step": 11290 }, { "distill_loss": 0.13673144578933716, "epoch": 3.7658438959306206, "step": 11290 }, { "epoch": 3.7658438959306206, "ref_ce_loss": 0.09559912979602814, "step": 11290 }, { "epoch": 3.769179452968646, "loss": 0.4614, "step": 11300 }, { "epoch": 3.769179452968646, "grad_norm": 3.4041154384613037, "step": 11300 }, { "epoch": 3.769179452968646, "learning_rate": 0.0001712378565190921, "step": 11300 }, { "epoch": 3.769179452968646, "loss": 0.37293708324432373, "step": 11300 }, { "ce_loss": 0.09539689123630524, "epoch": 3.769179452968646, "step": 11300 }, { "distill_loss": 0.153346985578537, "epoch": 3.769179452968646, "step": 11300 }, { "epoch": 3.769179452968646, "ref_ce_loss": 0.08379501849412918, "step": 11300 }, { "epoch": 3.769179452968646, "loss": 0.3446539044380188, "step": 11300 }, { "ce_loss": 0.10582612454891205, "epoch": 3.769179452968646, "step": 11300 }, { "distill_loss": 0.12128449976444244, "epoch": 3.769179452968646, "step": 11300 }, { "epoch": 3.769179452968646, "ref_ce_loss": 0.09659599512815475, "step": 11300 }, { "epoch": 3.769179452968646, "loss": 0.7190386056900024, "step": 11300 }, { "ce_loss": 0.08890789747238159, "epoch": 3.769179452968646, "step": 11300 }, { "distill_loss": 0.12199924141168594, "epoch": 3.769179452968646, "step": 11300 }, { "epoch": 3.769179452968646, "ref_ce_loss": 0.12765192985534668, "step": 11300 }, { "epoch": 3.769179452968646, "loss": 0.41915977001190186, "step": 11300 }, { "ce_loss": 0.13943921029567719, "epoch": 3.769179452968646, "step": 11300 }, { "distill_loss": 0.16844049096107483, "epoch": 3.769179452968646, "step": 11300 }, { "epoch": 3.769179452968646, "ref_ce_loss": 0.11106761544942856, "step": 11300 }, { "epoch": 3.7725150100066713, "loss": 0.4793, "step": 11310 }, { "epoch": 3.7725150100066713, "grad_norm": 4.181412696838379, "step": 11310 }, { "epoch": 3.7725150100066713, "learning_rate": 0.0001710373163944326, "step": 11310 }, { "epoch": 3.7725150100066713, "loss": 0.48942622542381287, "step": 11310 }, { "ce_loss": 0.17593348026275635, "epoch": 3.7725150100066713, "step": 11310 }, { "distill_loss": 0.13092395663261414, "epoch": 3.7725150100066713, "step": 11310 }, { "epoch": 3.7725150100066713, "ref_ce_loss": 0.10657544434070587, "step": 11310 }, { "epoch": 3.7725150100066713, "loss": 0.2934868335723877, "step": 11310 }, { "ce_loss": 0.07878759503364563, "epoch": 3.7725150100066713, "step": 11310 }, { "distill_loss": 0.09831201285123825, "epoch": 3.7725150100066713, "step": 11310 }, { "epoch": 3.7725150100066713, "ref_ce_loss": 0.08740630000829697, "step": 11310 }, { "epoch": 3.7725150100066713, "loss": 0.3965328633785248, "step": 11310 }, { "ce_loss": 0.09016729146242142, "epoch": 3.7725150100066713, "step": 11310 }, { "distill_loss": 0.09539443999528885, "epoch": 3.7725150100066713, "step": 11310 }, { "epoch": 3.7725150100066713, "ref_ce_loss": 0.07570146769285202, "step": 11310 }, { "epoch": 3.7725150100066713, "loss": 0.5012593269348145, "step": 11310 }, { "ce_loss": 0.1913784295320511, "epoch": 3.7725150100066713, "step": 11310 }, { "distill_loss": 0.16016805171966553, "epoch": 3.7725150100066713, "step": 11310 }, { "epoch": 3.7725150100066713, "ref_ce_loss": 0.11406117677688599, "step": 11310 }, { "epoch": 3.7758505670446967, "loss": 0.486, "step": 11320 }, { "epoch": 3.7758505670446967, "grad_norm": 3.193751573562622, "step": 11320 }, { "epoch": 3.7758505670446967, "learning_rate": 0.00017083673790600648, "step": 11320 }, { "epoch": 3.7758505670446967, "loss": 1.0421459674835205, "step": 11320 }, { "ce_loss": 0.145387202501297, "epoch": 3.7758505670446967, "step": 11320 }, { "distill_loss": 0.19037388265132904, "epoch": 3.7758505670446967, "step": 11320 }, { "epoch": 3.7758505670446967, "ref_ce_loss": 0.10654689371585846, "step": 11320 }, { "epoch": 3.7758505670446967, "loss": 0.6212438344955444, "step": 11320 }, { "ce_loss": 0.14684922993183136, "epoch": 3.7758505670446967, "step": 11320 }, { "distill_loss": 0.1758728325366974, "epoch": 3.7758505670446967, "step": 11320 }, { "epoch": 3.7758505670446967, "ref_ce_loss": 0.12208641320466995, "step": 11320 }, { "epoch": 3.7758505670446967, "loss": 0.6268714070320129, "step": 11320 }, { "ce_loss": 0.20063500106334686, "epoch": 3.7758505670446967, "step": 11320 }, { "distill_loss": 0.16264456510543823, "epoch": 3.7758505670446967, "step": 11320 }, { "epoch": 3.7758505670446967, "ref_ce_loss": 0.07537572830915451, "step": 11320 }, { "epoch": 3.7758505670446967, "loss": 0.6568611860275269, "step": 11320 }, { "ce_loss": 0.1356017291545868, "epoch": 3.7758505670446967, "step": 11320 }, { "distill_loss": 0.17617285251617432, "epoch": 3.7758505670446967, "step": 11320 }, { "epoch": 3.7758505670446967, "ref_ce_loss": 0.09102947264909744, "step": 11320 }, { "epoch": 3.779186124082722, "loss": 0.5344, "step": 11330 }, { "epoch": 3.779186124082722, "grad_norm": 2.6559865474700928, "step": 11330 }, { "epoch": 3.779186124082722, "learning_rate": 0.00017063612141958996, "step": 11330 }, { "epoch": 3.779186124082722, "loss": 0.5010161399841309, "step": 11330 }, { "ce_loss": 0.17269544303417206, "epoch": 3.779186124082722, "step": 11330 }, { "distill_loss": 0.1870214194059372, "epoch": 3.779186124082722, "step": 11330 }, { "epoch": 3.779186124082722, "ref_ce_loss": 0.09947448968887329, "step": 11330 }, { "epoch": 3.779186124082722, "loss": 0.9882927536964417, "step": 11330 }, { "ce_loss": 0.2070547342300415, "epoch": 3.779186124082722, "step": 11330 }, { "distill_loss": 0.2346445918083191, "epoch": 3.779186124082722, "step": 11330 }, { "epoch": 3.779186124082722, "ref_ce_loss": 0.1403355449438095, "step": 11330 }, { "epoch": 3.779186124082722, "loss": 0.37922123074531555, "step": 11330 }, { "ce_loss": 0.10499770939350128, "epoch": 3.779186124082722, "step": 11330 }, { "distill_loss": 0.17015215754508972, "epoch": 3.779186124082722, "step": 11330 }, { "epoch": 3.779186124082722, "ref_ce_loss": 0.10365436226129532, "step": 11330 }, { "epoch": 3.779186124082722, "loss": 0.5613137483596802, "step": 11330 }, { "ce_loss": 0.08903953433036804, "epoch": 3.779186124082722, "step": 11330 }, { "distill_loss": 0.16297371685504913, "epoch": 3.779186124082722, "step": 11330 }, { "epoch": 3.779186124082722, "ref_ce_loss": 0.07672325521707535, "step": 11330 }, { "epoch": 3.7825216811207474, "loss": 0.4881, "step": 11340 }, { "epoch": 3.7825216811207474, "grad_norm": 3.2749686241149902, "step": 11340 }, { "epoch": 3.7825216811207474, "learning_rate": 0.00017043546730102823, "step": 11340 }, { "epoch": 3.7825216811207474, "loss": 0.39569780230522156, "step": 11340 }, { "ce_loss": 0.09518209844827652, "epoch": 3.7825216811207474, "step": 11340 }, { "distill_loss": 0.1157209724187851, "epoch": 3.7825216811207474, "step": 11340 }, { "epoch": 3.7825216811207474, "ref_ce_loss": 0.07844128459692001, "step": 11340 }, { "epoch": 3.7825216811207474, "loss": 1.2595678567886353, "step": 11340 }, { "ce_loss": 0.14451389014720917, "epoch": 3.7825216811207474, "step": 11340 }, { "distill_loss": 0.1210441067814827, "epoch": 3.7825216811207474, "step": 11340 }, { "epoch": 3.7825216811207474, "ref_ce_loss": 0.13017107546329498, "step": 11340 }, { "epoch": 3.7825216811207474, "loss": 0.9055213928222656, "step": 11340 }, { "ce_loss": 0.2113693356513977, "epoch": 3.7825216811207474, "step": 11340 }, { "distill_loss": 0.1787041276693344, "epoch": 3.7825216811207474, "step": 11340 }, { "epoch": 3.7825216811207474, "ref_ce_loss": 0.07526680827140808, "step": 11340 }, { "epoch": 3.7825216811207474, "loss": 0.3183319568634033, "step": 11340 }, { "ce_loss": 0.08012717217206955, "epoch": 3.7825216811207474, "step": 11340 }, { "distill_loss": 0.1112750992178917, "epoch": 3.7825216811207474, "step": 11340 }, { "epoch": 3.7825216811207474, "ref_ce_loss": 0.09437817335128784, "step": 11340 }, { "epoch": 3.7858572381587727, "loss": 0.5804, "step": 11350 }, { "epoch": 3.7858572381587727, "grad_norm": 2.631268262863159, "step": 11350 }, { "epoch": 3.7858572381587727, "learning_rate": 0.00017023477591623524, "step": 11350 }, { "epoch": 3.7858572381587727, "loss": 0.5030439496040344, "step": 11350 }, { "ce_loss": 0.09157869219779968, "epoch": 3.7858572381587727, "step": 11350 }, { "distill_loss": 0.11663976311683655, "epoch": 3.7858572381587727, "step": 11350 }, { "epoch": 3.7858572381587727, "ref_ce_loss": 0.11903272569179535, "step": 11350 }, { "epoch": 3.7858572381587727, "loss": 0.467189759016037, "step": 11350 }, { "ce_loss": 0.12028718739748001, "epoch": 3.7858572381587727, "step": 11350 }, { "distill_loss": 0.17001944780349731, "epoch": 3.7858572381587727, "step": 11350 }, { "epoch": 3.7858572381587727, "ref_ce_loss": 0.12809155881404877, "step": 11350 }, { "epoch": 3.7858572381587727, "loss": 0.4767748713493347, "step": 11350 }, { "ce_loss": 0.1660422831773758, "epoch": 3.7858572381587727, "step": 11350 }, { "distill_loss": 0.11231112480163574, "epoch": 3.7858572381587727, "step": 11350 }, { "epoch": 3.7858572381587727, "ref_ce_loss": 0.1354854255914688, "step": 11350 }, { "epoch": 3.7858572381587727, "loss": 0.30572032928466797, "step": 11350 }, { "ce_loss": 0.05527346208691597, "epoch": 3.7858572381587727, "step": 11350 }, { "distill_loss": 0.11191977560520172, "epoch": 3.7858572381587727, "step": 11350 }, { "epoch": 3.7858572381587727, "ref_ce_loss": 0.0689462423324585, "step": 11350 }, { "epoch": 3.789192795196798, "loss": 0.4704, "step": 11360 }, { "epoch": 3.789192795196798, "grad_norm": 2.8327856063842773, "step": 11360 }, { "epoch": 3.789192795196798, "learning_rate": 0.000170034047631193, "step": 11360 }, { "epoch": 3.789192795196798, "loss": 0.3354668617248535, "step": 11360 }, { "ce_loss": 0.08834193646907806, "epoch": 3.789192795196798, "step": 11360 }, { "distill_loss": 0.13074669241905212, "epoch": 3.789192795196798, "step": 11360 }, { "epoch": 3.789192795196798, "ref_ce_loss": 0.0854744091629982, "step": 11360 }, { "epoch": 3.789192795196798, "loss": 0.6441950798034668, "step": 11360 }, { "ce_loss": 0.1427772492170334, "epoch": 3.789192795196798, "step": 11360 }, { "distill_loss": 0.13033153116703033, "epoch": 3.789192795196798, "step": 11360 }, { "epoch": 3.789192795196798, "ref_ce_loss": 0.0994003564119339, "step": 11360 }, { "epoch": 3.789192795196798, "loss": 0.3823297619819641, "step": 11360 }, { "ce_loss": 0.15958751738071442, "epoch": 3.789192795196798, "step": 11360 }, { "distill_loss": 0.11525478959083557, "epoch": 3.789192795196798, "step": 11360 }, { "epoch": 3.789192795196798, "ref_ce_loss": 0.08661943674087524, "step": 11360 }, { "epoch": 3.789192795196798, "loss": 0.41648799180984497, "step": 11360 }, { "ce_loss": 0.0783916637301445, "epoch": 3.789192795196798, "step": 11360 }, { "distill_loss": 0.10013296455144882, "epoch": 3.789192795196798, "step": 11360 }, { "epoch": 3.789192795196798, "ref_ce_loss": 0.15111957490444183, "step": 11360 }, { "epoch": 3.7925283522348234, "loss": 0.5495, "step": 11370 }, { "epoch": 3.7925283522348234, "grad_norm": 5.2707719802856445, "step": 11370 }, { "epoch": 3.7925283522348234, "learning_rate": 0.0001698332828119506, "step": 11370 }, { "epoch": 3.7925283522348234, "loss": 0.8329063057899475, "step": 11370 }, { "ce_loss": 0.0788368433713913, "epoch": 3.7925283522348234, "step": 11370 }, { "distill_loss": 0.1925995647907257, "epoch": 3.7925283522348234, "step": 11370 }, { "epoch": 3.7925283522348234, "ref_ce_loss": 0.15627038478851318, "step": 11370 }, { "epoch": 3.7925283522348234, "loss": 0.4649633467197418, "step": 11370 }, { "ce_loss": 0.1610284000635147, "epoch": 3.7925283522348234, "step": 11370 }, { "distill_loss": 0.12861159443855286, "epoch": 3.7925283522348234, "step": 11370 }, { "epoch": 3.7925283522348234, "ref_ce_loss": 0.07107669115066528, "step": 11370 }, { "epoch": 3.7925283522348234, "loss": 0.44461050629615784, "step": 11370 }, { "ce_loss": 0.13306909799575806, "epoch": 3.7925283522348234, "step": 11370 }, { "distill_loss": 0.1817498505115509, "epoch": 3.7925283522348234, "step": 11370 }, { "epoch": 3.7925283522348234, "ref_ce_loss": 0.11042030155658722, "step": 11370 }, { "epoch": 3.7925283522348234, "loss": 0.605265736579895, "step": 11370 }, { "ce_loss": 0.22842943668365479, "epoch": 3.7925283522348234, "step": 11370 }, { "distill_loss": 0.18195891380310059, "epoch": 3.7925283522348234, "step": 11370 }, { "epoch": 3.7925283522348234, "ref_ce_loss": 0.16252247989177704, "step": 11370 }, { "epoch": 3.795863909272849, "loss": 0.4597, "step": 11380 }, { "epoch": 3.795863909272849, "grad_norm": 2.723623037338257, "step": 11380 }, { "epoch": 3.795863909272849, "learning_rate": 0.00016963248182462397, "step": 11380 }, { "epoch": 3.795863909272849, "loss": 0.5169832110404968, "step": 11380 }, { "ce_loss": 0.11923842132091522, "epoch": 3.795863909272849, "step": 11380 }, { "distill_loss": 0.1981431394815445, "epoch": 3.795863909272849, "step": 11380 }, { "epoch": 3.795863909272849, "ref_ce_loss": 0.08852661401033401, "step": 11380 }, { "epoch": 3.795863909272849, "loss": 0.38678765296936035, "step": 11380 }, { "ce_loss": 0.12697753310203552, "epoch": 3.795863909272849, "step": 11380 }, { "distill_loss": 0.14126920700073242, "epoch": 3.795863909272849, "step": 11380 }, { "epoch": 3.795863909272849, "ref_ce_loss": 0.05947829782962799, "step": 11380 }, { "epoch": 3.795863909272849, "loss": 0.5108277201652527, "step": 11380 }, { "ce_loss": 0.0795733779668808, "epoch": 3.795863909272849, "step": 11380 }, { "distill_loss": 0.12557706236839294, "epoch": 3.795863909272849, "step": 11380 }, { "epoch": 3.795863909272849, "ref_ce_loss": 0.09753233939409256, "step": 11380 }, { "epoch": 3.795863909272849, "loss": 0.4048956632614136, "step": 11380 }, { "ce_loss": 0.13374558091163635, "epoch": 3.795863909272849, "step": 11380 }, { "distill_loss": 0.15967635810375214, "epoch": 3.795863909272849, "step": 11380 }, { "epoch": 3.795863909272849, "ref_ce_loss": 0.08542696386575699, "step": 11380 }, { "epoch": 3.799199466310874, "loss": 0.5665, "step": 11390 }, { "epoch": 3.799199466310874, "grad_norm": 3.3278257846832275, "step": 11390 }, { "epoch": 3.799199466310874, "learning_rate": 0.00016943164503539491, "step": 11390 }, { "epoch": 3.799199466310874, "loss": 0.45715150237083435, "step": 11390 }, { "ce_loss": 0.12858489155769348, "epoch": 3.799199466310874, "step": 11390 }, { "distill_loss": 0.15878446400165558, "epoch": 3.799199466310874, "step": 11390 }, { "epoch": 3.799199466310874, "ref_ce_loss": 0.13602490723133087, "step": 11390 }, { "epoch": 3.799199466310874, "loss": 0.5386162996292114, "step": 11390 }, { "ce_loss": 0.15765678882598877, "epoch": 3.799199466310874, "step": 11390 }, { "distill_loss": 0.15718784928321838, "epoch": 3.799199466310874, "step": 11390 }, { "epoch": 3.799199466310874, "ref_ce_loss": 0.15782934427261353, "step": 11390 }, { "epoch": 3.799199466310874, "loss": 0.3341408371925354, "step": 11390 }, { "ce_loss": 0.0596722736954689, "epoch": 3.799199466310874, "step": 11390 }, { "distill_loss": 0.13757136464118958, "epoch": 3.799199466310874, "step": 11390 }, { "epoch": 3.799199466310874, "ref_ce_loss": 0.07602232694625854, "step": 11390 }, { "epoch": 3.799199466310874, "loss": 0.6375399231910706, "step": 11390 }, { "ce_loss": 0.21162016689777374, "epoch": 3.799199466310874, "step": 11390 }, { "distill_loss": 0.20884963870048523, "epoch": 3.799199466310874, "step": 11390 }, { "epoch": 3.799199466310874, "ref_ce_loss": 0.11842446774244308, "step": 11390 }, { "epoch": 3.8025350233488995, "loss": 0.4789, "step": 11400 }, { "epoch": 3.8025350233488995, "grad_norm": 2.8649020195007324, "step": 11400 }, { "epoch": 3.8025350233488995, "learning_rate": 0.00016923077281051041, "step": 11400 }, { "epoch": 3.8025350233488995, "loss": 0.4338218569755554, "step": 11400 }, { "ce_loss": 0.14233727753162384, "epoch": 3.8025350233488995, "step": 11400 }, { "distill_loss": 0.1555687040090561, "epoch": 3.8025350233488995, "step": 11400 }, { "epoch": 3.8025350233488995, "ref_ce_loss": 0.0952368825674057, "step": 11400 }, { "epoch": 3.8025350233488995, "loss": 0.45588257908821106, "step": 11400 }, { "ce_loss": 0.0950573980808258, "epoch": 3.8025350233488995, "step": 11400 }, { "distill_loss": 0.1703718602657318, "epoch": 3.8025350233488995, "step": 11400 }, { "epoch": 3.8025350233488995, "ref_ce_loss": 0.10461972653865814, "step": 11400 }, { "epoch": 3.8025350233488995, "loss": 0.349023699760437, "step": 11400 }, { "ce_loss": 0.12215173244476318, "epoch": 3.8025350233488995, "step": 11400 }, { "distill_loss": 0.13151374459266663, "epoch": 3.8025350233488995, "step": 11400 }, { "epoch": 3.8025350233488995, "ref_ce_loss": 0.08300989866256714, "step": 11400 }, { "epoch": 3.8025350233488995, "loss": 0.5359344482421875, "step": 11400 }, { "ce_loss": 0.07137981802225113, "epoch": 3.8025350233488995, "step": 11400 }, { "distill_loss": 0.15614856779575348, "epoch": 3.8025350233488995, "step": 11400 }, { "epoch": 3.8025350233488995, "ref_ce_loss": 0.10713817179203033, "step": 11400 }, { "epoch": 3.805870580386925, "loss": 0.5378, "step": 11410 }, { "epoch": 3.805870580386925, "grad_norm": 1.8696403503417969, "step": 11410 }, { "epoch": 3.805870580386925, "learning_rate": 0.00016902986551628227, "step": 11410 }, { "epoch": 3.805870580386925, "loss": 0.7627152800559998, "step": 11410 }, { "ce_loss": 0.16251112520694733, "epoch": 3.805870580386925, "step": 11410 }, { "distill_loss": 0.18291950225830078, "epoch": 3.805870580386925, "step": 11410 }, { "epoch": 3.805870580386925, "ref_ce_loss": 0.06121499091386795, "step": 11410 }, { "epoch": 3.805870580386925, "loss": 0.41648682951927185, "step": 11410 }, { "ce_loss": 0.1059994250535965, "epoch": 3.805870580386925, "step": 11410 }, { "distill_loss": 0.14188294112682343, "epoch": 3.805870580386925, "step": 11410 }, { "epoch": 3.805870580386925, "ref_ce_loss": 0.10719886422157288, "step": 11410 }, { "epoch": 3.805870580386925, "loss": 0.6039667725563049, "step": 11410 }, { "ce_loss": 0.11324057728052139, "epoch": 3.805870580386925, "step": 11410 }, { "distill_loss": 0.15148040652275085, "epoch": 3.805870580386925, "step": 11410 }, { "epoch": 3.805870580386925, "ref_ce_loss": 0.1474461853504181, "step": 11410 }, { "epoch": 3.805870580386925, "loss": 0.5163719654083252, "step": 11410 }, { "ce_loss": 0.15379080176353455, "epoch": 3.805870580386925, "step": 11410 }, { "distill_loss": 0.17272308468818665, "epoch": 3.805870580386925, "step": 11410 }, { "epoch": 3.805870580386925, "ref_ce_loss": 0.1529800295829773, "step": 11410 }, { "epoch": 3.80920613742495, "loss": 0.5488, "step": 11420 }, { "epoch": 3.80920613742495, "grad_norm": 4.255395889282227, "step": 11420 }, { "epoch": 3.80920613742495, "learning_rate": 0.00016882892351908606, "step": 11420 }, { "epoch": 3.80920613742495, "loss": 0.7047617435455322, "step": 11420 }, { "ce_loss": 0.08719105273485184, "epoch": 3.80920613742495, "step": 11420 }, { "distill_loss": 0.14586789906024933, "epoch": 3.80920613742495, "step": 11420 }, { "epoch": 3.80920613742495, "ref_ce_loss": 0.06734959781169891, "step": 11420 }, { "epoch": 3.80920613742495, "loss": 0.3226097524166107, "step": 11420 }, { "ce_loss": 0.10788920521736145, "epoch": 3.80920613742495, "step": 11420 }, { "distill_loss": 0.10606271028518677, "epoch": 3.80920613742495, "step": 11420 }, { "epoch": 3.80920613742495, "ref_ce_loss": 0.10857447236776352, "step": 11420 }, { "epoch": 3.80920613742495, "loss": 0.506752073764801, "step": 11420 }, { "ce_loss": 0.20389452576637268, "epoch": 3.80920613742495, "step": 11420 }, { "distill_loss": 0.1739177256822586, "epoch": 3.80920613742495, "step": 11420 }, { "epoch": 3.80920613742495, "ref_ce_loss": 0.08177173137664795, "step": 11420 }, { "epoch": 3.80920613742495, "loss": 0.4029782712459564, "step": 11420 }, { "ce_loss": 0.10493995994329453, "epoch": 3.80920613742495, "step": 11420 }, { "distill_loss": 0.1328524351119995, "epoch": 3.80920613742495, "step": 11420 }, { "epoch": 3.80920613742495, "ref_ce_loss": 0.11656699329614639, "step": 11420 }, { "epoch": 3.8125416944629755, "loss": 0.5567, "step": 11430 }, { "epoch": 3.8125416944629755, "grad_norm": 1.993993878364563, "step": 11430 }, { "epoch": 3.8125416944629755, "learning_rate": 0.0001686279471853608, "step": 11430 }, { "epoch": 3.8125416944629755, "loss": 0.955905556678772, "step": 11430 }, { "ce_loss": 0.1232018992304802, "epoch": 3.8125416944629755, "step": 11430 }, { "distill_loss": 0.20480038225650787, "epoch": 3.8125416944629755, "step": 11430 }, { "epoch": 3.8125416944629755, "ref_ce_loss": 0.10530410706996918, "step": 11430 }, { "epoch": 3.8125416944629755, "loss": 0.543412446975708, "step": 11430 }, { "ce_loss": 0.18799374997615814, "epoch": 3.8125416944629755, "step": 11430 }, { "distill_loss": 0.1597263514995575, "epoch": 3.8125416944629755, "step": 11430 }, { "epoch": 3.8125416944629755, "ref_ce_loss": 0.11109674721956253, "step": 11430 }, { "epoch": 3.8125416944629755, "loss": 0.33423781394958496, "step": 11430 }, { "ce_loss": 0.10933635383844376, "epoch": 3.8125416944629755, "step": 11430 }, { "distill_loss": 0.13336357474327087, "epoch": 3.8125416944629755, "step": 11430 }, { "epoch": 3.8125416944629755, "ref_ce_loss": 0.0713241845369339, "step": 11430 }, { "epoch": 3.8125416944629755, "loss": 0.45266109704971313, "step": 11430 }, { "ce_loss": 0.14071442186832428, "epoch": 3.8125416944629755, "step": 11430 }, { "distill_loss": 0.1235266625881195, "epoch": 3.8125416944629755, "step": 11430 }, { "epoch": 3.8125416944629755, "ref_ce_loss": 0.10337050259113312, "step": 11430 }, { "epoch": 3.815877251501001, "loss": 0.4643, "step": 11440 }, { "epoch": 3.815877251501001, "grad_norm": 2.24055552482605, "step": 11440 }, { "epoch": 3.815877251501001, "learning_rate": 0.00016842693688160794, "step": 11440 }, { "epoch": 3.815877251501001, "loss": 0.5264979004859924, "step": 11440 }, { "ce_loss": 0.17195414006710052, "epoch": 3.815877251501001, "step": 11440 }, { "distill_loss": 0.1488429754972458, "epoch": 3.815877251501001, "step": 11440 }, { "epoch": 3.815877251501001, "ref_ce_loss": 0.14474719762802124, "step": 11440 }, { "epoch": 3.815877251501001, "loss": 0.4335517883300781, "step": 11440 }, { "ce_loss": 0.13938097655773163, "epoch": 3.815877251501001, "step": 11440 }, { "distill_loss": 0.14782142639160156, "epoch": 3.815877251501001, "step": 11440 }, { "epoch": 3.815877251501001, "ref_ce_loss": 0.06512699276208878, "step": 11440 }, { "epoch": 3.815877251501001, "loss": 0.4729892611503601, "step": 11440 }, { "ce_loss": 0.11497749388217926, "epoch": 3.815877251501001, "step": 11440 }, { "distill_loss": 0.1210591122508049, "epoch": 3.815877251501001, "step": 11440 }, { "epoch": 3.815877251501001, "ref_ce_loss": 0.1321076601743698, "step": 11440 }, { "epoch": 3.815877251501001, "loss": 0.8550109267234802, "step": 11440 }, { "ce_loss": 0.14266787469387054, "epoch": 3.815877251501001, "step": 11440 }, { "distill_loss": 0.16640591621398926, "epoch": 3.815877251501001, "step": 11440 }, { "epoch": 3.815877251501001, "ref_ce_loss": 0.1064901351928711, "step": 11440 }, { "epoch": 3.8192128085390262, "loss": 0.4977, "step": 11450 }, { "epoch": 3.8192128085390262, "grad_norm": 3.0378942489624023, "step": 11450 }, { "epoch": 3.8192128085390262, "learning_rate": 0.00016822589297439108, "step": 11450 }, { "epoch": 3.8192128085390262, "loss": 0.8309723138809204, "step": 11450 }, { "ce_loss": 0.18916484713554382, "epoch": 3.8192128085390262, "step": 11450 }, { "distill_loss": 0.17502309381961823, "epoch": 3.8192128085390262, "step": 11450 }, { "epoch": 3.8192128085390262, "ref_ce_loss": 0.09515126049518585, "step": 11450 }, { "epoch": 3.8192128085390262, "loss": 0.7633737325668335, "step": 11450 }, { "ce_loss": 0.3523317873477936, "epoch": 3.8192128085390262, "step": 11450 }, { "distill_loss": 0.235255628824234, "epoch": 3.8192128085390262, "step": 11450 }, { "epoch": 3.8192128085390262, "ref_ce_loss": 0.13086393475532532, "step": 11450 }, { "epoch": 3.8192128085390262, "loss": 0.5058243274688721, "step": 11450 }, { "ce_loss": 0.19745786488056183, "epoch": 3.8192128085390262, "step": 11450 }, { "distill_loss": 0.13970790803432465, "epoch": 3.8192128085390262, "step": 11450 }, { "epoch": 3.8192128085390262, "ref_ce_loss": 0.12864728271961212, "step": 11450 }, { "epoch": 3.8192128085390262, "loss": 0.9811885356903076, "step": 11450 }, { "ce_loss": 0.13506951928138733, "epoch": 3.8192128085390262, "step": 11450 }, { "distill_loss": 0.12431784719228745, "epoch": 3.8192128085390262, "step": 11450 }, { "epoch": 3.8192128085390262, "ref_ce_loss": 0.09122858941555023, "step": 11450 }, { "epoch": 3.8225483655770516, "loss": 0.5499, "step": 11460 }, { "epoch": 3.8225483655770516, "grad_norm": 3.075512170791626, "step": 11460 }, { "epoch": 3.8225483655770516, "learning_rate": 0.00016802481583033495, "step": 11460 }, { "epoch": 3.8225483655770516, "loss": 0.40927478671073914, "step": 11460 }, { "ce_loss": 0.12293460220098495, "epoch": 3.8225483655770516, "step": 11460 }, { "distill_loss": 0.16415520012378693, "epoch": 3.8225483655770516, "step": 11460 }, { "epoch": 3.8225483655770516, "ref_ce_loss": 0.08292596787214279, "step": 11460 }, { "epoch": 3.8225483655770516, "loss": 0.30401870608329773, "step": 11460 }, { "ce_loss": 0.0601293109357357, "epoch": 3.8225483655770516, "step": 11460 }, { "distill_loss": 0.11315567791461945, "epoch": 3.8225483655770516, "step": 11460 }, { "epoch": 3.8225483655770516, "ref_ce_loss": 0.13054944574832916, "step": 11460 }, { "epoch": 3.8225483655770516, "loss": 1.0063207149505615, "step": 11460 }, { "ce_loss": 0.1392781138420105, "epoch": 3.8225483655770516, "step": 11460 }, { "distill_loss": 0.12168440222740173, "epoch": 3.8225483655770516, "step": 11460 }, { "epoch": 3.8225483655770516, "ref_ce_loss": 0.09981101006269455, "step": 11460 }, { "epoch": 3.8225483655770516, "loss": 0.37048590183258057, "step": 11460 }, { "ce_loss": 0.09129716455936432, "epoch": 3.8225483655770516, "step": 11460 }, { "distill_loss": 0.1344550997018814, "epoch": 3.8225483655770516, "step": 11460 }, { "epoch": 3.8225483655770516, "ref_ce_loss": 0.07073692977428436, "step": 11460 }, { "epoch": 3.825883922615077, "loss": 0.5073, "step": 11470 }, { "epoch": 3.825883922615077, "grad_norm": 3.2191481590270996, "step": 11470 }, { "epoch": 3.825883922615077, "learning_rate": 0.000167823705816125, "step": 11470 }, { "epoch": 3.825883922615077, "loss": 0.48221296072006226, "step": 11470 }, { "ce_loss": 0.1248675063252449, "epoch": 3.825883922615077, "step": 11470 }, { "distill_loss": 0.18250107765197754, "epoch": 3.825883922615077, "step": 11470 }, { "epoch": 3.825883922615077, "ref_ce_loss": 0.13513994216918945, "step": 11470 }, { "epoch": 3.825883922615077, "loss": 0.493794322013855, "step": 11470 }, { "ce_loss": 0.12715068459510803, "epoch": 3.825883922615077, "step": 11470 }, { "distill_loss": 0.15496912598609924, "epoch": 3.825883922615077, "step": 11470 }, { "epoch": 3.825883922615077, "ref_ce_loss": 0.1318623125553131, "step": 11470 }, { "epoch": 3.825883922615077, "loss": 0.3657727837562561, "step": 11470 }, { "ce_loss": 0.13231192529201508, "epoch": 3.825883922615077, "step": 11470 }, { "distill_loss": 0.16635209321975708, "epoch": 3.825883922615077, "step": 11470 }, { "epoch": 3.825883922615077, "ref_ce_loss": 0.046196348965168, "step": 11470 }, { "epoch": 3.825883922615077, "loss": 0.6422207951545715, "step": 11470 }, { "ce_loss": 0.14488328993320465, "epoch": 3.825883922615077, "step": 11470 }, { "distill_loss": 0.14699134230613708, "epoch": 3.825883922615077, "step": 11470 }, { "epoch": 3.825883922615077, "ref_ce_loss": 0.1012740209698677, "step": 11470 }, { "epoch": 3.8292194796531023, "loss": 0.4673, "step": 11480 }, { "epoch": 3.8292194796531023, "grad_norm": 2.7229583263397217, "step": 11480 }, { "epoch": 3.8292194796531023, "learning_rate": 0.0001676225632985065, "step": 11480 }, { "epoch": 3.8292194796531023, "loss": 0.9861547946929932, "step": 11480 }, { "ce_loss": 0.07622343301773071, "epoch": 3.8292194796531023, "step": 11480 }, { "distill_loss": 0.12430855631828308, "epoch": 3.8292194796531023, "step": 11480 }, { "epoch": 3.8292194796531023, "ref_ce_loss": 0.1209321841597557, "step": 11480 }, { "epoch": 3.8292194796531023, "loss": 0.4856456220149994, "step": 11480 }, { "ce_loss": 0.1425744742155075, "epoch": 3.8292194796531023, "step": 11480 }, { "distill_loss": 0.2019084393978119, "epoch": 3.8292194796531023, "step": 11480 }, { "epoch": 3.8292194796531023, "ref_ce_loss": 0.09371843934059143, "step": 11480 }, { "epoch": 3.8292194796531023, "loss": 0.6687341928482056, "step": 11480 }, { "ce_loss": 0.2130454182624817, "epoch": 3.8292194796531023, "step": 11480 }, { "distill_loss": 0.21954701840877533, "epoch": 3.8292194796531023, "step": 11480 }, { "epoch": 3.8292194796531023, "ref_ce_loss": 0.10809019207954407, "step": 11480 }, { "epoch": 3.8292194796531023, "loss": 0.3015283942222595, "step": 11480 }, { "ce_loss": 0.08696311712265015, "epoch": 3.8292194796531023, "step": 11480 }, { "distill_loss": 0.11111010611057281, "epoch": 3.8292194796531023, "step": 11480 }, { "epoch": 3.8292194796531023, "ref_ce_loss": 0.0717904344201088, "step": 11480 }, { "epoch": 3.8325550366911276, "loss": 0.5021, "step": 11490 }, { "epoch": 3.8325550366911276, "grad_norm": 2.285511016845703, "step": 11490 }, { "epoch": 3.8325550366911276, "learning_rate": 0.00016742138864428403, "step": 11490 }, { "epoch": 3.8325550366911276, "loss": 0.36162352561950684, "step": 11490 }, { "ce_loss": 0.09861333668231964, "epoch": 3.8325550366911276, "step": 11490 }, { "distill_loss": 0.12867295742034912, "epoch": 3.8325550366911276, "step": 11490 }, { "epoch": 3.8325550366911276, "ref_ce_loss": 0.1342122107744217, "step": 11490 }, { "epoch": 3.8325550366911276, "loss": 0.5594727993011475, "step": 11490 }, { "ce_loss": 0.1684267222881317, "epoch": 3.8325550366911276, "step": 11490 }, { "distill_loss": 0.16716884076595306, "epoch": 3.8325550366911276, "step": 11490 }, { "epoch": 3.8325550366911276, "ref_ce_loss": 0.09705770015716553, "step": 11490 }, { "epoch": 3.8325550366911276, "loss": 0.48375794291496277, "step": 11490 }, { "ce_loss": 0.19998034834861755, "epoch": 3.8325550366911276, "step": 11490 }, { "distill_loss": 0.13959825038909912, "epoch": 3.8325550366911276, "step": 11490 }, { "epoch": 3.8325550366911276, "ref_ce_loss": 0.14406739175319672, "step": 11490 }, { "epoch": 3.8325550366911276, "loss": 0.46701711416244507, "step": 11490 }, { "ce_loss": 0.13540586829185486, "epoch": 3.8325550366911276, "step": 11490 }, { "distill_loss": 0.15161505341529846, "epoch": 3.8325550366911276, "step": 11490 }, { "epoch": 3.8325550366911276, "ref_ce_loss": 0.12113173305988312, "step": 11490 }, { "epoch": 3.835890593729153, "loss": 0.4589, "step": 11500 }, { "epoch": 3.835890593729153, "grad_norm": 2.968534469604492, "step": 11500 }, { "epoch": 3.835890593729153, "learning_rate": 0.00016722018222032085, "step": 11500 }, { "epoch": 3.835890593729153, "loss": 0.48139268159866333, "step": 11500 }, { "ce_loss": 0.1482585072517395, "epoch": 3.835890593729153, "step": 11500 }, { "distill_loss": 0.16626609861850739, "epoch": 3.835890593729153, "step": 11500 }, { "epoch": 3.835890593729153, "ref_ce_loss": 0.11868167668581009, "step": 11500 }, { "epoch": 3.835890593729153, "loss": 0.46502596139907837, "step": 11500 }, { "ce_loss": 0.16445650160312653, "epoch": 3.835890593729153, "step": 11500 }, { "distill_loss": 0.16992321610450745, "epoch": 3.835890593729153, "step": 11500 }, { "epoch": 3.835890593729153, "ref_ce_loss": 0.10180897265672684, "step": 11500 }, { "epoch": 3.835890593729153, "loss": 0.6293812394142151, "step": 11500 }, { "ce_loss": 0.06558717787265778, "epoch": 3.835890593729153, "step": 11500 }, { "distill_loss": 0.13015815615653992, "epoch": 3.835890593729153, "step": 11500 }, { "epoch": 3.835890593729153, "ref_ce_loss": 0.081215038895607, "step": 11500 }, { "epoch": 3.835890593729153, "loss": 0.9193124175071716, "step": 11500 }, { "ce_loss": 0.17590521275997162, "epoch": 3.835890593729153, "step": 11500 }, { "distill_loss": 0.21001935005187988, "epoch": 3.835890593729153, "step": 11500 }, { "epoch": 3.835890593729153, "ref_ce_loss": 0.1322702318429947, "step": 11500 }, { "epoch": 3.8392261507671783, "loss": 0.5016, "step": 11510 }, { "epoch": 3.8392261507671783, "grad_norm": 3.171128034591675, "step": 11510 }, { "epoch": 3.8392261507671783, "learning_rate": 0.00016701894439353818, "step": 11510 }, { "epoch": 3.8392261507671783, "loss": 0.40584662556648254, "step": 11510 }, { "ce_loss": 0.11897286772727966, "epoch": 3.8392261507671783, "step": 11510 }, { "distill_loss": 0.15135160088539124, "epoch": 3.8392261507671783, "step": 11510 }, { "epoch": 3.8392261507671783, "ref_ce_loss": 0.1354077309370041, "step": 11510 }, { "epoch": 3.8392261507671783, "loss": 0.44731903076171875, "step": 11510 }, { "ce_loss": 0.1088339239358902, "epoch": 3.8392261507671783, "step": 11510 }, { "distill_loss": 0.1577133685350418, "epoch": 3.8392261507671783, "step": 11510 }, { "epoch": 3.8392261507671783, "ref_ce_loss": 0.13546203076839447, "step": 11510 }, { "epoch": 3.8392261507671783, "loss": 0.9254103899002075, "step": 11510 }, { "ce_loss": 0.18921251595020294, "epoch": 3.8392261507671783, "step": 11510 }, { "distill_loss": 0.1279955953359604, "epoch": 3.8392261507671783, "step": 11510 }, { "epoch": 3.8392261507671783, "ref_ce_loss": 0.11867087334394455, "step": 11510 }, { "epoch": 3.8392261507671783, "loss": 0.41881296038627625, "step": 11510 }, { "ce_loss": 0.1359606236219406, "epoch": 3.8392261507671783, "step": 11510 }, { "distill_loss": 0.16384638845920563, "epoch": 3.8392261507671783, "step": 11510 }, { "epoch": 3.8392261507671783, "ref_ce_loss": 0.08438153564929962, "step": 11510 }, { "epoch": 3.8425617078052037, "loss": 0.5232, "step": 11520 }, { "epoch": 3.8425617078052037, "grad_norm": 3.616981267929077, "step": 11520 }, { "epoch": 3.8425617078052037, "learning_rate": 0.0001668176755309143, "step": 11520 }, { "epoch": 3.8425617078052037, "loss": 0.36008861660957336, "step": 11520 }, { "ce_loss": 0.09858591854572296, "epoch": 3.8425617078052037, "step": 11520 }, { "distill_loss": 0.13108551502227783, "epoch": 3.8425617078052037, "step": 11520 }, { "epoch": 3.8425617078052037, "ref_ce_loss": 0.10107474774122238, "step": 11520 }, { "epoch": 3.8425617078052037, "loss": 0.4817239046096802, "step": 11520 }, { "ce_loss": 0.03896396607160568, "epoch": 3.8425617078052037, "step": 11520 }, { "distill_loss": 0.10215901583433151, "epoch": 3.8425617078052037, "step": 11520 }, { "epoch": 3.8425617078052037, "ref_ce_loss": 0.07847031205892563, "step": 11520 }, { "epoch": 3.8425617078052037, "loss": 0.6684030294418335, "step": 11520 }, { "ce_loss": 0.1818249374628067, "epoch": 3.8425617078052037, "step": 11520 }, { "distill_loss": 0.14205506443977356, "epoch": 3.8425617078052037, "step": 11520 }, { "epoch": 3.8425617078052037, "ref_ce_loss": 0.11002738028764725, "step": 11520 }, { "epoch": 3.8425617078052037, "loss": 0.3725161552429199, "step": 11520 }, { "ce_loss": 0.12967868149280548, "epoch": 3.8425617078052037, "step": 11520 }, { "distill_loss": 0.14431628584861755, "epoch": 3.8425617078052037, "step": 11520 }, { "epoch": 3.8425617078052037, "ref_ce_loss": 0.0984206423163414, "step": 11520 }, { "epoch": 3.845897264843229, "loss": 0.4685, "step": 11530 }, { "epoch": 3.845897264843229, "grad_norm": 2.334750175476074, "step": 11530 }, { "epoch": 3.845897264843229, "learning_rate": 0.0001666163759994843, "step": 11530 }, { "epoch": 3.845897264843229, "loss": 0.6536337733268738, "step": 11530 }, { "ce_loss": 0.2566821873188019, "epoch": 3.845897264843229, "step": 11530 }, { "distill_loss": 0.22711533308029175, "epoch": 3.845897264843229, "step": 11530 }, { "epoch": 3.845897264843229, "ref_ce_loss": 0.13162018358707428, "step": 11530 }, { "epoch": 3.845897264843229, "loss": 0.5456714630126953, "step": 11530 }, { "ce_loss": 0.21901577711105347, "epoch": 3.845897264843229, "step": 11530 }, { "distill_loss": 0.15249355137348175, "epoch": 3.845897264843229, "step": 11530 }, { "epoch": 3.845897264843229, "ref_ce_loss": 0.13759253919124603, "step": 11530 }, { "epoch": 3.845897264843229, "loss": 0.36008813977241516, "step": 11530 }, { "ce_loss": 0.10647378861904144, "epoch": 3.845897264843229, "step": 11530 }, { "distill_loss": 0.09742897003889084, "epoch": 3.845897264843229, "step": 11530 }, { "epoch": 3.845897264843229, "ref_ce_loss": 0.10891912877559662, "step": 11530 }, { "epoch": 3.845897264843229, "loss": 0.4108813405036926, "step": 11530 }, { "ce_loss": 0.15171436965465546, "epoch": 3.845897264843229, "step": 11530 }, { "distill_loss": 0.12007388472557068, "epoch": 3.845897264843229, "step": 11530 }, { "epoch": 3.845897264843229, "ref_ce_loss": 0.10404207557439804, "step": 11530 }, { "epoch": 3.8492328218812544, "loss": 0.4934, "step": 11540 }, { "epoch": 3.8492328218812544, "grad_norm": 8.927061080932617, "step": 11540 }, { "epoch": 3.8492328218812544, "learning_rate": 0.0001664150461663391, "step": 11540 }, { "epoch": 3.8492328218812544, "loss": 0.371764600276947, "step": 11540 }, { "ce_loss": 0.10404995083808899, "epoch": 3.8492328218812544, "step": 11540 }, { "distill_loss": 0.1396816223859787, "epoch": 3.8492328218812544, "step": 11540 }, { "epoch": 3.8492328218812544, "ref_ce_loss": 0.09026072919368744, "step": 11540 }, { "epoch": 3.8492328218812544, "loss": 0.470393568277359, "step": 11540 }, { "ce_loss": 0.11666182428598404, "epoch": 3.8492328218812544, "step": 11540 }, { "distill_loss": 0.15077465772628784, "epoch": 3.8492328218812544, "step": 11540 }, { "epoch": 3.8492328218812544, "ref_ce_loss": 0.10939660668373108, "step": 11540 }, { "epoch": 3.8492328218812544, "loss": 0.6475262641906738, "step": 11540 }, { "ce_loss": 0.2445875108242035, "epoch": 3.8492328218812544, "step": 11540 }, { "distill_loss": 0.16829434037208557, "epoch": 3.8492328218812544, "step": 11540 }, { "epoch": 3.8492328218812544, "ref_ce_loss": 0.11170588433742523, "step": 11540 }, { "epoch": 3.8492328218812544, "loss": 0.2612134516239166, "step": 11540 }, { "ce_loss": 0.0813259482383728, "epoch": 3.8492328218812544, "step": 11540 }, { "distill_loss": 0.11056999862194061, "epoch": 3.8492328218812544, "step": 11540 }, { "epoch": 3.8492328218812544, "ref_ce_loss": 0.06922511756420135, "step": 11540 }, { "epoch": 3.8525683789192797, "loss": 0.4508, "step": 11550 }, { "epoch": 3.8525683789192797, "grad_norm": 5.26814079284668, "step": 11550 }, { "epoch": 3.8525683789192797, "learning_rate": 0.00016621368639862488, "step": 11550 }, { "epoch": 3.8525683789192797, "loss": 0.5349399447441101, "step": 11550 }, { "ce_loss": 0.20689332485198975, "epoch": 3.8525683789192797, "step": 11550 }, { "distill_loss": 0.14780738949775696, "epoch": 3.8525683789192797, "step": 11550 }, { "epoch": 3.8525683789192797, "ref_ce_loss": 0.12087158858776093, "step": 11550 }, { "epoch": 3.8525683789192797, "loss": 0.3580425977706909, "step": 11550 }, { "ce_loss": 0.07902470231056213, "epoch": 3.8525683789192797, "step": 11550 }, { "distill_loss": 0.09948811680078506, "epoch": 3.8525683789192797, "step": 11550 }, { "epoch": 3.8525683789192797, "ref_ce_loss": 0.06858757883310318, "step": 11550 }, { "epoch": 3.8525683789192797, "loss": 0.31841012835502625, "step": 11550 }, { "ce_loss": 0.09229130297899246, "epoch": 3.8525683789192797, "step": 11550 }, { "distill_loss": 0.15116482973098755, "epoch": 3.8525683789192797, "step": 11550 }, { "epoch": 3.8525683789192797, "ref_ce_loss": 0.07489918172359467, "step": 11550 }, { "epoch": 3.8525683789192797, "loss": 0.351014643907547, "step": 11550 }, { "ce_loss": 0.12908421456813812, "epoch": 3.8525683789192797, "step": 11550 }, { "distill_loss": 0.1351543664932251, "epoch": 3.8525683789192797, "step": 11550 }, { "epoch": 3.8525683789192797, "ref_ce_loss": 0.0867190733551979, "step": 11550 }, { "epoch": 3.855903935957305, "loss": 0.4975, "step": 11560 }, { "epoch": 3.855903935957305, "grad_norm": 2.962989330291748, "step": 11560 }, { "epoch": 3.855903935957305, "learning_rate": 0.0001660122970635425, "step": 11560 }, { "epoch": 3.855903935957305, "loss": 0.5496320128440857, "step": 11560 }, { "ce_loss": 0.23305082321166992, "epoch": 3.855903935957305, "step": 11560 }, { "distill_loss": 0.182064026594162, "epoch": 3.855903935957305, "step": 11560 }, { "epoch": 3.855903935957305, "ref_ce_loss": 0.10594523698091507, "step": 11560 }, { "epoch": 3.855903935957305, "loss": 0.5559959411621094, "step": 11560 }, { "ce_loss": 0.14517226815223694, "epoch": 3.855903935957305, "step": 11560 }, { "distill_loss": 0.13715030252933502, "epoch": 3.855903935957305, "step": 11560 }, { "epoch": 3.855903935957305, "ref_ce_loss": 0.10893628746271133, "step": 11560 }, { "epoch": 3.855903935957305, "loss": 0.5879688262939453, "step": 11560 }, { "ce_loss": 0.11704345792531967, "epoch": 3.855903935957305, "step": 11560 }, { "distill_loss": 0.12446143478155136, "epoch": 3.855903935957305, "step": 11560 }, { "epoch": 3.855903935957305, "ref_ce_loss": 0.1276710480451584, "step": 11560 }, { "epoch": 3.855903935957305, "loss": 0.747494101524353, "step": 11560 }, { "ce_loss": 0.1367560178041458, "epoch": 3.855903935957305, "step": 11560 }, { "distill_loss": 0.1948416382074356, "epoch": 3.855903935957305, "step": 11560 }, { "epoch": 3.855903935957305, "ref_ce_loss": 0.12615439295768738, "step": 11560 }, { "epoch": 3.8592394929953304, "loss": 0.4883, "step": 11570 }, { "epoch": 3.8592394929953304, "grad_norm": 2.0698843002319336, "step": 11570 }, { "epoch": 3.8592394929953304, "learning_rate": 0.00016581087852834657, "step": 11570 }, { "epoch": 3.8592394929953304, "loss": 0.44442111253738403, "step": 11570 }, { "ce_loss": 0.08238573372364044, "epoch": 3.8592394929953304, "step": 11570 }, { "distill_loss": 0.14392045140266418, "epoch": 3.8592394929953304, "step": 11570 }, { "epoch": 3.8592394929953304, "ref_ce_loss": 0.08191262930631638, "step": 11570 }, { "epoch": 3.8592394929953304, "loss": 0.490500271320343, "step": 11570 }, { "ce_loss": 0.15933695435523987, "epoch": 3.8592394929953304, "step": 11570 }, { "distill_loss": 0.13870687782764435, "epoch": 3.8592394929953304, "step": 11570 }, { "epoch": 3.8592394929953304, "ref_ce_loss": 0.0866045281291008, "step": 11570 }, { "epoch": 3.8592394929953304, "loss": 0.5080830454826355, "step": 11570 }, { "ce_loss": 0.20873354375362396, "epoch": 3.8592394929953304, "step": 11570 }, { "distill_loss": 0.1734309196472168, "epoch": 3.8592394929953304, "step": 11570 }, { "epoch": 3.8592394929953304, "ref_ce_loss": 0.09393316507339478, "step": 11570 }, { "epoch": 3.8592394929953304, "loss": 0.5791904330253601, "step": 11570 }, { "ce_loss": 0.18548917770385742, "epoch": 3.8592394929953304, "step": 11570 }, { "distill_loss": 0.18491113185882568, "epoch": 3.8592394929953304, "step": 11570 }, { "epoch": 3.8592394929953304, "ref_ce_loss": 0.16175051033496857, "step": 11570 }, { "epoch": 3.8625750500333558, "loss": 0.5087, "step": 11580 }, { "epoch": 3.8625750500333558, "grad_norm": 3.098926067352295, "step": 11580 }, { "epoch": 3.8625750500333558, "learning_rate": 0.00016560943116034513, "step": 11580 }, { "epoch": 3.8625750500333558, "loss": 0.49878960847854614, "step": 11580 }, { "ce_loss": 0.1209196150302887, "epoch": 3.8625750500333558, "step": 11580 }, { "distill_loss": 0.15495194494724274, "epoch": 3.8625750500333558, "step": 11580 }, { "epoch": 3.8625750500333558, "ref_ce_loss": 0.08308865875005722, "step": 11580 }, { "epoch": 3.8625750500333558, "loss": 0.29101717472076416, "step": 11580 }, { "ce_loss": 0.08006220310926437, "epoch": 3.8625750500333558, "step": 11580 }, { "distill_loss": 0.11955109238624573, "epoch": 3.8625750500333558, "step": 11580 }, { "epoch": 3.8625750500333558, "ref_ce_loss": 0.09133538603782654, "step": 11580 }, { "epoch": 3.8625750500333558, "loss": 0.4208149015903473, "step": 11580 }, { "ce_loss": 0.04855787381529808, "epoch": 3.8625750500333558, "step": 11580 }, { "distill_loss": 0.1554453819990158, "epoch": 3.8625750500333558, "step": 11580 }, { "epoch": 3.8625750500333558, "ref_ce_loss": 0.07605446130037308, "step": 11580 }, { "epoch": 3.8625750500333558, "loss": 0.42378556728363037, "step": 11580 }, { "ce_loss": 0.13488538563251495, "epoch": 3.8625750500333558, "step": 11580 }, { "distill_loss": 0.15314099192619324, "epoch": 3.8625750500333558, "step": 11580 }, { "epoch": 3.8625750500333558, "ref_ce_loss": 0.09601394087076187, "step": 11580 }, { "epoch": 3.865910607071381, "loss": 0.4683, "step": 11590 }, { "epoch": 3.865910607071381, "grad_norm": 3.4676589965820312, "step": 11590 }, { "epoch": 3.865910607071381, "learning_rate": 0.00016540795532689863, "step": 11590 }, { "epoch": 3.865910607071381, "loss": 0.2266787439584732, "step": 11590 }, { "ce_loss": 0.06223537027835846, "epoch": 3.865910607071381, "step": 11590 }, { "distill_loss": 0.08924637734889984, "epoch": 3.865910607071381, "step": 11590 }, { "epoch": 3.865910607071381, "ref_ce_loss": 0.05673402175307274, "step": 11590 }, { "epoch": 3.865910607071381, "loss": 0.5110219120979309, "step": 11590 }, { "ce_loss": 0.1444280594587326, "epoch": 3.865910607071381, "step": 11590 }, { "distill_loss": 0.1732235997915268, "epoch": 3.865910607071381, "step": 11590 }, { "epoch": 3.865910607071381, "ref_ce_loss": 0.11704862117767334, "step": 11590 }, { "epoch": 3.865910607071381, "loss": 0.5369408130645752, "step": 11590 }, { "ce_loss": 0.15495778620243073, "epoch": 3.865910607071381, "step": 11590 }, { "distill_loss": 0.13318301737308502, "epoch": 3.865910607071381, "step": 11590 }, { "epoch": 3.865910607071381, "ref_ce_loss": 0.10845693200826645, "step": 11590 }, { "epoch": 3.865910607071381, "loss": 0.7540019750595093, "step": 11590 }, { "ce_loss": 0.2528756558895111, "epoch": 3.865910607071381, "step": 11590 }, { "distill_loss": 0.19086939096450806, "epoch": 3.865910607071381, "step": 11590 }, { "epoch": 3.865910607071381, "ref_ce_loss": 0.16340744495391846, "step": 11590 }, { "epoch": 3.8692461641094065, "loss": 0.5208, "step": 11600 }, { "epoch": 3.8692461641094065, "grad_norm": 2.3811683654785156, "step": 11600 }, { "epoch": 3.8692461641094065, "learning_rate": 0.00016520645139541951, "step": 11600 }, { "epoch": 3.8692461641094065, "loss": 0.5290732979774475, "step": 11600 }, { "ce_loss": 0.13007256388664246, "epoch": 3.8692461641094065, "step": 11600 }, { "distill_loss": 0.19593222439289093, "epoch": 3.8692461641094065, "step": 11600 }, { "epoch": 3.8692461641094065, "ref_ce_loss": 0.09087681025266647, "step": 11600 }, { "epoch": 3.8692461641094065, "loss": 0.43068817257881165, "step": 11600 }, { "ce_loss": 0.14082638919353485, "epoch": 3.8692461641094065, "step": 11600 }, { "distill_loss": 0.16190363466739655, "epoch": 3.8692461641094065, "step": 11600 }, { "epoch": 3.8692461641094065, "ref_ce_loss": 0.12784138321876526, "step": 11600 }, { "epoch": 3.8692461641094065, "loss": 0.4996127188205719, "step": 11600 }, { "ce_loss": 0.15893928706645966, "epoch": 3.8692461641094065, "step": 11600 }, { "distill_loss": 0.23627258837223053, "epoch": 3.8692461641094065, "step": 11600 }, { "epoch": 3.8692461641094065, "ref_ce_loss": 0.1043238639831543, "step": 11600 }, { "epoch": 3.8692461641094065, "loss": 0.44533899426460266, "step": 11600 }, { "ce_loss": 0.07461967319250107, "epoch": 3.8692461641094065, "step": 11600 }, { "distill_loss": 0.14704746007919312, "epoch": 3.8692461641094065, "step": 11600 }, { "epoch": 3.8692461641094065, "ref_ce_loss": 0.10774028301239014, "step": 11600 }, { "epoch": 3.872581721147432, "loss": 0.5007, "step": 11610 }, { "epoch": 3.872581721147432, "grad_norm": 3.090487003326416, "step": 11610 }, { "epoch": 3.872581721147432, "learning_rate": 0.00016500491973337158, "step": 11610 }, { "epoch": 3.872581721147432, "loss": 0.2862015664577484, "step": 11610 }, { "ce_loss": 0.07175882905721664, "epoch": 3.872581721147432, "step": 11610 }, { "distill_loss": 0.1443636417388916, "epoch": 3.872581721147432, "step": 11610 }, { "epoch": 3.872581721147432, "ref_ce_loss": 0.047852739691734314, "step": 11610 }, { "epoch": 3.872581721147432, "loss": 0.9817602634429932, "step": 11610 }, { "ce_loss": 0.11350759863853455, "epoch": 3.872581721147432, "step": 11610 }, { "distill_loss": 0.14939598739147186, "epoch": 3.872581721147432, "step": 11610 }, { "epoch": 3.872581721147432, "ref_ce_loss": 0.10215581208467484, "step": 11610 }, { "epoch": 3.872581721147432, "loss": 0.5146198272705078, "step": 11610 }, { "ce_loss": 0.17866657674312592, "epoch": 3.872581721147432, "step": 11610 }, { "distill_loss": 0.1787855625152588, "epoch": 3.872581721147432, "step": 11610 }, { "epoch": 3.872581721147432, "ref_ce_loss": 0.11341854184865952, "step": 11610 }, { "epoch": 3.872581721147432, "loss": 0.3538992702960968, "step": 11610 }, { "ce_loss": 0.12631307542324066, "epoch": 3.872581721147432, "step": 11610 }, { "distill_loss": 0.15863800048828125, "epoch": 3.872581721147432, "step": 11610 }, { "epoch": 3.872581721147432, "ref_ce_loss": 0.06889407336711884, "step": 11610 }, { "epoch": 3.875917278185457, "loss": 0.5357, "step": 11620 }, { "epoch": 3.875917278185457, "grad_norm": 2.8688204288482666, "step": 11620 }, { "epoch": 3.875917278185457, "learning_rate": 0.00016480336070826904, "step": 11620 }, { "epoch": 3.875917278185457, "loss": 0.3075914978981018, "step": 11620 }, { "ce_loss": 0.0304547268897295, "epoch": 3.875917278185457, "step": 11620 }, { "distill_loss": 0.1418517827987671, "epoch": 3.875917278185457, "step": 11620 }, { "epoch": 3.875917278185457, "ref_ce_loss": 0.06853881478309631, "step": 11620 }, { "epoch": 3.875917278185457, "loss": 0.264699250459671, "step": 11620 }, { "ce_loss": 0.03863706439733505, "epoch": 3.875917278185457, "step": 11620 }, { "distill_loss": 0.1346575915813446, "epoch": 3.875917278185457, "step": 11620 }, { "epoch": 3.875917278185457, "ref_ce_loss": 0.06331543624401093, "step": 11620 }, { "epoch": 3.875917278185457, "loss": 0.35955944657325745, "step": 11620 }, { "ce_loss": 0.0983886793255806, "epoch": 3.875917278185457, "step": 11620 }, { "distill_loss": 0.13251645863056183, "epoch": 3.875917278185457, "step": 11620 }, { "epoch": 3.875917278185457, "ref_ce_loss": 0.053774092346429825, "step": 11620 }, { "epoch": 3.875917278185457, "loss": 0.3828924000263214, "step": 11620 }, { "ce_loss": 0.0748138353228569, "epoch": 3.875917278185457, "step": 11620 }, { "distill_loss": 0.155991330742836, "epoch": 3.875917278185457, "step": 11620 }, { "epoch": 3.875917278185457, "ref_ce_loss": 0.11473026126623154, "step": 11620 }, { "epoch": 3.8792528352234825, "loss": 0.5052, "step": 11630 }, { "epoch": 3.8792528352234825, "grad_norm": 2.6132733821868896, "step": 11630 }, { "epoch": 3.8792528352234825, "learning_rate": 0.00016460177468767588, "step": 11630 }, { "epoch": 3.8792528352234825, "loss": 0.7341196537017822, "step": 11630 }, { "ce_loss": 0.1839066743850708, "epoch": 3.8792528352234825, "step": 11630 }, { "distill_loss": 0.20775669813156128, "epoch": 3.8792528352234825, "step": 11630 }, { "epoch": 3.8792528352234825, "ref_ce_loss": 0.15465961396694183, "step": 11630 }, { "epoch": 3.8792528352234825, "loss": 0.6776058673858643, "step": 11630 }, { "ce_loss": 0.13776738941669464, "epoch": 3.8792528352234825, "step": 11630 }, { "distill_loss": 0.1728818416595459, "epoch": 3.8792528352234825, "step": 11630 }, { "epoch": 3.8792528352234825, "ref_ce_loss": 0.09529329836368561, "step": 11630 }, { "epoch": 3.8792528352234825, "loss": 0.44648653268814087, "step": 11630 }, { "ce_loss": 0.1300860196352005, "epoch": 3.8792528352234825, "step": 11630 }, { "distill_loss": 0.1513717770576477, "epoch": 3.8792528352234825, "step": 11630 }, { "epoch": 3.8792528352234825, "ref_ce_loss": 0.12261359393596649, "step": 11630 }, { "epoch": 3.8792528352234825, "loss": 0.6910187005996704, "step": 11630 }, { "ce_loss": 0.2088298350572586, "epoch": 3.8792528352234825, "step": 11630 }, { "distill_loss": 0.2162533700466156, "epoch": 3.8792528352234825, "step": 11630 }, { "epoch": 3.8792528352234825, "ref_ce_loss": 0.13651823997497559, "step": 11630 }, { "epoch": 3.882588392261508, "loss": 0.5367, "step": 11640 }, { "epoch": 3.882588392261508, "grad_norm": 3.5043728351593018, "step": 11640 }, { "epoch": 3.882588392261508, "learning_rate": 0.00016440016203920574, "step": 11640 }, { "epoch": 3.882588392261508, "loss": 0.4661126136779785, "step": 11640 }, { "ce_loss": 0.09211700409650803, "epoch": 3.882588392261508, "step": 11640 }, { "distill_loss": 0.19817857444286346, "epoch": 3.882588392261508, "step": 11640 }, { "epoch": 3.882588392261508, "ref_ce_loss": 0.08246038854122162, "step": 11640 }, { "epoch": 3.882588392261508, "loss": 0.47072139382362366, "step": 11640 }, { "ce_loss": 0.1250084787607193, "epoch": 3.882588392261508, "step": 11640 }, { "distill_loss": 0.25739413499832153, "epoch": 3.882588392261508, "step": 11640 }, { "epoch": 3.882588392261508, "ref_ce_loss": 0.08818263560533524, "step": 11640 }, { "epoch": 3.882588392261508, "loss": 0.4213848412036896, "step": 11640 }, { "ce_loss": 0.12245972454547882, "epoch": 3.882588392261508, "step": 11640 }, { "distill_loss": 0.18493112921714783, "epoch": 3.882588392261508, "step": 11640 }, { "epoch": 3.882588392261508, "ref_ce_loss": 0.11392603069543839, "step": 11640 }, { "epoch": 3.882588392261508, "loss": 0.8170498013496399, "step": 11640 }, { "ce_loss": 0.12693078815937042, "epoch": 3.882588392261508, "step": 11640 }, { "distill_loss": 0.21718522906303406, "epoch": 3.882588392261508, "step": 11640 }, { "epoch": 3.882588392261508, "ref_ce_loss": 0.09534420818090439, "step": 11640 }, { "epoch": 3.885923949299533, "loss": 0.5095, "step": 11650 }, { "epoch": 3.885923949299533, "grad_norm": 3.2084357738494873, "step": 11650 }, { "epoch": 3.885923949299533, "learning_rate": 0.00016419852313052043, "step": 11650 }, { "epoch": 3.885923949299533, "loss": 0.4158484637737274, "step": 11650 }, { "ce_loss": 0.12239982187747955, "epoch": 3.885923949299533, "step": 11650 }, { "distill_loss": 0.13823160529136658, "epoch": 3.885923949299533, "step": 11650 }, { "epoch": 3.885923949299533, "ref_ce_loss": 0.11193165183067322, "step": 11650 }, { "epoch": 3.885923949299533, "loss": 0.371078759431839, "step": 11650 }, { "ce_loss": 0.12052211910486221, "epoch": 3.885923949299533, "step": 11650 }, { "distill_loss": 0.14332422614097595, "epoch": 3.885923949299533, "step": 11650 }, { "epoch": 3.885923949299533, "ref_ce_loss": 0.0856185331940651, "step": 11650 }, { "epoch": 3.885923949299533, "loss": 0.4371638298034668, "step": 11650 }, { "ce_loss": 0.16073602437973022, "epoch": 3.885923949299533, "step": 11650 }, { "distill_loss": 0.1462903916835785, "epoch": 3.885923949299533, "step": 11650 }, { "epoch": 3.885923949299533, "ref_ce_loss": 0.09076157212257385, "step": 11650 }, { "epoch": 3.885923949299533, "loss": 0.47583967447280884, "step": 11650 }, { "ce_loss": 0.13007958233356476, "epoch": 3.885923949299533, "step": 11650 }, { "distill_loss": 0.19589829444885254, "epoch": 3.885923949299533, "step": 11650 }, { "epoch": 3.885923949299533, "ref_ce_loss": 0.10973778367042542, "step": 11650 }, { "epoch": 3.8892595063375586, "loss": 0.4876, "step": 11660 }, { "epoch": 3.8892595063375586, "grad_norm": 2.6360435485839844, "step": 11660 }, { "epoch": 3.8892595063375586, "learning_rate": 0.00016399685832932975, "step": 11660 }, { "epoch": 3.8892595063375586, "loss": 0.504503071308136, "step": 11660 }, { "ce_loss": 0.11461180448532104, "epoch": 3.8892595063375586, "step": 11660 }, { "distill_loss": 0.18167349696159363, "epoch": 3.8892595063375586, "step": 11660 }, { "epoch": 3.8892595063375586, "ref_ce_loss": 0.0868772566318512, "step": 11660 }, { "epoch": 3.8892595063375586, "loss": 0.705942690372467, "step": 11660 }, { "ce_loss": 0.20406785607337952, "epoch": 3.8892595063375586, "step": 11660 }, { "distill_loss": 0.16648991405963898, "epoch": 3.8892595063375586, "step": 11660 }, { "epoch": 3.8892595063375586, "ref_ce_loss": 0.12941238284111023, "step": 11660 }, { "epoch": 3.8892595063375586, "loss": 0.31773361563682556, "step": 11660 }, { "ce_loss": 0.10134835541248322, "epoch": 3.8892595063375586, "step": 11660 }, { "distill_loss": 0.13180075585842133, "epoch": 3.8892595063375586, "step": 11660 }, { "epoch": 3.8892595063375586, "ref_ce_loss": 0.0696868896484375, "step": 11660 }, { "epoch": 3.8892595063375586, "loss": 0.33874279260635376, "step": 11660 }, { "ce_loss": 0.07513560354709625, "epoch": 3.8892595063375586, "step": 11660 }, { "distill_loss": 0.10537146776914597, "epoch": 3.8892595063375586, "step": 11660 }, { "epoch": 3.8892595063375586, "ref_ce_loss": 0.09582258760929108, "step": 11660 }, { "epoch": 3.892595063375584, "loss": 0.5061, "step": 11670 }, { "epoch": 3.892595063375584, "grad_norm": 2.3636677265167236, "step": 11670 }, { "epoch": 3.892595063375584, "learning_rate": 0.0001637951680033908, "step": 11670 }, { "epoch": 3.892595063375584, "loss": 0.35305699706077576, "step": 11670 }, { "ce_loss": 0.11162156611680984, "epoch": 3.892595063375584, "step": 11670 }, { "distill_loss": 0.11027735471725464, "epoch": 3.892595063375584, "step": 11670 }, { "epoch": 3.892595063375584, "ref_ce_loss": 0.0959223136305809, "step": 11670 }, { "epoch": 3.892595063375584, "loss": 0.4801453948020935, "step": 11670 }, { "ce_loss": 0.11221076548099518, "epoch": 3.892595063375584, "step": 11670 }, { "distill_loss": 0.16410060226917267, "epoch": 3.892595063375584, "step": 11670 }, { "epoch": 3.892595063375584, "ref_ce_loss": 0.08730052411556244, "step": 11670 }, { "epoch": 3.892595063375584, "loss": 0.4774989187717438, "step": 11670 }, { "ce_loss": 0.06793775409460068, "epoch": 3.892595063375584, "step": 11670 }, { "distill_loss": 0.1663445234298706, "epoch": 3.892595063375584, "step": 11670 }, { "epoch": 3.892595063375584, "ref_ce_loss": 0.16433344781398773, "step": 11670 }, { "epoch": 3.892595063375584, "loss": 0.5329448580741882, "step": 11670 }, { "ce_loss": 0.14226965606212616, "epoch": 3.892595063375584, "step": 11670 }, { "distill_loss": 0.15646116435527802, "epoch": 3.892595063375584, "step": 11670 }, { "epoch": 3.892595063375584, "ref_ce_loss": 0.08196160942316055, "step": 11670 }, { "epoch": 3.8959306204136093, "loss": 0.4525, "step": 11680 }, { "epoch": 3.8959306204136093, "grad_norm": 3.344223737716675, "step": 11680 }, { "epoch": 3.8959306204136093, "learning_rate": 0.0001635934525205072, "step": 11680 }, { "epoch": 3.8959306204136093, "loss": 0.378836065530777, "step": 11680 }, { "ce_loss": 0.09642636775970459, "epoch": 3.8959306204136093, "step": 11680 }, { "distill_loss": 0.14807581901550293, "epoch": 3.8959306204136093, "step": 11680 }, { "epoch": 3.8959306204136093, "ref_ce_loss": 0.07628420740365982, "step": 11680 }, { "epoch": 3.8959306204136093, "loss": 0.42805588245391846, "step": 11680 }, { "ce_loss": 0.06449296325445175, "epoch": 3.8959306204136093, "step": 11680 }, { "distill_loss": 0.1382300704717636, "epoch": 3.8959306204136093, "step": 11680 }, { "epoch": 3.8959306204136093, "ref_ce_loss": 0.08380676805973053, "step": 11680 }, { "epoch": 3.8959306204136093, "loss": 1.1077532768249512, "step": 11680 }, { "ce_loss": 0.2406988888978958, "epoch": 3.8959306204136093, "step": 11680 }, { "distill_loss": 0.21715620160102844, "epoch": 3.8959306204136093, "step": 11680 }, { "epoch": 3.8959306204136093, "ref_ce_loss": 0.08214037120342255, "step": 11680 }, { "epoch": 3.8959306204136093, "loss": 0.6784728169441223, "step": 11680 }, { "ce_loss": 0.2056596726179123, "epoch": 3.8959306204136093, "step": 11680 }, { "distill_loss": 0.18610471487045288, "epoch": 3.8959306204136093, "step": 11680 }, { "epoch": 3.8959306204136093, "ref_ce_loss": 0.11367989331483841, "step": 11680 }, { "epoch": 3.8992661774516346, "loss": 0.5363, "step": 11690 }, { "epoch": 3.8992661774516346, "grad_norm": 3.669268846511841, "step": 11690 }, { "epoch": 3.8992661774516346, "learning_rate": 0.00016339171224852834, "step": 11690 }, { "epoch": 3.8992661774516346, "loss": 0.5375806093215942, "step": 11690 }, { "ce_loss": 0.2153269350528717, "epoch": 3.8992661774516346, "step": 11690 }, { "distill_loss": 0.19027647376060486, "epoch": 3.8992661774516346, "step": 11690 }, { "epoch": 3.8992661774516346, "ref_ce_loss": 0.13186945021152496, "step": 11690 }, { "epoch": 3.8992661774516346, "loss": 0.43219420313835144, "step": 11690 }, { "ce_loss": 0.15259014070034027, "epoch": 3.8992661774516346, "step": 11690 }, { "distill_loss": 0.16870765388011932, "epoch": 3.8992661774516346, "step": 11690 }, { "epoch": 3.8992661774516346, "ref_ce_loss": 0.11080259084701538, "step": 11690 }, { "epoch": 3.8992661774516346, "loss": 0.4578809142112732, "step": 11690 }, { "ce_loss": 0.11323428899049759, "epoch": 3.8992661774516346, "step": 11690 }, { "distill_loss": 0.1490565836429596, "epoch": 3.8992661774516346, "step": 11690 }, { "epoch": 3.8992661774516346, "ref_ce_loss": 0.0648016631603241, "step": 11690 }, { "epoch": 3.8992661774516346, "loss": 0.6001131534576416, "step": 11690 }, { "ce_loss": 0.1150934249162674, "epoch": 3.8992661774516346, "step": 11690 }, { "distill_loss": 0.12322366237640381, "epoch": 3.8992661774516346, "step": 11690 }, { "epoch": 3.8992661774516346, "ref_ce_loss": 0.07318437844514847, "step": 11690 }, { "epoch": 3.90260173448966, "loss": 0.4775, "step": 11700 }, { "epoch": 3.90260173448966, "grad_norm": 2.9176535606384277, "step": 11700 }, { "epoch": 3.90260173448966, "learning_rate": 0.00016318994755534894, "step": 11700 }, { "epoch": 3.90260173448966, "loss": 0.21333375573158264, "step": 11700 }, { "ce_loss": 0.04972422122955322, "epoch": 3.90260173448966, "step": 11700 }, { "distill_loss": 0.09933877736330032, "epoch": 3.90260173448966, "step": 11700 }, { "epoch": 3.90260173448966, "ref_ce_loss": 0.06423191726207733, "step": 11700 }, { "epoch": 3.90260173448966, "loss": 0.4005766212940216, "step": 11700 }, { "ce_loss": 0.1483602076768875, "epoch": 3.90260173448966, "step": 11700 }, { "distill_loss": 0.14177684485912323, "epoch": 3.90260173448966, "step": 11700 }, { "epoch": 3.90260173448966, "ref_ce_loss": 0.08312007784843445, "step": 11700 }, { "epoch": 3.90260173448966, "loss": 0.4453301429748535, "step": 11700 }, { "ce_loss": 0.13142706453800201, "epoch": 3.90260173448966, "step": 11700 }, { "distill_loss": 0.19013100862503052, "epoch": 3.90260173448966, "step": 11700 }, { "epoch": 3.90260173448966, "ref_ce_loss": 0.0937122255563736, "step": 11700 }, { "epoch": 3.90260173448966, "loss": 0.4367537200450897, "step": 11700 }, { "ce_loss": 0.1704091727733612, "epoch": 3.90260173448966, "step": 11700 }, { "distill_loss": 0.14347653090953827, "epoch": 3.90260173448966, "step": 11700 }, { "epoch": 3.90260173448966, "ref_ce_loss": 0.07881513237953186, "step": 11700 }, { "epoch": 3.9059372915276853, "loss": 0.4818, "step": 11710 }, { "epoch": 3.9059372915276853, "grad_norm": 2.7853381633758545, "step": 11710 }, { "epoch": 3.9059372915276853, "learning_rate": 0.00016298815880890822, "step": 11710 }, { "epoch": 3.9059372915276853, "loss": 0.2837199568748474, "step": 11710 }, { "ce_loss": 0.06674331426620483, "epoch": 3.9059372915276853, "step": 11710 }, { "distill_loss": 0.14998966455459595, "epoch": 3.9059372915276853, "step": 11710 }, { "epoch": 3.9059372915276853, "ref_ce_loss": 0.06695345044136047, "step": 11710 }, { "epoch": 3.9059372915276853, "loss": 0.42292481660842896, "step": 11710 }, { "ce_loss": 0.1149093508720398, "epoch": 3.9059372915276853, "step": 11710 }, { "distill_loss": 0.14028172194957733, "epoch": 3.9059372915276853, "step": 11710 }, { "epoch": 3.9059372915276853, "ref_ce_loss": 0.11760975420475006, "step": 11710 }, { "epoch": 3.9059372915276853, "loss": 0.424089252948761, "step": 11710 }, { "ce_loss": 0.06741020083427429, "epoch": 3.9059372915276853, "step": 11710 }, { "distill_loss": 0.15840831398963928, "epoch": 3.9059372915276853, "step": 11710 }, { "epoch": 3.9059372915276853, "ref_ce_loss": 0.11006693542003632, "step": 11710 }, { "epoch": 3.9059372915276853, "loss": 0.3766666650772095, "step": 11710 }, { "ce_loss": 0.04787565395236015, "epoch": 3.9059372915276853, "step": 11710 }, { "distill_loss": 0.1054430678486824, "epoch": 3.9059372915276853, "step": 11710 }, { "epoch": 3.9059372915276853, "ref_ce_loss": 0.07436710596084595, "step": 11710 }, { "epoch": 3.9092728485657107, "loss": 0.4974, "step": 11720 }, { "epoch": 3.9092728485657107, "grad_norm": 2.3894925117492676, "step": 11720 }, { "epoch": 3.9092728485657107, "learning_rate": 0.00016278634637718922, "step": 11720 }, { "epoch": 3.9092728485657107, "loss": 0.41334080696105957, "step": 11720 }, { "ce_loss": 0.08776438236236572, "epoch": 3.9092728485657107, "step": 11720 }, { "distill_loss": 0.14124101400375366, "epoch": 3.9092728485657107, "step": 11720 }, { "epoch": 3.9092728485657107, "ref_ce_loss": 0.09219489991664886, "step": 11720 }, { "epoch": 3.9092728485657107, "loss": 0.42992448806762695, "step": 11720 }, { "ce_loss": 0.09458998590707779, "epoch": 3.9092728485657107, "step": 11720 }, { "distill_loss": 0.18413154780864716, "epoch": 3.9092728485657107, "step": 11720 }, { "epoch": 3.9092728485657107, "ref_ce_loss": 0.15011750161647797, "step": 11720 }, { "epoch": 3.9092728485657107, "loss": 0.3641749918460846, "step": 11720 }, { "ce_loss": 0.12528592348098755, "epoch": 3.9092728485657107, "step": 11720 }, { "distill_loss": 0.13670697808265686, "epoch": 3.9092728485657107, "step": 11720 }, { "epoch": 3.9092728485657107, "ref_ce_loss": 0.07176109403371811, "step": 11720 }, { "epoch": 3.9092728485657107, "loss": 0.36159613728523254, "step": 11720 }, { "ce_loss": 0.12030640989542007, "epoch": 3.9092728485657107, "step": 11720 }, { "distill_loss": 0.14727510511875153, "epoch": 3.9092728485657107, "step": 11720 }, { "epoch": 3.9092728485657107, "ref_ce_loss": 0.0937834084033966, "step": 11720 }, { "epoch": 3.912608405603736, "loss": 0.4675, "step": 11730 }, { "epoch": 3.912608405603736, "grad_norm": 3.769256830215454, "step": 11730 }, { "epoch": 3.912608405603736, "learning_rate": 0.00016258451062821827, "step": 11730 }, { "epoch": 3.912608405603736, "loss": 0.2411087602376938, "step": 11730 }, { "ce_loss": 0.0759025439620018, "epoch": 3.912608405603736, "step": 11730 }, { "distill_loss": 0.09947311878204346, "epoch": 3.912608405603736, "step": 11730 }, { "epoch": 3.912608405603736, "ref_ce_loss": 0.04836536571383476, "step": 11730 }, { "epoch": 3.912608405603736, "loss": 0.8758841753005981, "step": 11730 }, { "ce_loss": 0.1558084785938263, "epoch": 3.912608405603736, "step": 11730 }, { "distill_loss": 0.11926652491092682, "epoch": 3.912608405603736, "step": 11730 }, { "epoch": 3.912608405603736, "ref_ce_loss": 0.16222594678401947, "step": 11730 }, { "epoch": 3.912608405603736, "loss": 0.36194390058517456, "step": 11730 }, { "ce_loss": 0.11282184720039368, "epoch": 3.912608405603736, "step": 11730 }, { "distill_loss": 0.15627947449684143, "epoch": 3.912608405603736, "step": 11730 }, { "epoch": 3.912608405603736, "ref_ce_loss": 0.09275899082422256, "step": 11730 }, { "epoch": 3.912608405603736, "loss": 0.5852751731872559, "step": 11730 }, { "ce_loss": 0.10109452903270721, "epoch": 3.912608405603736, "step": 11730 }, { "distill_loss": 0.15885433554649353, "epoch": 3.912608405603736, "step": 11730 }, { "epoch": 3.912608405603736, "ref_ce_loss": 0.12536212801933289, "step": 11730 }, { "epoch": 3.9159439626417614, "loss": 0.4803, "step": 11740 }, { "epoch": 3.9159439626417614, "grad_norm": 3.014451265335083, "step": 11740 }, { "epoch": 3.9159439626417614, "learning_rate": 0.0001623826519300641, "step": 11740 }, { "epoch": 3.9159439626417614, "loss": 0.3135759234428406, "step": 11740 }, { "ce_loss": 0.09415557235479355, "epoch": 3.9159439626417614, "step": 11740 }, { "distill_loss": 0.11723636090755463, "epoch": 3.9159439626417614, "step": 11740 }, { "epoch": 3.9159439626417614, "ref_ce_loss": 0.06273960322141647, "step": 11740 }, { "epoch": 3.9159439626417614, "loss": 0.30374178290367126, "step": 11740 }, { "ce_loss": 0.10108528286218643, "epoch": 3.9159439626417614, "step": 11740 }, { "distill_loss": 0.11091496795415878, "epoch": 3.9159439626417614, "step": 11740 }, { "epoch": 3.9159439626417614, "ref_ce_loss": 0.0914100706577301, "step": 11740 }, { "epoch": 3.9159439626417614, "loss": 0.6572458744049072, "step": 11740 }, { "ce_loss": 0.1793697476387024, "epoch": 3.9159439626417614, "step": 11740 }, { "distill_loss": 0.14189916849136353, "epoch": 3.9159439626417614, "step": 11740 }, { "epoch": 3.9159439626417614, "ref_ce_loss": 0.09885284304618835, "step": 11740 }, { "epoch": 3.9159439626417614, "loss": 0.43537265062332153, "step": 11740 }, { "ce_loss": 0.1796526312828064, "epoch": 3.9159439626417614, "step": 11740 }, { "distill_loss": 0.13787336647510529, "epoch": 3.9159439626417614, "step": 11740 }, { "epoch": 3.9159439626417614, "ref_ce_loss": 0.08680669963359833, "step": 11740 }, { "epoch": 3.9192795196797867, "loss": 0.4401, "step": 11750 }, { "epoch": 3.9192795196797867, "grad_norm": 2.1687493324279785, "step": 11750 }, { "epoch": 3.9192795196797867, "learning_rate": 0.00016218077065083736, "step": 11750 }, { "epoch": 3.9192795196797867, "loss": 0.3428560793399811, "step": 11750 }, { "ce_loss": 0.052370764315128326, "epoch": 3.9192795196797867, "step": 11750 }, { "distill_loss": 0.0916663259267807, "epoch": 3.9192795196797867, "step": 11750 }, { "epoch": 3.9192795196797867, "ref_ce_loss": 0.09654126316308975, "step": 11750 }, { "epoch": 3.9192795196797867, "loss": 0.3783106505870819, "step": 11750 }, { "ce_loss": 0.09852546453475952, "epoch": 3.9192795196797867, "step": 11750 }, { "distill_loss": 0.11007370054721832, "epoch": 3.9192795196797867, "step": 11750 }, { "epoch": 3.9192795196797867, "ref_ce_loss": 0.10336705297231674, "step": 11750 }, { "epoch": 3.9192795196797867, "loss": 0.4215086102485657, "step": 11750 }, { "ce_loss": 0.11413310468196869, "epoch": 3.9192795196797867, "step": 11750 }, { "distill_loss": 0.1096954271197319, "epoch": 3.9192795196797867, "step": 11750 }, { "epoch": 3.9192795196797867, "ref_ce_loss": 0.13159877061843872, "step": 11750 }, { "epoch": 3.9192795196797867, "loss": 0.4675275981426239, "step": 11750 }, { "ce_loss": 0.16325342655181885, "epoch": 3.9192795196797867, "step": 11750 }, { "distill_loss": 0.14663267135620117, "epoch": 3.9192795196797867, "step": 11750 }, { "epoch": 3.9192795196797867, "ref_ce_loss": 0.15452298521995544, "step": 11750 }, { "epoch": 3.922615076717812, "loss": 0.4391, "step": 11760 }, { "epoch": 3.922615076717812, "grad_norm": 5.820791721343994, "step": 11760 }, { "epoch": 3.922615076717812, "learning_rate": 0.00016197886715868987, "step": 11760 }, { "epoch": 3.922615076717812, "loss": 0.3936474621295929, "step": 11760 }, { "ce_loss": 0.11980064958333969, "epoch": 3.922615076717812, "step": 11760 }, { "distill_loss": 0.09972325712442398, "epoch": 3.922615076717812, "step": 11760 }, { "epoch": 3.922615076717812, "ref_ce_loss": 0.09936317801475525, "step": 11760 }, { "epoch": 3.922615076717812, "loss": 0.42065149545669556, "step": 11760 }, { "ce_loss": 0.0961647778749466, "epoch": 3.922615076717812, "step": 11760 }, { "distill_loss": 0.11633703112602234, "epoch": 3.922615076717812, "step": 11760 }, { "epoch": 3.922615076717812, "ref_ce_loss": 0.09963429719209671, "step": 11760 }, { "epoch": 3.922615076717812, "loss": 0.4702761173248291, "step": 11760 }, { "ce_loss": 0.14327195286750793, "epoch": 3.922615076717812, "step": 11760 }, { "distill_loss": 0.13244634866714478, "epoch": 3.922615076717812, "step": 11760 }, { "epoch": 3.922615076717812, "ref_ce_loss": 0.11447618156671524, "step": 11760 }, { "epoch": 3.922615076717812, "loss": 0.4945146441459656, "step": 11760 }, { "ce_loss": 0.038211580365896225, "epoch": 3.922615076717812, "step": 11760 }, { "distill_loss": 0.07644416391849518, "epoch": 3.922615076717812, "step": 11760 }, { "epoch": 3.922615076717812, "ref_ce_loss": 0.07319790124893188, "step": 11760 }, { "epoch": 3.9259506337558374, "loss": 0.5131, "step": 11770 }, { "epoch": 3.9259506337558374, "grad_norm": 2.2660369873046875, "step": 11770 }, { "epoch": 3.9259506337558374, "learning_rate": 0.00016177694182181396, "step": 11770 }, { "epoch": 3.9259506337558374, "loss": 0.4388217031955719, "step": 11770 }, { "ce_loss": 0.08850383013486862, "epoch": 3.9259506337558374, "step": 11770 }, { "distill_loss": 0.18210861086845398, "epoch": 3.9259506337558374, "step": 11770 }, { "epoch": 3.9259506337558374, "ref_ce_loss": 0.1255296766757965, "step": 11770 }, { "epoch": 3.9259506337558374, "loss": 0.6527267694473267, "step": 11770 }, { "ce_loss": 0.1932949721813202, "epoch": 3.9259506337558374, "step": 11770 }, { "distill_loss": 0.12827904522418976, "epoch": 3.9259506337558374, "step": 11770 }, { "epoch": 3.9259506337558374, "ref_ce_loss": 0.12352578341960907, "step": 11770 }, { "epoch": 3.9259506337558374, "loss": 0.2734456956386566, "step": 11770 }, { "ce_loss": 0.035691630095243454, "epoch": 3.9259506337558374, "step": 11770 }, { "distill_loss": 0.08057143539190292, "epoch": 3.9259506337558374, "step": 11770 }, { "epoch": 3.9259506337558374, "ref_ce_loss": 0.0883350819349289, "step": 11770 }, { "epoch": 3.9259506337558374, "loss": 0.6890462040901184, "step": 11770 }, { "ce_loss": 0.2692700922489166, "epoch": 3.9259506337558374, "step": 11770 }, { "distill_loss": 0.17551743984222412, "epoch": 3.9259506337558374, "step": 11770 }, { "epoch": 3.9259506337558374, "ref_ce_loss": 0.1092531830072403, "step": 11770 }, { "epoch": 3.9292861907938628, "loss": 0.4957, "step": 11780 }, { "epoch": 3.9292861907938628, "grad_norm": 3.0138349533081055, "step": 11780 }, { "epoch": 3.9292861907938628, "learning_rate": 0.00016157499500844182, "step": 11780 }, { "epoch": 3.9292861907938628, "loss": 0.3256349563598633, "step": 11780 }, { "ce_loss": 0.08072438836097717, "epoch": 3.9292861907938628, "step": 11780 }, { "distill_loss": 0.12202650308609009, "epoch": 3.9292861907938628, "step": 11780 }, { "epoch": 3.9292861907938628, "ref_ce_loss": 0.12261930853128433, "step": 11780 }, { "epoch": 3.9292861907938628, "loss": 0.28612565994262695, "step": 11780 }, { "ce_loss": 0.08246175199747086, "epoch": 3.9292861907938628, "step": 11780 }, { "distill_loss": 0.11741185188293457, "epoch": 3.9292861907938628, "step": 11780 }, { "epoch": 3.9292861907938628, "ref_ce_loss": 0.08588752150535583, "step": 11780 }, { "epoch": 3.9292861907938628, "loss": 0.36774736642837524, "step": 11780 }, { "ce_loss": 0.11784758418798447, "epoch": 3.9292861907938628, "step": 11780 }, { "distill_loss": 0.13903403282165527, "epoch": 3.9292861907938628, "step": 11780 }, { "epoch": 3.9292861907938628, "ref_ce_loss": 0.11045124381780624, "step": 11780 }, { "epoch": 3.9292861907938628, "loss": 0.2915292978286743, "step": 11780 }, { "ce_loss": 0.07259783893823624, "epoch": 3.9292861907938628, "step": 11780 }, { "distill_loss": 0.09900534152984619, "epoch": 3.9292861907938628, "step": 11780 }, { "epoch": 3.9292861907938628, "ref_ce_loss": 0.05908788740634918, "step": 11780 }, { "epoch": 3.932621747831888, "loss": 0.4541, "step": 11790 }, { "epoch": 3.932621747831888, "grad_norm": 2.875215530395508, "step": 11790 }, { "epoch": 3.932621747831888, "learning_rate": 0.00016137302708684476, "step": 11790 }, { "epoch": 3.932621747831888, "loss": 0.35691261291503906, "step": 11790 }, { "ce_loss": 0.1520719975233078, "epoch": 3.932621747831888, "step": 11790 }, { "distill_loss": 0.11623502522706985, "epoch": 3.932621747831888, "step": 11790 }, { "epoch": 3.932621747831888, "ref_ce_loss": 0.08838573843240738, "step": 11790 }, { "epoch": 3.932621747831888, "loss": 0.3730715811252594, "step": 11790 }, { "ce_loss": 0.13004635274410248, "epoch": 3.932621747831888, "step": 11790 }, { "distill_loss": 0.10884232074022293, "epoch": 3.932621747831888, "step": 11790 }, { "epoch": 3.932621747831888, "ref_ce_loss": 0.100712850689888, "step": 11790 }, { "epoch": 3.932621747831888, "loss": 0.2646164000034332, "step": 11790 }, { "ce_loss": 0.10139144957065582, "epoch": 3.932621747831888, "step": 11790 }, { "distill_loss": 0.10436727106571198, "epoch": 3.932621747831888, "step": 11790 }, { "epoch": 3.932621747831888, "ref_ce_loss": 0.05834803357720375, "step": 11790 }, { "epoch": 3.932621747831888, "loss": 0.48992589116096497, "step": 11790 }, { "ce_loss": 0.1954546868801117, "epoch": 3.932621747831888, "step": 11790 }, { "distill_loss": 0.17859971523284912, "epoch": 3.932621747831888, "step": 11790 }, { "epoch": 3.932621747831888, "ref_ce_loss": 0.11544232815504074, "step": 11790 }, { "epoch": 3.9359573048699135, "loss": 0.5165, "step": 11800 }, { "epoch": 3.9359573048699135, "grad_norm": 4.251229286193848, "step": 11800 }, { "epoch": 3.9359573048699135, "learning_rate": 0.00016117103842533254, "step": 11800 }, { "epoch": 3.9359573048699135, "loss": 0.35116198658943176, "step": 11800 }, { "ce_loss": 0.11046728491783142, "epoch": 3.9359573048699135, "step": 11800 }, { "distill_loss": 0.1407284140586853, "epoch": 3.9359573048699135, "step": 11800 }, { "epoch": 3.9359573048699135, "ref_ce_loss": 0.09850616753101349, "step": 11800 }, { "epoch": 3.9359573048699135, "loss": 0.6646789908409119, "step": 11800 }, { "ce_loss": 0.20312164723873138, "epoch": 3.9359573048699135, "step": 11800 }, { "distill_loss": 0.20929650962352753, "epoch": 3.9359573048699135, "step": 11800 }, { "epoch": 3.9359573048699135, "ref_ce_loss": 0.12372786551713943, "step": 11800 }, { "epoch": 3.9359573048699135, "loss": 0.2903001010417938, "step": 11800 }, { "ce_loss": 0.05333415046334267, "epoch": 3.9359573048699135, "step": 11800 }, { "distill_loss": 0.12783510982990265, "epoch": 3.9359573048699135, "step": 11800 }, { "epoch": 3.9359573048699135, "ref_ce_loss": 0.060100555419921875, "step": 11800 }, { "epoch": 3.9359573048699135, "loss": 0.3517928123474121, "step": 11800 }, { "ce_loss": 0.07974269986152649, "epoch": 3.9359573048699135, "step": 11800 }, { "distill_loss": 0.1284874528646469, "epoch": 3.9359573048699135, "step": 11800 }, { "epoch": 3.9359573048699135, "ref_ce_loss": 0.06259086728096008, "step": 11800 }, { "epoch": 3.939292861907939, "loss": 0.4735, "step": 11810 }, { "epoch": 3.939292861907939, "grad_norm": 2.0716724395751953, "step": 11810 }, { "epoch": 3.939292861907939, "learning_rate": 0.00016096902939225283, "step": 11810 }, { "epoch": 3.939292861907939, "loss": 0.47878918051719666, "step": 11810 }, { "ce_loss": 0.20008765161037445, "epoch": 3.939292861907939, "step": 11810 }, { "distill_loss": 0.1378183811903, "epoch": 3.939292861907939, "step": 11810 }, { "epoch": 3.939292861907939, "ref_ce_loss": 0.12045397609472275, "step": 11810 }, { "epoch": 3.939292861907939, "loss": 0.6673761606216431, "step": 11810 }, { "ce_loss": 0.1397218108177185, "epoch": 3.939292861907939, "step": 11810 }, { "distill_loss": 0.14506368339061737, "epoch": 3.939292861907939, "step": 11810 }, { "epoch": 3.939292861907939, "ref_ce_loss": 0.13423040509223938, "step": 11810 }, { "epoch": 3.939292861907939, "loss": 0.5018212199211121, "step": 11810 }, { "ce_loss": 0.1881476491689682, "epoch": 3.939292861907939, "step": 11810 }, { "distill_loss": 0.1714666485786438, "epoch": 3.939292861907939, "step": 11810 }, { "epoch": 3.939292861907939, "ref_ce_loss": 0.09688962996006012, "step": 11810 }, { "epoch": 3.939292861907939, "loss": 0.4474361836910248, "step": 11810 }, { "ce_loss": 0.1412636637687683, "epoch": 3.939292861907939, "step": 11810 }, { "distill_loss": 0.17755410075187683, "epoch": 3.939292861907939, "step": 11810 }, { "epoch": 3.939292861907939, "ref_ce_loss": 0.09456686675548553, "step": 11810 }, { "epoch": 3.942628418945964, "loss": 0.4634, "step": 11820 }, { "epoch": 3.942628418945964, "grad_norm": 3.9476706981658936, "step": 11820 }, { "epoch": 3.942628418945964, "learning_rate": 0.00016076700035599052, "step": 11820 }, { "epoch": 3.942628418945964, "loss": 0.7122544050216675, "step": 11820 }, { "ce_loss": 0.2613614499568939, "epoch": 3.942628418945964, "step": 11820 }, { "distill_loss": 0.2061549425125122, "epoch": 3.942628418945964, "step": 11820 }, { "epoch": 3.942628418945964, "ref_ce_loss": 0.1448051482439041, "step": 11820 }, { "epoch": 3.942628418945964, "loss": 0.6049351096153259, "step": 11820 }, { "ce_loss": 0.22536680102348328, "epoch": 3.942628418945964, "step": 11820 }, { "distill_loss": 0.24145212769508362, "epoch": 3.942628418945964, "step": 11820 }, { "epoch": 3.942628418945964, "ref_ce_loss": 0.1379050463438034, "step": 11820 }, { "epoch": 3.942628418945964, "loss": 0.4970494508743286, "step": 11820 }, { "ce_loss": 0.1067628264427185, "epoch": 3.942628418945964, "step": 11820 }, { "distill_loss": 0.14002716541290283, "epoch": 3.942628418945964, "step": 11820 }, { "epoch": 3.942628418945964, "ref_ce_loss": 0.08016132563352585, "step": 11820 }, { "epoch": 3.942628418945964, "loss": 0.35276898741722107, "step": 11820 }, { "ce_loss": 0.12041664868593216, "epoch": 3.942628418945964, "step": 11820 }, { "distill_loss": 0.1190585047006607, "epoch": 3.942628418945964, "step": 11820 }, { "epoch": 3.942628418945964, "ref_ce_loss": 0.0713760107755661, "step": 11820 }, { "epoch": 3.9459639759839895, "loss": 0.4898, "step": 11830 }, { "epoch": 3.9459639759839895, "grad_norm": 3.2094149589538574, "step": 11830 }, { "epoch": 3.9459639759839895, "learning_rate": 0.0001605649516849667, "step": 11830 }, { "epoch": 3.9459639759839895, "loss": 0.4047658443450928, "step": 11830 }, { "ce_loss": 0.08828526735305786, "epoch": 3.9459639759839895, "step": 11830 }, { "distill_loss": 0.1679137498140335, "epoch": 3.9459639759839895, "step": 11830 }, { "epoch": 3.9459639759839895, "ref_ce_loss": 0.07592868059873581, "step": 11830 }, { "epoch": 3.9459639759839895, "loss": 0.38525718450546265, "step": 11830 }, { "ce_loss": 0.1573115736246109, "epoch": 3.9459639759839895, "step": 11830 }, { "distill_loss": 0.14555348455905914, "epoch": 3.9459639759839895, "step": 11830 }, { "epoch": 3.9459639759839895, "ref_ce_loss": 0.08227339386940002, "step": 11830 }, { "epoch": 3.9459639759839895, "loss": 0.6348785161972046, "step": 11830 }, { "ce_loss": 0.20729894936084747, "epoch": 3.9459639759839895, "step": 11830 }, { "distill_loss": 0.16912637650966644, "epoch": 3.9459639759839895, "step": 11830 }, { "epoch": 3.9459639759839895, "ref_ce_loss": 0.1757945418357849, "step": 11830 }, { "epoch": 3.9459639759839895, "loss": 0.6446617841720581, "step": 11830 }, { "ce_loss": 0.216706320643425, "epoch": 3.9459639759839895, "step": 11830 }, { "distill_loss": 0.17024075984954834, "epoch": 3.9459639759839895, "step": 11830 }, { "epoch": 3.9459639759839895, "ref_ce_loss": 0.1002252846956253, "step": 11830 }, { "epoch": 3.949299533022015, "loss": 0.5071, "step": 11840 }, { "epoch": 3.949299533022015, "grad_norm": 3.606684684753418, "step": 11840 }, { "epoch": 3.949299533022015, "learning_rate": 0.00016036288374763862, "step": 11840 }, { "epoch": 3.949299533022015, "loss": 0.4314247667789459, "step": 11840 }, { "ce_loss": 0.14194534718990326, "epoch": 3.949299533022015, "step": 11840 }, { "distill_loss": 0.1760086715221405, "epoch": 3.949299533022015, "step": 11840 }, { "epoch": 3.949299533022015, "ref_ce_loss": 0.11337415874004364, "step": 11840 }, { "epoch": 3.949299533022015, "loss": 0.5289191007614136, "step": 11840 }, { "ce_loss": 0.1298551708459854, "epoch": 3.949299533022015, "step": 11840 }, { "distill_loss": 0.16319157183170319, "epoch": 3.949299533022015, "step": 11840 }, { "epoch": 3.949299533022015, "ref_ce_loss": 0.10791688412427902, "step": 11840 }, { "epoch": 3.949299533022015, "loss": 0.5899083018302917, "step": 11840 }, { "ce_loss": 0.10379697382450104, "epoch": 3.949299533022015, "step": 11840 }, { "distill_loss": 0.1373264640569687, "epoch": 3.949299533022015, "step": 11840 }, { "epoch": 3.949299533022015, "ref_ce_loss": 0.09423581510782242, "step": 11840 }, { "epoch": 3.949299533022015, "loss": 0.4148091673851013, "step": 11840 }, { "ce_loss": 0.11106943339109421, "epoch": 3.949299533022015, "step": 11840 }, { "distill_loss": 0.17882095277309418, "epoch": 3.949299533022015, "step": 11840 }, { "epoch": 3.949299533022015, "ref_ce_loss": 0.12472303211688995, "step": 11840 }, { "epoch": 3.95263509006004, "loss": 0.5228, "step": 11850 }, { "epoch": 3.95263509006004, "grad_norm": 2.477299451828003, "step": 11850 }, { "epoch": 3.95263509006004, "learning_rate": 0.00016016079691249835, "step": 11850 }, { "epoch": 3.95263509006004, "loss": 0.30184489488601685, "step": 11850 }, { "ce_loss": 0.08025673031806946, "epoch": 3.95263509006004, "step": 11850 }, { "distill_loss": 0.13255445659160614, "epoch": 3.95263509006004, "step": 11850 }, { "epoch": 3.95263509006004, "ref_ce_loss": 0.05937567353248596, "step": 11850 }, { "epoch": 3.95263509006004, "loss": 0.9538842439651489, "step": 11850 }, { "ce_loss": 0.22645705938339233, "epoch": 3.95263509006004, "step": 11850 }, { "distill_loss": 0.29409679770469666, "epoch": 3.95263509006004, "step": 11850 }, { "epoch": 3.95263509006004, "ref_ce_loss": 0.15761788189411163, "step": 11850 }, { "epoch": 3.95263509006004, "loss": 0.5301204919815063, "step": 11850 }, { "ce_loss": 0.18444590270519257, "epoch": 3.95263509006004, "step": 11850 }, { "distill_loss": 0.179538756608963, "epoch": 3.95263509006004, "step": 11850 }, { "epoch": 3.95263509006004, "ref_ce_loss": 0.1306769698858261, "step": 11850 }, { "epoch": 3.95263509006004, "loss": 0.36008432507514954, "step": 11850 }, { "ce_loss": 0.09907856583595276, "epoch": 3.95263509006004, "step": 11850 }, { "distill_loss": 0.15818718075752258, "epoch": 3.95263509006004, "step": 11850 }, { "epoch": 3.95263509006004, "ref_ce_loss": 0.0713249072432518, "step": 11850 }, { "epoch": 3.9559706470980656, "loss": 0.4798, "step": 11860 }, { "epoch": 3.9559706470980656, "grad_norm": 2.704888105392456, "step": 11860 }, { "epoch": 3.9559706470980656, "learning_rate": 0.00015995869154807266, "step": 11860 }, { "epoch": 3.9559706470980656, "loss": 0.42570406198501587, "step": 11860 }, { "ce_loss": 0.13900217413902283, "epoch": 3.9559706470980656, "step": 11860 }, { "distill_loss": 0.1536346822977066, "epoch": 3.9559706470980656, "step": 11860 }, { "epoch": 3.9559706470980656, "ref_ce_loss": 0.07783767580986023, "step": 11860 }, { "epoch": 3.9559706470980656, "loss": 0.4678839147090912, "step": 11860 }, { "ce_loss": 0.09099794179201126, "epoch": 3.9559706470980656, "step": 11860 }, { "distill_loss": 0.22181285917758942, "epoch": 3.9559706470980656, "step": 11860 }, { "epoch": 3.9559706470980656, "ref_ce_loss": 0.11604619771242142, "step": 11860 }, { "epoch": 3.9559706470980656, "loss": 0.461051881313324, "step": 11860 }, { "ce_loss": 0.09864752739667892, "epoch": 3.9559706470980656, "step": 11860 }, { "distill_loss": 0.09917500615119934, "epoch": 3.9559706470980656, "step": 11860 }, { "epoch": 3.9559706470980656, "ref_ce_loss": 0.09517745673656464, "step": 11860 }, { "epoch": 3.9559706470980656, "loss": 0.5379061102867126, "step": 11860 }, { "ce_loss": 0.13948775827884674, "epoch": 3.9559706470980656, "step": 11860 }, { "distill_loss": 0.2097395360469818, "epoch": 3.9559706470980656, "step": 11860 }, { "epoch": 3.9559706470980656, "ref_ce_loss": 0.13162483274936676, "step": 11860 }, { "epoch": 3.959306204136091, "loss": 0.5352, "step": 11870 }, { "epoch": 3.959306204136091, "grad_norm": 3.3357937335968018, "step": 11870 }, { "epoch": 3.959306204136091, "learning_rate": 0.00015975656802292196, "step": 11870 }, { "epoch": 3.959306204136091, "loss": 0.5465363264083862, "step": 11870 }, { "ce_loss": 0.10335254669189453, "epoch": 3.959306204136091, "step": 11870 }, { "distill_loss": 0.22682029008865356, "epoch": 3.959306204136091, "step": 11870 }, { "epoch": 3.959306204136091, "ref_ce_loss": 0.119485042989254, "step": 11870 }, { "epoch": 3.959306204136091, "loss": 0.3730747401714325, "step": 11870 }, { "ce_loss": 0.09395212680101395, "epoch": 3.959306204136091, "step": 11870 }, { "distill_loss": 0.19650858640670776, "epoch": 3.959306204136091, "step": 11870 }, { "epoch": 3.959306204136091, "ref_ce_loss": 0.08227689564228058, "step": 11870 }, { "epoch": 3.959306204136091, "loss": 0.5724227428436279, "step": 11870 }, { "ce_loss": 0.07213887572288513, "epoch": 3.959306204136091, "step": 11870 }, { "distill_loss": 0.165091872215271, "epoch": 3.959306204136091, "step": 11870 }, { "epoch": 3.959306204136091, "ref_ce_loss": 0.07907076925039291, "step": 11870 }, { "epoch": 3.959306204136091, "loss": 0.9376710653305054, "step": 11870 }, { "ce_loss": 0.08496998250484467, "epoch": 3.959306204136091, "step": 11870 }, { "distill_loss": 0.18652302026748657, "epoch": 3.959306204136091, "step": 11870 }, { "epoch": 3.959306204136091, "ref_ce_loss": 0.10293760895729065, "step": 11870 }, { "epoch": 3.9626417611741163, "loss": 0.5433, "step": 11880 }, { "epoch": 3.9626417611741163, "grad_norm": 3.060880661010742, "step": 11880 }, { "epoch": 3.9626417611741163, "learning_rate": 0.00015955442670563983, "step": 11880 }, { "epoch": 3.9626417611741163, "loss": 0.38805532455444336, "step": 11880 }, { "ce_loss": 0.0926336944103241, "epoch": 3.9626417611741163, "step": 11880 }, { "distill_loss": 0.1631380021572113, "epoch": 3.9626417611741163, "step": 11880 }, { "epoch": 3.9626417611741163, "ref_ce_loss": 0.10547516494989395, "step": 11880 }, { "epoch": 3.9626417611741163, "loss": 0.4460315704345703, "step": 11880 }, { "ce_loss": 0.13091424107551575, "epoch": 3.9626417611741163, "step": 11880 }, { "distill_loss": 0.19473236799240112, "epoch": 3.9626417611741163, "step": 11880 }, { "epoch": 3.9626417611741163, "ref_ce_loss": 0.0909007340669632, "step": 11880 }, { "epoch": 3.9626417611741163, "loss": 0.6728657484054565, "step": 11880 }, { "ce_loss": 0.10651903599500656, "epoch": 3.9626417611741163, "step": 11880 }, { "distill_loss": 0.186860591173172, "epoch": 3.9626417611741163, "step": 11880 }, { "epoch": 3.9626417611741163, "ref_ce_loss": 0.0980074554681778, "step": 11880 }, { "epoch": 3.9626417611741163, "loss": 0.4039444327354431, "step": 11880 }, { "ce_loss": 0.07333799451589584, "epoch": 3.9626417611741163, "step": 11880 }, { "distill_loss": 0.21126312017440796, "epoch": 3.9626417611741163, "step": 11880 }, { "epoch": 3.9626417611741163, "ref_ce_loss": 0.08966390788555145, "step": 11880 }, { "epoch": 3.9659773182121416, "loss": 0.4965, "step": 11890 }, { "epoch": 3.9659773182121416, "grad_norm": 3.569099187850952, "step": 11890 }, { "epoch": 3.9659773182121416, "learning_rate": 0.00015935226796485227, "step": 11890 }, { "epoch": 3.9659773182121416, "loss": 0.5312309265136719, "step": 11890 }, { "ce_loss": 0.11842554807662964, "epoch": 3.9659773182121416, "step": 11890 }, { "distill_loss": 0.20231810212135315, "epoch": 3.9659773182121416, "step": 11890 }, { "epoch": 3.9659773182121416, "ref_ce_loss": 0.09051153063774109, "step": 11890 }, { "epoch": 3.9659773182121416, "loss": 0.3848481774330139, "step": 11890 }, { "ce_loss": 0.12863628566265106, "epoch": 3.9659773182121416, "step": 11890 }, { "distill_loss": 0.1335049271583557, "epoch": 3.9659773182121416, "step": 11890 }, { "epoch": 3.9659773182121416, "ref_ce_loss": 0.12259049713611603, "step": 11890 }, { "epoch": 3.9659773182121416, "loss": 0.3410494923591614, "step": 11890 }, { "ce_loss": 0.10201102495193481, "epoch": 3.9659773182121416, "step": 11890 }, { "distill_loss": 0.15431654453277588, "epoch": 3.9659773182121416, "step": 11890 }, { "epoch": 3.9659773182121416, "ref_ce_loss": 0.08442019671201706, "step": 11890 }, { "epoch": 3.9659773182121416, "loss": 0.4434935748577118, "step": 11890 }, { "ce_loss": 0.14288806915283203, "epoch": 3.9659773182121416, "step": 11890 }, { "distill_loss": 0.17208629846572876, "epoch": 3.9659773182121416, "step": 11890 }, { "epoch": 3.9659773182121416, "ref_ce_loss": 0.10217718780040741, "step": 11890 }, { "epoch": 3.969312875250167, "loss": 0.5087, "step": 11900 }, { "epoch": 3.969312875250167, "grad_norm": 2.463893413543701, "step": 11900 }, { "epoch": 3.969312875250167, "learning_rate": 0.00015915009216921716, "step": 11900 }, { "epoch": 3.969312875250167, "loss": 0.9179453253746033, "step": 11900 }, { "ce_loss": 0.14963048696517944, "epoch": 3.969312875250167, "step": 11900 }, { "distill_loss": 0.4440215528011322, "epoch": 3.969312875250167, "step": 11900 }, { "epoch": 3.969312875250167, "ref_ce_loss": 0.171931654214859, "step": 11900 }, { "epoch": 3.969312875250167, "loss": 0.636218786239624, "step": 11900 }, { "ce_loss": 0.16180500388145447, "epoch": 3.969312875250167, "step": 11900 }, { "distill_loss": 0.17104534804821014, "epoch": 3.969312875250167, "step": 11900 }, { "epoch": 3.969312875250167, "ref_ce_loss": 0.12259872257709503, "step": 11900 }, { "epoch": 3.969312875250167, "loss": 0.8462910652160645, "step": 11900 }, { "ce_loss": 0.14126595854759216, "epoch": 3.969312875250167, "step": 11900 }, { "distill_loss": 0.21537429094314575, "epoch": 3.969312875250167, "step": 11900 }, { "epoch": 3.969312875250167, "ref_ce_loss": 0.15868441760540009, "step": 11900 }, { "epoch": 3.969312875250167, "loss": 0.6787412166595459, "step": 11900 }, { "ce_loss": 0.2031691074371338, "epoch": 3.969312875250167, "step": 11900 }, { "distill_loss": 0.15440312027931213, "epoch": 3.969312875250167, "step": 11900 }, { "epoch": 3.969312875250167, "ref_ce_loss": 0.12730467319488525, "step": 11900 }, { "epoch": 3.9726484322881923, "loss": 0.5596, "step": 11910 }, { "epoch": 3.9726484322881923, "grad_norm": 4.980892658233643, "step": 11910 }, { "epoch": 3.9726484322881923, "learning_rate": 0.0001589478996874233, "step": 11910 }, { "epoch": 3.9726484322881923, "loss": 0.5536061525344849, "step": 11910 }, { "ce_loss": 0.15429961681365967, "epoch": 3.9726484322881923, "step": 11910 }, { "distill_loss": 0.2155410647392273, "epoch": 3.9726484322881923, "step": 11910 }, { "epoch": 3.9726484322881923, "ref_ce_loss": 0.12953603267669678, "step": 11910 }, { "epoch": 3.9726484322881923, "loss": 0.4384726881980896, "step": 11910 }, { "ce_loss": 0.10884366929531097, "epoch": 3.9726484322881923, "step": 11910 }, { "distill_loss": 0.17421254515647888, "epoch": 3.9726484322881923, "step": 11910 }, { "epoch": 3.9726484322881923, "ref_ce_loss": 0.10366030037403107, "step": 11910 }, { "epoch": 3.9726484322881923, "loss": 0.3859587013721466, "step": 11910 }, { "ce_loss": 0.08458442240953445, "epoch": 3.9726484322881923, "step": 11910 }, { "distill_loss": 0.12382687628269196, "epoch": 3.9726484322881923, "step": 11910 }, { "epoch": 3.9726484322881923, "ref_ce_loss": 0.09032014012336731, "step": 11910 }, { "epoch": 3.9726484322881923, "loss": 0.6750946044921875, "step": 11910 }, { "ce_loss": 0.2623468339443207, "epoch": 3.9726484322881923, "step": 11910 }, { "distill_loss": 0.180451437830925, "epoch": 3.9726484322881923, "step": 11910 }, { "epoch": 3.9726484322881923, "ref_ce_loss": 0.1338099092245102, "step": 11910 }, { "epoch": 3.9759839893262177, "loss": 0.5104, "step": 11920 }, { "epoch": 3.9759839893262177, "grad_norm": 3.3186306953430176, "step": 11920 }, { "epoch": 3.9759839893262177, "learning_rate": 0.00015874569088819015, "step": 11920 }, { "epoch": 3.9759839893262177, "loss": 0.4923209249973297, "step": 11920 }, { "ce_loss": 0.13076621294021606, "epoch": 3.9759839893262177, "step": 11920 }, { "distill_loss": 0.14761069416999817, "epoch": 3.9759839893262177, "step": 11920 }, { "epoch": 3.9759839893262177, "ref_ce_loss": 0.15832741558551788, "step": 11920 }, { "epoch": 3.9759839893262177, "loss": 0.2980753183364868, "step": 11920 }, { "ce_loss": 0.08580087870359421, "epoch": 3.9759839893262177, "step": 11920 }, { "distill_loss": 0.15114721655845642, "epoch": 3.9759839893262177, "step": 11920 }, { "epoch": 3.9759839893262177, "ref_ce_loss": 0.060876086354255676, "step": 11920 }, { "epoch": 3.9759839893262177, "loss": 0.5571591854095459, "step": 11920 }, { "ce_loss": 0.15017174184322357, "epoch": 3.9759839893262177, "step": 11920 }, { "distill_loss": 0.22002549469470978, "epoch": 3.9759839893262177, "step": 11920 }, { "epoch": 3.9759839893262177, "ref_ce_loss": 0.09577220678329468, "step": 11920 }, { "epoch": 3.9759839893262177, "loss": 0.5138970017433167, "step": 11920 }, { "ce_loss": 0.18627305328845978, "epoch": 3.9759839893262177, "step": 11920 }, { "distill_loss": 0.1946372538805008, "epoch": 3.9759839893262177, "step": 11920 }, { "epoch": 3.9759839893262177, "ref_ce_loss": 0.0819023847579956, "step": 11920 }, { "epoch": 3.979319546364243, "loss": 0.5138, "step": 11930 }, { "epoch": 3.979319546364243, "grad_norm": 2.3914072513580322, "step": 11930 }, { "epoch": 3.979319546364243, "learning_rate": 0.0001585434661402667, "step": 11930 }, { "epoch": 3.979319546364243, "loss": 0.3897310495376587, "step": 11930 }, { "ce_loss": 0.09720531105995178, "epoch": 3.979319546364243, "step": 11930 }, { "distill_loss": 0.16733823716640472, "epoch": 3.979319546364243, "step": 11930 }, { "epoch": 3.979319546364243, "ref_ce_loss": 0.1047656387090683, "step": 11930 }, { "epoch": 3.979319546364243, "loss": 0.5685856938362122, "step": 11930 }, { "ce_loss": 0.14480598270893097, "epoch": 3.979319546364243, "step": 11930 }, { "distill_loss": 0.1961873173713684, "epoch": 3.979319546364243, "step": 11930 }, { "epoch": 3.979319546364243, "ref_ce_loss": 0.11797832697629929, "step": 11930 }, { "epoch": 3.979319546364243, "loss": 0.5430389642715454, "step": 11930 }, { "ce_loss": 0.16561175882816315, "epoch": 3.979319546364243, "step": 11930 }, { "distill_loss": 0.19522586464881897, "epoch": 3.979319546364243, "step": 11930 }, { "epoch": 3.979319546364243, "ref_ce_loss": 0.1352234035730362, "step": 11930 }, { "epoch": 3.979319546364243, "loss": 0.5040985345840454, "step": 11930 }, { "ce_loss": 0.13989169895648956, "epoch": 3.979319546364243, "step": 11930 }, { "distill_loss": 0.13456882536411285, "epoch": 3.979319546364243, "step": 11930 }, { "epoch": 3.979319546364243, "ref_ce_loss": 0.10855761915445328, "step": 11930 }, { "epoch": 3.9826551034022684, "loss": 0.4939, "step": 11940 }, { "epoch": 3.9826551034022684, "grad_norm": 3.0207884311676025, "step": 11940 }, { "epoch": 3.9826551034022684, "learning_rate": 0.00015834122581243103, "step": 11940 }, { "epoch": 3.9826551034022684, "loss": 0.5423541069030762, "step": 11940 }, { "ce_loss": 0.11814697831869125, "epoch": 3.9826551034022684, "step": 11940 }, { "distill_loss": 0.15340958535671234, "epoch": 3.9826551034022684, "step": 11940 }, { "epoch": 3.9826551034022684, "ref_ce_loss": 0.10886798053979874, "step": 11940 }, { "epoch": 3.9826551034022684, "loss": 0.5537235140800476, "step": 11940 }, { "ce_loss": 0.2218320369720459, "epoch": 3.9826551034022684, "step": 11940 }, { "distill_loss": 0.19175180792808533, "epoch": 3.9826551034022684, "step": 11940 }, { "epoch": 3.9826551034022684, "ref_ce_loss": 0.10942274332046509, "step": 11940 }, { "epoch": 3.9826551034022684, "loss": 0.5389870405197144, "step": 11940 }, { "ce_loss": 0.11036817729473114, "epoch": 3.9826551034022684, "step": 11940 }, { "distill_loss": 0.18006466329097748, "epoch": 3.9826551034022684, "step": 11940 }, { "epoch": 3.9826551034022684, "ref_ce_loss": 0.12648382782936096, "step": 11940 }, { "epoch": 3.9826551034022684, "loss": 0.6045240163803101, "step": 11940 }, { "ce_loss": 0.21640561521053314, "epoch": 3.9826551034022684, "step": 11940 }, { "distill_loss": 0.208718404173851, "epoch": 3.9826551034022684, "step": 11940 }, { "epoch": 3.9826551034022684, "ref_ce_loss": 0.08881595730781555, "step": 11940 }, { "epoch": 3.9859906604402937, "loss": 0.5379, "step": 11950 }, { "epoch": 3.9859906604402937, "grad_norm": 3.2304489612579346, "step": 11950 }, { "epoch": 3.9859906604402937, "learning_rate": 0.00015813897027348989, "step": 11950 }, { "epoch": 3.9859906604402937, "loss": 0.2578718364238739, "step": 11950 }, { "ce_loss": 0.06574041396379471, "epoch": 3.9859906604402937, "step": 11950 }, { "distill_loss": 0.11338651180267334, "epoch": 3.9859906604402937, "step": 11950 }, { "epoch": 3.9859906604402937, "ref_ce_loss": 0.05241406708955765, "step": 11950 }, { "epoch": 3.9859906604402937, "loss": 0.2795087397098541, "step": 11950 }, { "ce_loss": 0.031032180413603783, "epoch": 3.9859906604402937, "step": 11950 }, { "distill_loss": 0.11463324725627899, "epoch": 3.9859906604402937, "step": 11950 }, { "epoch": 3.9859906604402937, "ref_ce_loss": 0.08366985619068146, "step": 11950 }, { "epoch": 3.9859906604402937, "loss": 0.6945253610610962, "step": 11950 }, { "ce_loss": 0.22229063510894775, "epoch": 3.9859906604402937, "step": 11950 }, { "distill_loss": 0.18065792322158813, "epoch": 3.9859906604402937, "step": 11950 }, { "epoch": 3.9859906604402937, "ref_ce_loss": 0.09593914449214935, "step": 11950 }, { "epoch": 3.9859906604402937, "loss": 0.4455210268497467, "step": 11950 }, { "ce_loss": 0.17479459941387177, "epoch": 3.9859906604402937, "step": 11950 }, { "distill_loss": 0.18478310108184814, "epoch": 3.9859906604402937, "step": 11950 }, { "epoch": 3.9859906604402937, "ref_ce_loss": 0.08535302430391312, "step": 11950 }, { "epoch": 3.989326217478319, "loss": 0.546, "step": 11960 }, { "epoch": 3.989326217478319, "grad_norm": 2.468493938446045, "step": 11960 }, { "epoch": 3.989326217478319, "learning_rate": 0.00015793669989227758, "step": 11960 }, { "epoch": 3.989326217478319, "loss": 0.5372409820556641, "step": 11960 }, { "ce_loss": 0.14682519435882568, "epoch": 3.989326217478319, "step": 11960 }, { "distill_loss": 0.23497946560382843, "epoch": 3.989326217478319, "step": 11960 }, { "epoch": 3.989326217478319, "ref_ce_loss": 0.09852960705757141, "step": 11960 }, { "epoch": 3.989326217478319, "loss": 0.3980017900466919, "step": 11960 }, { "ce_loss": 0.1523795872926712, "epoch": 3.989326217478319, "step": 11960 }, { "distill_loss": 0.1418992578983307, "epoch": 3.989326217478319, "step": 11960 }, { "epoch": 3.989326217478319, "ref_ce_loss": 0.1035848930478096, "step": 11960 }, { "epoch": 3.989326217478319, "loss": 0.45328015089035034, "step": 11960 }, { "ce_loss": 0.14866439998149872, "epoch": 3.989326217478319, "step": 11960 }, { "distill_loss": 0.24166658520698547, "epoch": 3.989326217478319, "step": 11960 }, { "epoch": 3.989326217478319, "ref_ce_loss": 0.06284800916910172, "step": 11960 }, { "epoch": 3.989326217478319, "loss": 1.29494309425354, "step": 11960 }, { "ce_loss": 0.22440198063850403, "epoch": 3.989326217478319, "step": 11960 }, { "distill_loss": 0.3025294244289398, "epoch": 3.989326217478319, "step": 11960 }, { "epoch": 3.989326217478319, "ref_ce_loss": 0.1537446528673172, "step": 11960 }, { "epoch": 3.9926617745163444, "loss": 0.5301, "step": 11970 }, { "epoch": 3.9926617745163444, "grad_norm": 3.436962604522705, "step": 11970 }, { "epoch": 3.9926617745163444, "learning_rate": 0.00015773441503765537, "step": 11970 }, { "epoch": 3.9926617745163444, "loss": 0.38053038716316223, "step": 11970 }, { "ce_loss": 0.12679526209831238, "epoch": 3.9926617745163444, "step": 11970 }, { "distill_loss": 0.18604570627212524, "epoch": 3.9926617745163444, "step": 11970 }, { "epoch": 3.9926617745163444, "ref_ce_loss": 0.06746406108140945, "step": 11970 }, { "epoch": 3.9926617745163444, "loss": 0.33811435103416443, "step": 11970 }, { "ce_loss": 0.06887925416231155, "epoch": 3.9926617745163444, "step": 11970 }, { "distill_loss": 0.14557978510856628, "epoch": 3.9926617745163444, "step": 11970 }, { "epoch": 3.9926617745163444, "ref_ce_loss": 0.06431765854358673, "step": 11970 }, { "epoch": 3.9926617745163444, "loss": 0.43382972478866577, "step": 11970 }, { "ce_loss": 0.14090314507484436, "epoch": 3.9926617745163444, "step": 11970 }, { "distill_loss": 0.20701713860034943, "epoch": 3.9926617745163444, "step": 11970 }, { "epoch": 3.9926617745163444, "ref_ce_loss": 0.05977332219481468, "step": 11970 }, { "epoch": 3.9926617745163444, "loss": 0.5020667314529419, "step": 11970 }, { "ce_loss": 0.11767388880252838, "epoch": 3.9926617745163444, "step": 11970 }, { "distill_loss": 0.2077358216047287, "epoch": 3.9926617745163444, "step": 11970 }, { "epoch": 3.9926617745163444, "ref_ce_loss": 0.11229929327964783, "step": 11970 }, { "epoch": 3.9959973315543698, "loss": 0.5052, "step": 11980 }, { "epoch": 3.9959973315543698, "grad_norm": 3.4483630657196045, "step": 11980 }, { "epoch": 3.9959973315543698, "learning_rate": 0.00015753211607851114, "step": 11980 }, { "epoch": 3.9959973315543698, "loss": 0.8230859637260437, "step": 11980 }, { "ce_loss": 0.1882457435131073, "epoch": 3.9959973315543698, "step": 11980 }, { "distill_loss": 0.17528904974460602, "epoch": 3.9959973315543698, "step": 11980 }, { "epoch": 3.9959973315543698, "ref_ce_loss": 0.09798739105463028, "step": 11980 }, { "epoch": 3.9959973315543698, "loss": 0.48872292041778564, "step": 11980 }, { "ce_loss": 0.12391944974660873, "epoch": 3.9959973315543698, "step": 11980 }, { "distill_loss": 0.19896352291107178, "epoch": 3.9959973315543698, "step": 11980 }, { "epoch": 3.9959973315543698, "ref_ce_loss": 0.11606057733297348, "step": 11980 }, { "epoch": 3.9959973315543698, "loss": 0.42669156193733215, "step": 11980 }, { "ce_loss": 0.07362711429595947, "epoch": 3.9959973315543698, "step": 11980 }, { "distill_loss": 0.1800188571214676, "epoch": 3.9959973315543698, "step": 11980 }, { "epoch": 3.9959973315543698, "ref_ce_loss": 0.11211638152599335, "step": 11980 }, { "epoch": 3.9959973315543698, "loss": 0.9327055215835571, "step": 11980 }, { "ce_loss": 0.15583303570747375, "epoch": 3.9959973315543698, "step": 11980 }, { "distill_loss": 0.20757633447647095, "epoch": 3.9959973315543698, "step": 11980 }, { "epoch": 3.9959973315543698, "ref_ce_loss": 0.12007487565279007, "step": 11980 }, { "epoch": 3.999332888592395, "loss": 0.494, "step": 11990 }, { "epoch": 3.999332888592395, "grad_norm": 1.737951397895813, "step": 11990 }, { "epoch": 3.999332888592395, "learning_rate": 0.00015732980338375836, "step": 11990 }, { "epoch": 3.999332888592395, "loss": 0.37717702984809875, "step": 11990 }, { "ce_loss": 0.12793800234794617, "epoch": 3.999332888592395, "step": 11990 }, { "distill_loss": 0.17517952620983124, "epoch": 3.999332888592395, "step": 11990 }, { "epoch": 3.999332888592395, "ref_ce_loss": 0.07389137148857117, "step": 11990 }, { "epoch": 3.999332888592395, "loss": 0.23578205704689026, "step": 11990 }, { "ce_loss": 0.04605966433882713, "epoch": 3.999332888592395, "step": 11990 }, { "distill_loss": 0.12195150554180145, "epoch": 3.999332888592395, "step": 11990 }, { "epoch": 3.999332888592395, "ref_ce_loss": 0.06749168783426285, "step": 11990 }, { "epoch": 3.999332888592395, "loss": 0.45324552059173584, "step": 11990 }, { "ce_loss": 0.16480252146720886, "epoch": 3.999332888592395, "step": 11990 }, { "distill_loss": 0.16457819938659668, "epoch": 3.999332888592395, "step": 11990 }, { "epoch": 3.999332888592395, "ref_ce_loss": 0.12353596836328506, "step": 11990 }, { "epoch": 3.999332888592395, "loss": 0.3933228552341461, "step": 11990 }, { "ce_loss": 0.1270236372947693, "epoch": 3.999332888592395, "step": 11990 }, { "distill_loss": 0.17129284143447876, "epoch": 3.999332888592395, "step": 11990 }, { "epoch": 3.999332888592395, "ref_ce_loss": 0.09473787248134613, "step": 11990 }, { "epoch": 4.0026684456304205, "loss": 0.5063, "step": 12000 }, { "epoch": 4.0026684456304205, "grad_norm": 3.112962245941162, "step": 12000 }, { "epoch": 4.0026684456304205, "learning_rate": 0.00015712747732233556, "step": 12000 }, { "epoch": 4.0026684456304205, "loss": 0.37287402153015137, "step": 12000 }, { "ce_loss": 0.06412719190120697, "epoch": 4.0026684456304205, "step": 12000 }, { "distill_loss": 0.12839214503765106, "epoch": 4.0026684456304205, "step": 12000 }, { "epoch": 4.0026684456304205, "ref_ce_loss": 0.09659893810749054, "step": 12000 }, { "epoch": 4.0026684456304205, "loss": 0.5252148509025574, "step": 12000 }, { "ce_loss": 0.125128373503685, "epoch": 4.0026684456304205, "step": 12000 }, { "distill_loss": 0.15444694459438324, "epoch": 4.0026684456304205, "step": 12000 }, { "epoch": 4.0026684456304205, "ref_ce_loss": 0.09184499830007553, "step": 12000 }, { "epoch": 4.0026684456304205, "loss": 0.4332132637500763, "step": 12000 }, { "ce_loss": 0.10666043311357498, "epoch": 4.0026684456304205, "step": 12000 }, { "distill_loss": 0.13837860524654388, "epoch": 4.0026684456304205, "step": 12000 }, { "epoch": 4.0026684456304205, "ref_ce_loss": 0.12885423004627228, "step": 12000 }, { "epoch": 4.0026684456304205, "loss": 0.9290437698364258, "step": 12000 }, { "ce_loss": 0.15852229297161102, "epoch": 4.0026684456304205, "step": 12000 }, { "distill_loss": 0.17924055457115173, "epoch": 4.0026684456304205, "step": 12000 }, { "epoch": 4.0026684456304205, "ref_ce_loss": 0.1165415495634079, "step": 12000 }, { "epoch": 4.006004002668446, "loss": 0.4461, "step": 12010 }, { "epoch": 4.006004002668446, "grad_norm": 2.8395586013793945, "step": 12010 }, { "epoch": 4.006004002668446, "learning_rate": 0.00015692513826320571, "step": 12010 }, { "epoch": 4.006004002668446, "loss": 0.3618427813053131, "step": 12010 }, { "ce_loss": 0.07987383753061295, "epoch": 4.006004002668446, "step": 12010 }, { "distill_loss": 0.14220160245895386, "epoch": 4.006004002668446, "step": 12010 }, { "epoch": 4.006004002668446, "ref_ce_loss": 0.0801934227347374, "step": 12010 }, { "epoch": 4.006004002668446, "loss": 0.5198785066604614, "step": 12010 }, { "ce_loss": 0.1803334504365921, "epoch": 4.006004002668446, "step": 12010 }, { "distill_loss": 0.198713481426239, "epoch": 4.006004002668446, "step": 12010 }, { "epoch": 4.006004002668446, "ref_ce_loss": 0.10547371953725815, "step": 12010 }, { "epoch": 4.006004002668446, "loss": 0.2746797502040863, "step": 12010 }, { "ce_loss": 0.0662359818816185, "epoch": 4.006004002668446, "step": 12010 }, { "distill_loss": 0.09645555913448334, "epoch": 4.006004002668446, "step": 12010 }, { "epoch": 4.006004002668446, "ref_ce_loss": 0.11164869368076324, "step": 12010 }, { "epoch": 4.006004002668446, "loss": 0.5860291719436646, "step": 12010 }, { "ce_loss": 0.09020759165287018, "epoch": 4.006004002668446, "step": 12010 }, { "distill_loss": 0.15296566486358643, "epoch": 4.006004002668446, "step": 12010 }, { "epoch": 4.006004002668446, "ref_ce_loss": 0.08592826873064041, "step": 12010 }, { "epoch": 4.009339559706471, "loss": 0.4357, "step": 12020 }, { "epoch": 4.009339559706471, "grad_norm": 2.628180503845215, "step": 12020 }, { "epoch": 4.009339559706471, "learning_rate": 0.00015672278657535537, "step": 12020 }, { "epoch": 4.009339559706471, "loss": 0.36614707112312317, "step": 12020 }, { "ce_loss": 0.07927390933036804, "epoch": 4.009339559706471, "step": 12020 }, { "distill_loss": 0.15328463912010193, "epoch": 4.009339559706471, "step": 12020 }, { "epoch": 4.009339559706471, "ref_ce_loss": 0.0527513325214386, "step": 12020 }, { "epoch": 4.009339559706471, "loss": 0.34300854802131653, "step": 12020 }, { "ce_loss": 0.09013242274522781, "epoch": 4.009339559706471, "step": 12020 }, { "distill_loss": 0.10978487133979797, "epoch": 4.009339559706471, "step": 12020 }, { "epoch": 4.009339559706471, "ref_ce_loss": 0.12032110244035721, "step": 12020 }, { "epoch": 4.009339559706471, "loss": 0.6139493584632874, "step": 12020 }, { "ce_loss": 0.09104227274656296, "epoch": 4.009339559706471, "step": 12020 }, { "distill_loss": 0.24132594466209412, "epoch": 4.009339559706471, "step": 12020 }, { "epoch": 4.009339559706471, "ref_ce_loss": 0.12532581388950348, "step": 12020 }, { "epoch": 4.009339559706471, "loss": 0.33067432045936584, "step": 12020 }, { "ce_loss": 0.10223788768053055, "epoch": 4.009339559706471, "step": 12020 }, { "distill_loss": 0.15111784636974335, "epoch": 4.009339559706471, "step": 12020 }, { "epoch": 4.009339559706471, "ref_ce_loss": 0.06201582029461861, "step": 12020 }, { "epoch": 4.0126751167444965, "loss": 0.4694, "step": 12030 }, { "epoch": 4.0126751167444965, "grad_norm": 4.58229923248291, "step": 12030 }, { "epoch": 4.0126751167444965, "learning_rate": 0.00015652042262779425, "step": 12030 }, { "epoch": 4.0126751167444965, "loss": 0.34944185614585876, "step": 12030 }, { "ce_loss": 0.07081367820501328, "epoch": 4.0126751167444965, "step": 12030 }, { "distill_loss": 0.11567500233650208, "epoch": 4.0126751167444965, "step": 12030 }, { "epoch": 4.0126751167444965, "ref_ce_loss": 0.11731644719839096, "step": 12030 }, { "epoch": 4.0126751167444965, "loss": 0.25561827421188354, "step": 12030 }, { "ce_loss": 0.037178147584199905, "epoch": 4.0126751167444965, "step": 12030 }, { "distill_loss": 0.09498228132724762, "epoch": 4.0126751167444965, "step": 12030 }, { "epoch": 4.0126751167444965, "ref_ce_loss": 0.07046579569578171, "step": 12030 }, { "epoch": 4.0126751167444965, "loss": 0.33924317359924316, "step": 12030 }, { "ce_loss": 0.048774536699056625, "epoch": 4.0126751167444965, "step": 12030 }, { "distill_loss": 0.13083665072917938, "epoch": 4.0126751167444965, "step": 12030 }, { "epoch": 4.0126751167444965, "ref_ce_loss": 0.06417500972747803, "step": 12030 }, { "epoch": 4.0126751167444965, "loss": 0.21740387380123138, "step": 12030 }, { "ce_loss": 0.024787340313196182, "epoch": 4.0126751167444965, "step": 12030 }, { "distill_loss": 0.1325160712003708, "epoch": 4.0126751167444965, "step": 12030 }, { "epoch": 4.0126751167444965, "ref_ce_loss": 0.059933774173259735, "step": 12030 }, { "epoch": 4.016010673782522, "loss": 0.4299, "step": 12040 }, { "epoch": 4.016010673782522, "grad_norm": 2.332430362701416, "step": 12040 }, { "epoch": 4.016010673782522, "learning_rate": 0.0001563180467895544, "step": 12040 }, { "epoch": 4.016010673782522, "loss": 0.31575217843055725, "step": 12040 }, { "ce_loss": 0.05037858337163925, "epoch": 4.016010673782522, "step": 12040 }, { "distill_loss": 0.13738572597503662, "epoch": 4.016010673782522, "step": 12040 }, { "epoch": 4.016010673782522, "ref_ce_loss": 0.05846783518791199, "step": 12040 }, { "epoch": 4.016010673782522, "loss": 0.3863769769668579, "step": 12040 }, { "ce_loss": 0.12451977282762527, "epoch": 4.016010673782522, "step": 12040 }, { "distill_loss": 0.08762554079294205, "epoch": 4.016010673782522, "step": 12040 }, { "epoch": 4.016010673782522, "ref_ce_loss": 0.13553641736507416, "step": 12040 }, { "epoch": 4.016010673782522, "loss": 0.38889557123184204, "step": 12040 }, { "ce_loss": 0.07112105190753937, "epoch": 4.016010673782522, "step": 12040 }, { "distill_loss": 0.15990790724754333, "epoch": 4.016010673782522, "step": 12040 }, { "epoch": 4.016010673782522, "ref_ce_loss": 0.09904682636260986, "step": 12040 }, { "epoch": 4.016010673782522, "loss": 0.458415687084198, "step": 12040 }, { "ce_loss": 0.04768303409218788, "epoch": 4.016010673782522, "step": 12040 }, { "distill_loss": 0.12121204286813736, "epoch": 4.016010673782522, "step": 12040 }, { "epoch": 4.016010673782522, "ref_ce_loss": 0.0890619307756424, "step": 12040 }, { "epoch": 4.019346230820547, "loss": 0.4102, "step": 12050 }, { "epoch": 4.019346230820547, "grad_norm": 2.3165054321289062, "step": 12050 }, { "epoch": 4.019346230820547, "learning_rate": 0.00015611565942968942, "step": 12050 }, { "epoch": 4.019346230820547, "loss": 0.2301270216703415, "step": 12050 }, { "ce_loss": 0.07038333266973495, "epoch": 4.019346230820547, "step": 12050 }, { "distill_loss": 0.09666424989700317, "epoch": 4.019346230820547, "step": 12050 }, { "epoch": 4.019346230820547, "ref_ce_loss": 0.06273900717496872, "step": 12050 }, { "epoch": 4.019346230820547, "loss": 0.3673703670501709, "step": 12050 }, { "ce_loss": 0.10099012404680252, "epoch": 4.019346230820547, "step": 12050 }, { "distill_loss": 0.1266748458147049, "epoch": 4.019346230820547, "step": 12050 }, { "epoch": 4.019346230820547, "ref_ce_loss": 0.038833919912576675, "step": 12050 }, { "epoch": 4.019346230820547, "loss": 0.47880423069000244, "step": 12050 }, { "ce_loss": 0.1467541754245758, "epoch": 4.019346230820547, "step": 12050 }, { "distill_loss": 0.1850563883781433, "epoch": 4.019346230820547, "step": 12050 }, { "epoch": 4.019346230820547, "ref_ce_loss": 0.10418304055929184, "step": 12050 }, { "epoch": 4.019346230820547, "loss": 0.37483644485473633, "step": 12050 }, { "ce_loss": 0.07514076679944992, "epoch": 4.019346230820547, "step": 12050 }, { "distill_loss": 0.12129350006580353, "epoch": 4.019346230820547, "step": 12050 }, { "epoch": 4.019346230820547, "ref_ce_loss": 0.10785885155200958, "step": 12050 }, { "epoch": 4.0226817878585726, "loss": 0.4115, "step": 12060 }, { "epoch": 4.0226817878585726, "grad_norm": 2.2294394969940186, "step": 12060 }, { "epoch": 4.0226817878585726, "learning_rate": 0.00015591326091727415, "step": 12060 }, { "epoch": 4.0226817878585726, "loss": 0.4823136627674103, "step": 12060 }, { "ce_loss": 0.1290162205696106, "epoch": 4.0226817878585726, "step": 12060 }, { "distill_loss": 0.20154951512813568, "epoch": 4.0226817878585726, "step": 12060 }, { "epoch": 4.0226817878585726, "ref_ce_loss": 0.06897612661123276, "step": 12060 }, { "epoch": 4.0226817878585726, "loss": 0.35938891768455505, "step": 12060 }, { "ce_loss": 0.05964622274041176, "epoch": 4.0226817878585726, "step": 12060 }, { "distill_loss": 0.15778091549873352, "epoch": 4.0226817878585726, "step": 12060 }, { "epoch": 4.0226817878585726, "ref_ce_loss": 0.10114988684654236, "step": 12060 }, { "epoch": 4.0226817878585726, "loss": 0.5172111392021179, "step": 12060 }, { "ce_loss": 0.1536588817834854, "epoch": 4.0226817878585726, "step": 12060 }, { "distill_loss": 0.16821041703224182, "epoch": 4.0226817878585726, "step": 12060 }, { "epoch": 4.0226817878585726, "ref_ce_loss": 0.09951279312372208, "step": 12060 }, { "epoch": 4.0226817878585726, "loss": 0.38765984773635864, "step": 12060 }, { "ce_loss": 0.04113280028104782, "epoch": 4.0226817878585726, "step": 12060 }, { "distill_loss": 0.10768014937639236, "epoch": 4.0226817878585726, "step": 12060 }, { "epoch": 4.0226817878585726, "ref_ce_loss": 0.0842534527182579, "step": 12060 }, { "epoch": 4.026017344896598, "loss": 0.4356, "step": 12070 }, { "epoch": 4.026017344896598, "grad_norm": 5.903532981872559, "step": 12070 }, { "epoch": 4.026017344896598, "learning_rate": 0.00015571085162140348, "step": 12070 }, { "epoch": 4.026017344896598, "loss": 0.37335243821144104, "step": 12070 }, { "ce_loss": 0.09252926707267761, "epoch": 4.026017344896598, "step": 12070 }, { "distill_loss": 0.14429230988025665, "epoch": 4.026017344896598, "step": 12070 }, { "epoch": 4.026017344896598, "ref_ce_loss": 0.06657299399375916, "step": 12070 }, { "epoch": 4.026017344896598, "loss": 0.37593209743499756, "step": 12070 }, { "ce_loss": 0.10314659774303436, "epoch": 4.026017344896598, "step": 12070 }, { "distill_loss": 0.14421296119689941, "epoch": 4.026017344896598, "step": 12070 }, { "epoch": 4.026017344896598, "ref_ce_loss": 0.0858980119228363, "step": 12070 }, { "epoch": 4.026017344896598, "loss": 0.30769476294517517, "step": 12070 }, { "ce_loss": 0.07112128287553787, "epoch": 4.026017344896598, "step": 12070 }, { "distill_loss": 0.1462993174791336, "epoch": 4.026017344896598, "step": 12070 }, { "epoch": 4.026017344896598, "ref_ce_loss": 0.0900954082608223, "step": 12070 }, { "epoch": 4.026017344896598, "loss": 0.5134388208389282, "step": 12070 }, { "ce_loss": 0.0642455592751503, "epoch": 4.026017344896598, "step": 12070 }, { "distill_loss": 0.13371606171131134, "epoch": 4.026017344896598, "step": 12070 }, { "epoch": 4.026017344896598, "ref_ce_loss": 0.08747991174459457, "step": 12070 }, { "epoch": 4.029352901934623, "loss": 0.4413, "step": 12080 }, { "epoch": 4.029352901934623, "grad_norm": 2.3316900730133057, "step": 12080 }, { "epoch": 4.029352901934623, "learning_rate": 0.0001555084319111922, "step": 12080 }, { "epoch": 4.029352901934623, "loss": 0.7505779266357422, "step": 12080 }, { "ce_loss": 0.14793424308300018, "epoch": 4.029352901934623, "step": 12080 }, { "distill_loss": 0.22277532517910004, "epoch": 4.029352901934623, "step": 12080 }, { "epoch": 4.029352901934623, "ref_ce_loss": 0.12870121002197266, "step": 12080 }, { "epoch": 4.029352901934623, "loss": 0.8267667889595032, "step": 12080 }, { "ce_loss": 0.10344298183917999, "epoch": 4.029352901934623, "step": 12080 }, { "distill_loss": 0.3623758554458618, "epoch": 4.029352901934623, "step": 12080 }, { "epoch": 4.029352901934623, "ref_ce_loss": 0.11685804277658463, "step": 12080 }, { "epoch": 4.029352901934623, "loss": 0.9109047651290894, "step": 12080 }, { "ce_loss": 0.10552089661359787, "epoch": 4.029352901934623, "step": 12080 }, { "distill_loss": 0.19695395231246948, "epoch": 4.029352901934623, "step": 12080 }, { "epoch": 4.029352901934623, "ref_ce_loss": 0.09001109004020691, "step": 12080 }, { "epoch": 4.029352901934623, "loss": 0.38273149728775024, "step": 12080 }, { "ce_loss": 0.04735790193080902, "epoch": 4.029352901934623, "step": 12080 }, { "distill_loss": 0.11591293662786484, "epoch": 4.029352901934623, "step": 12080 }, { "epoch": 4.029352901934623, "ref_ce_loss": 0.054501067847013474, "step": 12080 }, { "epoch": 4.032688458972649, "loss": 0.4852, "step": 12090 }, { "epoch": 4.032688458972649, "grad_norm": 3.8199822902679443, "step": 12090 }, { "epoch": 4.032688458972649, "learning_rate": 0.00015530600215577406, "step": 12090 }, { "epoch": 4.032688458972649, "loss": 0.45621997117996216, "step": 12090 }, { "ce_loss": 0.11262834072113037, "epoch": 4.032688458972649, "step": 12090 }, { "distill_loss": 0.15463413298130035, "epoch": 4.032688458972649, "step": 12090 }, { "epoch": 4.032688458972649, "ref_ce_loss": 0.07563820481300354, "step": 12090 }, { "epoch": 4.032688458972649, "loss": 0.3850533962249756, "step": 12090 }, { "ce_loss": 0.08699747920036316, "epoch": 4.032688458972649, "step": 12090 }, { "distill_loss": 0.14912444353103638, "epoch": 4.032688458972649, "step": 12090 }, { "epoch": 4.032688458972649, "ref_ce_loss": 0.08690159022808075, "step": 12090 }, { "epoch": 4.032688458972649, "loss": 0.39265933632850647, "step": 12090 }, { "ce_loss": 0.10053718090057373, "epoch": 4.032688458972649, "step": 12090 }, { "distill_loss": 0.20805439352989197, "epoch": 4.032688458972649, "step": 12090 }, { "epoch": 4.032688458972649, "ref_ce_loss": 0.08398868143558502, "step": 12090 }, { "epoch": 4.032688458972649, "loss": 0.8067128658294678, "step": 12090 }, { "ce_loss": 0.11772124469280243, "epoch": 4.032688458972649, "step": 12090 }, { "distill_loss": 0.21782295405864716, "epoch": 4.032688458972649, "step": 12090 }, { "epoch": 4.032688458972649, "ref_ce_loss": 0.12272774428129196, "step": 12090 }, { "epoch": 4.036024016010674, "loss": 0.4555, "step": 12100 }, { "epoch": 4.036024016010674, "grad_norm": 4.044421195983887, "step": 12100 }, { "epoch": 4.036024016010674, "learning_rate": 0.00015510356272430104, "step": 12100 }, { "epoch": 4.036024016010674, "loss": 0.4507158100605011, "step": 12100 }, { "ce_loss": 0.15286117792129517, "epoch": 4.036024016010674, "step": 12100 }, { "distill_loss": 0.16192707419395447, "epoch": 4.036024016010674, "step": 12100 }, { "epoch": 4.036024016010674, "ref_ce_loss": 0.135764941573143, "step": 12100 }, { "epoch": 4.036024016010674, "loss": 0.46531906723976135, "step": 12100 }, { "ce_loss": 0.09542107582092285, "epoch": 4.036024016010674, "step": 12100 }, { "distill_loss": 0.1124882772564888, "epoch": 4.036024016010674, "step": 12100 }, { "epoch": 4.036024016010674, "ref_ce_loss": 0.10861322283744812, "step": 12100 }, { "epoch": 4.036024016010674, "loss": 0.253815233707428, "step": 12100 }, { "ce_loss": 0.06234263256192207, "epoch": 4.036024016010674, "step": 12100 }, { "distill_loss": 0.10280825197696686, "epoch": 4.036024016010674, "step": 12100 }, { "epoch": 4.036024016010674, "ref_ce_loss": 0.06219073012471199, "step": 12100 }, { "epoch": 4.036024016010674, "loss": 0.3336341977119446, "step": 12100 }, { "ce_loss": 0.06819576770067215, "epoch": 4.036024016010674, "step": 12100 }, { "distill_loss": 0.11720435321331024, "epoch": 4.036024016010674, "step": 12100 }, { "epoch": 4.036024016010674, "ref_ce_loss": 0.11607708036899567, "step": 12100 }, { "epoch": 4.039359573048699, "loss": 0.4412, "step": 12110 }, { "epoch": 4.039359573048699, "grad_norm": 1.9481059312820435, "step": 12110 }, { "epoch": 4.039359573048699, "learning_rate": 0.00015490111398594274, "step": 12110 }, { "epoch": 4.039359573048699, "loss": 0.29373347759246826, "step": 12110 }, { "ce_loss": 0.04972933977842331, "epoch": 4.039359573048699, "step": 12110 }, { "distill_loss": 0.13565079867839813, "epoch": 4.039359573048699, "step": 12110 }, { "epoch": 4.039359573048699, "ref_ce_loss": 0.06544889509677887, "step": 12110 }, { "epoch": 4.039359573048699, "loss": 0.3456859588623047, "step": 12110 }, { "ce_loss": 0.08374223858118057, "epoch": 4.039359573048699, "step": 12110 }, { "distill_loss": 0.12544633448123932, "epoch": 4.039359573048699, "step": 12110 }, { "epoch": 4.039359573048699, "ref_ce_loss": 0.05935639142990112, "step": 12110 }, { "epoch": 4.039359573048699, "loss": 0.3637619614601135, "step": 12110 }, { "ce_loss": 0.113809734582901, "epoch": 4.039359573048699, "step": 12110 }, { "distill_loss": 0.14481617510318756, "epoch": 4.039359573048699, "step": 12110 }, { "epoch": 4.039359573048699, "ref_ce_loss": 0.08502238243818283, "step": 12110 }, { "epoch": 4.039359573048699, "loss": 0.4461738169193268, "step": 12110 }, { "ce_loss": 0.1227860376238823, "epoch": 4.039359573048699, "step": 12110 }, { "distill_loss": 0.21389171481132507, "epoch": 4.039359573048699, "step": 12110 }, { "epoch": 4.039359573048699, "ref_ce_loss": 0.10931068658828735, "step": 12110 }, { "epoch": 4.042695130086725, "loss": 0.4501, "step": 12120 }, { "epoch": 4.042695130086725, "grad_norm": 2.8527162075042725, "step": 12120 }, { "epoch": 4.042695130086725, "learning_rate": 0.0001546986563098859, "step": 12120 }, { "epoch": 4.042695130086725, "loss": 0.3520258069038391, "step": 12120 }, { "ce_loss": 0.0780138224363327, "epoch": 4.042695130086725, "step": 12120 }, { "distill_loss": 0.14417199790477753, "epoch": 4.042695130086725, "step": 12120 }, { "epoch": 4.042695130086725, "ref_ce_loss": 0.09585023671388626, "step": 12120 }, { "epoch": 4.042695130086725, "loss": 0.5828714370727539, "step": 12120 }, { "ce_loss": 0.09808569401502609, "epoch": 4.042695130086725, "step": 12120 }, { "distill_loss": 0.13045603036880493, "epoch": 4.042695130086725, "step": 12120 }, { "epoch": 4.042695130086725, "ref_ce_loss": 0.07338562607765198, "step": 12120 }, { "epoch": 4.042695130086725, "loss": 0.2788516581058502, "step": 12120 }, { "ce_loss": 0.06606944650411606, "epoch": 4.042695130086725, "step": 12120 }, { "distill_loss": 0.10744508355855942, "epoch": 4.042695130086725, "step": 12120 }, { "epoch": 4.042695130086725, "ref_ce_loss": 0.06514713913202286, "step": 12120 }, { "epoch": 4.042695130086725, "loss": 0.30643728375434875, "step": 12120 }, { "ce_loss": 0.08950202167034149, "epoch": 4.042695130086725, "step": 12120 }, { "distill_loss": 0.14668361842632294, "epoch": 4.042695130086725, "step": 12120 }, { "epoch": 4.042695130086725, "ref_ce_loss": 0.0699087455868721, "step": 12120 }, { "epoch": 4.04603068712475, "loss": 0.4806, "step": 12130 }, { "epoch": 4.04603068712475, "grad_norm": 2.6800968647003174, "step": 12130 }, { "epoch": 4.04603068712475, "learning_rate": 0.00015449619006533343, "step": 12130 }, { "epoch": 4.04603068712475, "loss": 0.49065351486206055, "step": 12130 }, { "ce_loss": 0.06231939420104027, "epoch": 4.04603068712475, "step": 12130 }, { "distill_loss": 0.22406406700611115, "epoch": 4.04603068712475, "step": 12130 }, { "epoch": 4.04603068712475, "ref_ce_loss": 0.07999047636985779, "step": 12130 }, { "epoch": 4.04603068712475, "loss": 0.45048731565475464, "step": 12130 }, { "ce_loss": 0.09844529628753662, "epoch": 4.04603068712475, "step": 12130 }, { "distill_loss": 0.14128665626049042, "epoch": 4.04603068712475, "step": 12130 }, { "epoch": 4.04603068712475, "ref_ce_loss": 0.12517352402210236, "step": 12130 }, { "epoch": 4.04603068712475, "loss": 0.4383133053779602, "step": 12130 }, { "ce_loss": 0.12598662078380585, "epoch": 4.04603068712475, "step": 12130 }, { "distill_loss": 0.1633603870868683, "epoch": 4.04603068712475, "step": 12130 }, { "epoch": 4.04603068712475, "ref_ce_loss": 0.09888887405395508, "step": 12130 }, { "epoch": 4.04603068712475, "loss": 0.24241933226585388, "step": 12130 }, { "ce_loss": 0.04206832870841026, "epoch": 4.04603068712475, "step": 12130 }, { "distill_loss": 0.14933034777641296, "epoch": 4.04603068712475, "step": 12130 }, { "epoch": 4.04603068712475, "ref_ce_loss": 0.050794150680303574, "step": 12130 }, { "epoch": 4.049366244162775, "loss": 0.4709, "step": 12140 }, { "epoch": 4.049366244162775, "grad_norm": 2.5225110054016113, "step": 12140 }, { "epoch": 4.049366244162775, "learning_rate": 0.00015429371562150385, "step": 12140 }, { "epoch": 4.049366244162775, "loss": 0.5925564765930176, "step": 12140 }, { "ce_loss": 0.05308568477630615, "epoch": 4.049366244162775, "step": 12140 }, { "distill_loss": 0.15124325454235077, "epoch": 4.049366244162775, "step": 12140 }, { "epoch": 4.049366244162775, "ref_ce_loss": 0.07154041528701782, "step": 12140 }, { "epoch": 4.049366244162775, "loss": 0.3111650347709656, "step": 12140 }, { "ce_loss": 0.07286947965621948, "epoch": 4.049366244162775, "step": 12140 }, { "distill_loss": 0.12985488772392273, "epoch": 4.049366244162775, "step": 12140 }, { "epoch": 4.049366244162775, "ref_ce_loss": 0.08120644092559814, "step": 12140 }, { "epoch": 4.049366244162775, "loss": 0.5160831212997437, "step": 12140 }, { "ce_loss": 0.08501293510198593, "epoch": 4.049366244162775, "step": 12140 }, { "distill_loss": 0.13488444685935974, "epoch": 4.049366244162775, "step": 12140 }, { "epoch": 4.049366244162775, "ref_ce_loss": 0.08812274038791656, "step": 12140 }, { "epoch": 4.049366244162775, "loss": 0.2512379288673401, "step": 12140 }, { "ce_loss": 0.07532287389039993, "epoch": 4.049366244162775, "step": 12140 }, { "distill_loss": 0.10057376325130463, "epoch": 4.049366244162775, "step": 12140 }, { "epoch": 4.049366244162775, "ref_ce_loss": 0.07492376863956451, "step": 12140 }, { "epoch": 4.052701801200801, "loss": 0.4284, "step": 12150 }, { "epoch": 4.052701801200801, "grad_norm": 4.018413066864014, "step": 12150 }, { "epoch": 4.052701801200801, "learning_rate": 0.00015409123334763077, "step": 12150 }, { "epoch": 4.052701801200801, "loss": 0.47143644094467163, "step": 12150 }, { "ce_loss": 0.11935675889253616, "epoch": 4.052701801200801, "step": 12150 }, { "distill_loss": 0.19259300827980042, "epoch": 4.052701801200801, "step": 12150 }, { "epoch": 4.052701801200801, "ref_ce_loss": 0.1302216798067093, "step": 12150 }, { "epoch": 4.052701801200801, "loss": 0.3792656660079956, "step": 12150 }, { "ce_loss": 0.11778055131435394, "epoch": 4.052701801200801, "step": 12150 }, { "distill_loss": 0.17504708468914032, "epoch": 4.052701801200801, "step": 12150 }, { "epoch": 4.052701801200801, "ref_ce_loss": 0.08613471686840057, "step": 12150 }, { "epoch": 4.052701801200801, "loss": 0.5767985582351685, "step": 12150 }, { "ce_loss": 0.16085843741893768, "epoch": 4.052701801200801, "step": 12150 }, { "distill_loss": 0.24712547659873962, "epoch": 4.052701801200801, "step": 12150 }, { "epoch": 4.052701801200801, "ref_ce_loss": 0.12454459071159363, "step": 12150 }, { "epoch": 4.052701801200801, "loss": 0.4131409823894501, "step": 12150 }, { "ce_loss": 0.053598470985889435, "epoch": 4.052701801200801, "step": 12150 }, { "distill_loss": 0.16270749270915985, "epoch": 4.052701801200801, "step": 12150 }, { "epoch": 4.052701801200801, "ref_ce_loss": 0.08859755843877792, "step": 12150 }, { "epoch": 4.056037358238826, "loss": 0.4947, "step": 12160 }, { "epoch": 4.056037358238826, "grad_norm": 3.498987913131714, "step": 12160 }, { "epoch": 4.056037358238826, "learning_rate": 0.00015388874361296184, "step": 12160 }, { "epoch": 4.056037358238826, "loss": 0.3833063542842865, "step": 12160 }, { "ce_loss": 0.08747068792581558, "epoch": 4.056037358238826, "step": 12160 }, { "distill_loss": 0.14331986010074615, "epoch": 4.056037358238826, "step": 12160 }, { "epoch": 4.056037358238826, "ref_ce_loss": 0.062439221888780594, "step": 12160 }, { "epoch": 4.056037358238826, "loss": 0.6706572771072388, "step": 12160 }, { "ce_loss": 0.17328523099422455, "epoch": 4.056037358238826, "step": 12160 }, { "distill_loss": 0.22888147830963135, "epoch": 4.056037358238826, "step": 12160 }, { "epoch": 4.056037358238826, "ref_ce_loss": 0.0902508795261383, "step": 12160 }, { "epoch": 4.056037358238826, "loss": 0.6042168140411377, "step": 12160 }, { "ce_loss": 0.12251890450716019, "epoch": 4.056037358238826, "step": 12160 }, { "distill_loss": 0.22527821362018585, "epoch": 4.056037358238826, "step": 12160 }, { "epoch": 4.056037358238826, "ref_ce_loss": 0.10940185189247131, "step": 12160 }, { "epoch": 4.056037358238826, "loss": 0.35750412940979004, "step": 12160 }, { "ce_loss": 0.04393932968378067, "epoch": 4.056037358238826, "step": 12160 }, { "distill_loss": 0.13152629137039185, "epoch": 4.056037358238826, "step": 12160 }, { "epoch": 4.056037358238826, "ref_ce_loss": 0.09057697653770447, "step": 12160 }, { "epoch": 4.059372915276851, "loss": 0.4463, "step": 12170 }, { "epoch": 4.059372915276851, "grad_norm": 3.839312791824341, "step": 12170 }, { "epoch": 4.059372915276851, "learning_rate": 0.00015368624678675858, "step": 12170 }, { "epoch": 4.059372915276851, "loss": 0.2831639051437378, "step": 12170 }, { "ce_loss": 0.06703417003154755, "epoch": 4.059372915276851, "step": 12170 }, { "distill_loss": 0.1631172150373459, "epoch": 4.059372915276851, "step": 12170 }, { "epoch": 4.059372915276851, "ref_ce_loss": 0.05295206606388092, "step": 12170 }, { "epoch": 4.059372915276851, "loss": 0.5331304669380188, "step": 12170 }, { "ce_loss": 0.107203908264637, "epoch": 4.059372915276851, "step": 12170 }, { "distill_loss": 0.13781806826591492, "epoch": 4.059372915276851, "step": 12170 }, { "epoch": 4.059372915276851, "ref_ce_loss": 0.07520002871751785, "step": 12170 }, { "epoch": 4.059372915276851, "loss": 0.3221982717514038, "step": 12170 }, { "ce_loss": 0.11160756647586823, "epoch": 4.059372915276851, "step": 12170 }, { "distill_loss": 0.12180545181035995, "epoch": 4.059372915276851, "step": 12170 }, { "epoch": 4.059372915276851, "ref_ce_loss": 0.06871822476387024, "step": 12170 }, { "epoch": 4.059372915276851, "loss": 0.8085801601409912, "step": 12170 }, { "ce_loss": 0.1456156075000763, "epoch": 4.059372915276851, "step": 12170 }, { "distill_loss": 0.19259947538375854, "epoch": 4.059372915276851, "step": 12170 }, { "epoch": 4.059372915276851, "ref_ce_loss": 0.0803808867931366, "step": 12170 }, { "epoch": 4.062708472314877, "loss": 0.4678, "step": 12180 }, { "epoch": 4.062708472314877, "grad_norm": 3.4009456634521484, "step": 12180 }, { "epoch": 4.062708472314877, "learning_rate": 0.0001534837432382953, "step": 12180 }, { "epoch": 4.062708472314877, "loss": 0.4208252727985382, "step": 12180 }, { "ce_loss": 0.0713610127568245, "epoch": 4.062708472314877, "step": 12180 }, { "distill_loss": 0.1717347800731659, "epoch": 4.062708472314877, "step": 12180 }, { "epoch": 4.062708472314877, "ref_ce_loss": 0.06079830601811409, "step": 12180 }, { "epoch": 4.062708472314877, "loss": 0.5087335109710693, "step": 12180 }, { "ce_loss": 0.14097148180007935, "epoch": 4.062708472314877, "step": 12180 }, { "distill_loss": 0.20097658038139343, "epoch": 4.062708472314877, "step": 12180 }, { "epoch": 4.062708472314877, "ref_ce_loss": 0.08624764531850815, "step": 12180 }, { "epoch": 4.062708472314877, "loss": 0.3848879337310791, "step": 12180 }, { "ce_loss": 0.10179894417524338, "epoch": 4.062708472314877, "step": 12180 }, { "distill_loss": 0.19365674257278442, "epoch": 4.062708472314877, "step": 12180 }, { "epoch": 4.062708472314877, "ref_ce_loss": 0.08927271515130997, "step": 12180 }, { "epoch": 4.062708472314877, "loss": 0.3143869936466217, "step": 12180 }, { "ce_loss": 0.05073242262005806, "epoch": 4.062708472314877, "step": 12180 }, { "distill_loss": 0.1419139802455902, "epoch": 4.062708472314877, "step": 12180 }, { "epoch": 4.062708472314877, "ref_ce_loss": 0.06367547810077667, "step": 12180 }, { "epoch": 4.066044029352902, "loss": 0.485, "step": 12190 }, { "epoch": 4.066044029352902, "grad_norm": 3.052934169769287, "step": 12190 }, { "epoch": 4.066044029352902, "learning_rate": 0.00015328123333685855, "step": 12190 }, { "epoch": 4.066044029352902, "loss": 0.43036988377571106, "step": 12190 }, { "ce_loss": 0.11047770082950592, "epoch": 4.066044029352902, "step": 12190 }, { "distill_loss": 0.16661612689495087, "epoch": 4.066044029352902, "step": 12190 }, { "epoch": 4.066044029352902, "ref_ce_loss": 0.07643501460552216, "step": 12190 }, { "epoch": 4.066044029352902, "loss": 0.28986313939094543, "step": 12190 }, { "ce_loss": 0.09734617173671722, "epoch": 4.066044029352902, "step": 12190 }, { "distill_loss": 0.10917098075151443, "epoch": 4.066044029352902, "step": 12190 }, { "epoch": 4.066044029352902, "ref_ce_loss": 0.060155268758535385, "step": 12190 }, { "epoch": 4.066044029352902, "loss": 0.35381343960762024, "step": 12190 }, { "ce_loss": 0.14211811125278473, "epoch": 4.066044029352902, "step": 12190 }, { "distill_loss": 0.11142193526029587, "epoch": 4.066044029352902, "step": 12190 }, { "epoch": 4.066044029352902, "ref_ce_loss": 0.06509396433830261, "step": 12190 }, { "epoch": 4.066044029352902, "loss": 0.5622801780700684, "step": 12190 }, { "ce_loss": 0.12198896706104279, "epoch": 4.066044029352902, "step": 12190 }, { "distill_loss": 0.19638624787330627, "epoch": 4.066044029352902, "step": 12190 }, { "epoch": 4.066044029352902, "ref_ce_loss": 0.09142731130123138, "step": 12190 }, { "epoch": 4.0693795863909275, "loss": 0.4595, "step": 12200 }, { "epoch": 4.0693795863909275, "grad_norm": 3.138794183731079, "step": 12200 }, { "epoch": 4.0693795863909275, "learning_rate": 0.00015307871745174655, "step": 12200 }, { "epoch": 4.0693795863909275, "loss": 0.23779502511024475, "step": 12200 }, { "ce_loss": 0.034942738711833954, "epoch": 4.0693795863909275, "step": 12200 }, { "distill_loss": 0.09184229373931885, "epoch": 4.0693795863909275, "step": 12200 }, { "epoch": 4.0693795863909275, "ref_ce_loss": 0.07803148776292801, "step": 12200 }, { "epoch": 4.0693795863909275, "loss": 0.9437388181686401, "step": 12200 }, { "ce_loss": 0.16672064363956451, "epoch": 4.0693795863909275, "step": 12200 }, { "distill_loss": 0.2688308656215668, "epoch": 4.0693795863909275, "step": 12200 }, { "epoch": 4.0693795863909275, "ref_ce_loss": 0.0933331549167633, "step": 12200 }, { "epoch": 4.0693795863909275, "loss": 0.6048306822776794, "step": 12200 }, { "ce_loss": 0.10321357846260071, "epoch": 4.0693795863909275, "step": 12200 }, { "distill_loss": 0.24434088170528412, "epoch": 4.0693795863909275, "step": 12200 }, { "epoch": 4.0693795863909275, "ref_ce_loss": 0.09095852822065353, "step": 12200 }, { "epoch": 4.0693795863909275, "loss": 0.5055559277534485, "step": 12200 }, { "ce_loss": 0.134865403175354, "epoch": 4.0693795863909275, "step": 12200 }, { "distill_loss": 0.16672514379024506, "epoch": 4.0693795863909275, "step": 12200 }, { "epoch": 4.0693795863909275, "ref_ce_loss": 0.08413047343492508, "step": 12200 }, { "epoch": 4.072715143428953, "loss": 0.4824, "step": 12210 }, { "epoch": 4.072715143428953, "grad_norm": 5.428567409515381, "step": 12210 }, { "epoch": 4.072715143428953, "learning_rate": 0.00015287619595226839, "step": 12210 }, { "epoch": 4.072715143428953, "loss": 0.5914174914360046, "step": 12210 }, { "ce_loss": 0.10178596526384354, "epoch": 4.072715143428953, "step": 12210 }, { "distill_loss": 0.23800191283226013, "epoch": 4.072715143428953, "step": 12210 }, { "epoch": 4.072715143428953, "ref_ce_loss": 0.10165928304195404, "step": 12210 }, { "epoch": 4.072715143428953, "loss": 0.32598963379859924, "step": 12210 }, { "ce_loss": 0.07633938640356064, "epoch": 4.072715143428953, "step": 12210 }, { "distill_loss": 0.1258414089679718, "epoch": 4.072715143428953, "step": 12210 }, { "epoch": 4.072715143428953, "ref_ce_loss": 0.05301518365740776, "step": 12210 }, { "epoch": 4.072715143428953, "loss": 0.36426302790641785, "step": 12210 }, { "ce_loss": 0.11082565039396286, "epoch": 4.072715143428953, "step": 12210 }, { "distill_loss": 0.12851569056510925, "epoch": 4.072715143428953, "step": 12210 }, { "epoch": 4.072715143428953, "ref_ce_loss": 0.10189623385667801, "step": 12210 }, { "epoch": 4.072715143428953, "loss": 0.6586376428604126, "step": 12210 }, { "ce_loss": 0.3017585575580597, "epoch": 4.072715143428953, "step": 12210 }, { "distill_loss": 0.2281065434217453, "epoch": 4.072715143428953, "step": 12210 }, { "epoch": 4.072715143428953, "ref_ce_loss": 0.12855705618858337, "step": 12210 }, { "epoch": 4.076050700466978, "loss": 0.441, "step": 12220 }, { "epoch": 4.076050700466978, "grad_norm": 2.7300357818603516, "step": 12220 }, { "epoch": 4.076050700466978, "learning_rate": 0.00015267366920774337, "step": 12220 }, { "epoch": 4.076050700466978, "loss": 0.5121831297874451, "step": 12220 }, { "ce_loss": 0.14911715686321259, "epoch": 4.076050700466978, "step": 12220 }, { "distill_loss": 0.15121105313301086, "epoch": 4.076050700466978, "step": 12220 }, { "epoch": 4.076050700466978, "ref_ce_loss": 0.08566410839557648, "step": 12220 }, { "epoch": 4.076050700466978, "loss": 0.6446946263313293, "step": 12220 }, { "ce_loss": 0.08306907117366791, "epoch": 4.076050700466978, "step": 12220 }, { "distill_loss": 0.17455963790416718, "epoch": 4.076050700466978, "step": 12220 }, { "epoch": 4.076050700466978, "ref_ce_loss": 0.10251959413290024, "step": 12220 }, { "epoch": 4.076050700466978, "loss": 0.3556770086288452, "step": 12220 }, { "ce_loss": 0.10885165631771088, "epoch": 4.076050700466978, "step": 12220 }, { "distill_loss": 0.15706342458724976, "epoch": 4.076050700466978, "step": 12220 }, { "epoch": 4.076050700466978, "ref_ce_loss": 0.08962170779705048, "step": 12220 }, { "epoch": 4.076050700466978, "loss": 0.5599108338356018, "step": 12220 }, { "ce_loss": 0.15744344890117645, "epoch": 4.076050700466978, "step": 12220 }, { "distill_loss": 0.21136343479156494, "epoch": 4.076050700466978, "step": 12220 }, { "epoch": 4.076050700466978, "ref_ce_loss": 0.13838784396648407, "step": 12220 }, { "epoch": 4.0793862575050035, "loss": 0.4326, "step": 12230 }, { "epoch": 4.0793862575050035, "grad_norm": 2.221538782119751, "step": 12230 }, { "epoch": 4.0793862575050035, "learning_rate": 0.0001524711375875004, "step": 12230 }, { "epoch": 4.0793862575050035, "loss": 0.5925437211990356, "step": 12230 }, { "ce_loss": 0.16379602253437042, "epoch": 4.0793862575050035, "step": 12230 }, { "distill_loss": 0.1683337390422821, "epoch": 4.0793862575050035, "step": 12230 }, { "epoch": 4.0793862575050035, "ref_ce_loss": 0.1362374871969223, "step": 12230 }, { "epoch": 4.0793862575050035, "loss": 0.3094097077846527, "step": 12230 }, { "ce_loss": 0.07063864916563034, "epoch": 4.0793862575050035, "step": 12230 }, { "distill_loss": 0.09304803609848022, "epoch": 4.0793862575050035, "step": 12230 }, { "epoch": 4.0793862575050035, "ref_ce_loss": 0.09349749237298965, "step": 12230 }, { "epoch": 4.0793862575050035, "loss": 0.35739865899086, "step": 12230 }, { "ce_loss": 0.07744188606739044, "epoch": 4.0793862575050035, "step": 12230 }, { "distill_loss": 0.15637515485286713, "epoch": 4.0793862575050035, "step": 12230 }, { "epoch": 4.0793862575050035, "ref_ce_loss": 0.08380445837974548, "step": 12230 }, { "epoch": 4.0793862575050035, "loss": 0.30349835753440857, "step": 12230 }, { "ce_loss": 0.06610994786024094, "epoch": 4.0793862575050035, "step": 12230 }, { "distill_loss": 0.13098305463790894, "epoch": 4.0793862575050035, "step": 12230 }, { "epoch": 4.0793862575050035, "ref_ce_loss": 0.07835172116756439, "step": 12230 }, { "epoch": 4.082721814543029, "loss": 0.4087, "step": 12240 }, { "epoch": 4.082721814543029, "grad_norm": 3.177318572998047, "step": 12240 }, { "epoch": 4.082721814543029, "learning_rate": 0.00015226860146087725, "step": 12240 }, { "epoch": 4.082721814543029, "loss": 0.4792167842388153, "step": 12240 }, { "ce_loss": 0.12271936982870102, "epoch": 4.082721814543029, "step": 12240 }, { "distill_loss": 0.26747363805770874, "epoch": 4.082721814543029, "step": 12240 }, { "epoch": 4.082721814543029, "ref_ce_loss": 0.08874979615211487, "step": 12240 }, { "epoch": 4.082721814543029, "loss": 0.3786783218383789, "step": 12240 }, { "ce_loss": 0.10062919557094574, "epoch": 4.082721814543029, "step": 12240 }, { "distill_loss": 0.11116902530193329, "epoch": 4.082721814543029, "step": 12240 }, { "epoch": 4.082721814543029, "ref_ce_loss": 0.13712024688720703, "step": 12240 }, { "epoch": 4.082721814543029, "loss": 0.6380170583724976, "step": 12240 }, { "ce_loss": 0.08112715184688568, "epoch": 4.082721814543029, "step": 12240 }, { "distill_loss": 0.0964871421456337, "epoch": 4.082721814543029, "step": 12240 }, { "epoch": 4.082721814543029, "ref_ce_loss": 0.041808392852544785, "step": 12240 }, { "epoch": 4.082721814543029, "loss": 0.2703827917575836, "step": 12240 }, { "ce_loss": 0.029660837724804878, "epoch": 4.082721814543029, "step": 12240 }, { "distill_loss": 0.0692979022860527, "epoch": 4.082721814543029, "step": 12240 }, { "epoch": 4.082721814543029, "ref_ce_loss": 0.07503220438957214, "step": 12240 }, { "epoch": 4.086057371581054, "loss": 0.4596, "step": 12250 }, { "epoch": 4.086057371581054, "grad_norm": 3.3609211444854736, "step": 12250 }, { "epoch": 4.086057371581054, "learning_rate": 0.00015206606119721986, "step": 12250 }, { "epoch": 4.086057371581054, "loss": 0.29551267623901367, "step": 12250 }, { "ce_loss": 0.04871445521712303, "epoch": 4.086057371581054, "step": 12250 }, { "distill_loss": 0.15821117162704468, "epoch": 4.086057371581054, "step": 12250 }, { "epoch": 4.086057371581054, "ref_ce_loss": 0.08842559158802032, "step": 12250 }, { "epoch": 4.086057371581054, "loss": 0.38177791237831116, "step": 12250 }, { "ce_loss": 0.07803545147180557, "epoch": 4.086057371581054, "step": 12250 }, { "distill_loss": 0.14289258420467377, "epoch": 4.086057371581054, "step": 12250 }, { "epoch": 4.086057371581054, "ref_ce_loss": 0.10587479919195175, "step": 12250 }, { "epoch": 4.086057371581054, "loss": 0.28975555300712585, "step": 12250 }, { "ce_loss": 0.054523248225450516, "epoch": 4.086057371581054, "step": 12250 }, { "distill_loss": 0.14741668105125427, "epoch": 4.086057371581054, "step": 12250 }, { "epoch": 4.086057371581054, "ref_ce_loss": 0.08770716190338135, "step": 12250 }, { "epoch": 4.086057371581054, "loss": 0.4934762120246887, "step": 12250 }, { "ce_loss": 0.1420019418001175, "epoch": 4.086057371581054, "step": 12250 }, { "distill_loss": 0.2047957330942154, "epoch": 4.086057371581054, "step": 12250 }, { "epoch": 4.086057371581054, "ref_ce_loss": 0.10681845992803574, "step": 12250 }, { "epoch": 4.0893929286190795, "loss": 0.4271, "step": 12260 }, { "epoch": 4.0893929286190795, "grad_norm": 3.676234245300293, "step": 12260 }, { "epoch": 4.0893929286190795, "learning_rate": 0.00015186351716588192, "step": 12260 }, { "epoch": 4.0893929286190795, "loss": 0.5635673403739929, "step": 12260 }, { "ce_loss": 0.10926533490419388, "epoch": 4.0893929286190795, "step": 12260 }, { "distill_loss": 0.18934011459350586, "epoch": 4.0893929286190795, "step": 12260 }, { "epoch": 4.0893929286190795, "ref_ce_loss": 0.06814757734537125, "step": 12260 }, { "epoch": 4.0893929286190795, "loss": 0.6337475776672363, "step": 12260 }, { "ce_loss": 0.08871379494667053, "epoch": 4.0893929286190795, "step": 12260 }, { "distill_loss": 0.15814882516860962, "epoch": 4.0893929286190795, "step": 12260 }, { "epoch": 4.0893929286190795, "ref_ce_loss": 0.1226770430803299, "step": 12260 }, { "epoch": 4.0893929286190795, "loss": 0.29514455795288086, "step": 12260 }, { "ce_loss": 0.06737024337053299, "epoch": 4.0893929286190795, "step": 12260 }, { "distill_loss": 0.1109817624092102, "epoch": 4.0893929286190795, "step": 12260 }, { "epoch": 4.0893929286190795, "ref_ce_loss": 0.0816814973950386, "step": 12260 }, { "epoch": 4.0893929286190795, "loss": 0.3731231093406677, "step": 12260 }, { "ce_loss": 0.1104077622294426, "epoch": 4.0893929286190795, "step": 12260 }, { "distill_loss": 0.15331417322158813, "epoch": 4.0893929286190795, "step": 12260 }, { "epoch": 4.0893929286190795, "ref_ce_loss": 0.0772353783249855, "step": 12260 }, { "epoch": 4.092728485657105, "loss": 0.4239, "step": 12270 }, { "epoch": 4.092728485657105, "grad_norm": 2.3397421836853027, "step": 12270 }, { "epoch": 4.092728485657105, "learning_rate": 0.00015166096973622377, "step": 12270 }, { "epoch": 4.092728485657105, "loss": 0.35443541407585144, "step": 12270 }, { "ce_loss": 0.045290570706129074, "epoch": 4.092728485657105, "step": 12270 }, { "distill_loss": 0.10662087053060532, "epoch": 4.092728485657105, "step": 12270 }, { "epoch": 4.092728485657105, "ref_ce_loss": 0.06537654995918274, "step": 12270 }, { "epoch": 4.092728485657105, "loss": 0.3716265857219696, "step": 12270 }, { "ce_loss": 0.05092225968837738, "epoch": 4.092728485657105, "step": 12270 }, { "distill_loss": 0.11948276311159134, "epoch": 4.092728485657105, "step": 12270 }, { "epoch": 4.092728485657105, "ref_ce_loss": 0.09160125255584717, "step": 12270 }, { "epoch": 4.092728485657105, "loss": 0.22582557797431946, "step": 12270 }, { "ce_loss": 0.05884668603539467, "epoch": 4.092728485657105, "step": 12270 }, { "distill_loss": 0.11232615262269974, "epoch": 4.092728485657105, "step": 12270 }, { "epoch": 4.092728485657105, "ref_ce_loss": 0.05450451374053955, "step": 12270 }, { "epoch": 4.092728485657105, "loss": 0.3369407653808594, "step": 12270 }, { "ce_loss": 0.07732923328876495, "epoch": 4.092728485657105, "step": 12270 }, { "distill_loss": 0.11193235218524933, "epoch": 4.092728485657105, "step": 12270 }, { "epoch": 4.092728485657105, "ref_ce_loss": 0.0715961679816246, "step": 12270 }, { "epoch": 4.09606404269513, "loss": 0.4267, "step": 12280 }, { "epoch": 4.09606404269513, "grad_norm": 3.901005506515503, "step": 12280 }, { "epoch": 4.09606404269513, "learning_rate": 0.00015145841927761196, "step": 12280 }, { "epoch": 4.09606404269513, "loss": 0.33873653411865234, "step": 12280 }, { "ce_loss": 0.12671172618865967, "epoch": 4.09606404269513, "step": 12280 }, { "distill_loss": 0.12484362721443176, "epoch": 4.09606404269513, "step": 12280 }, { "epoch": 4.09606404269513, "ref_ce_loss": 0.06715315580368042, "step": 12280 }, { "epoch": 4.09606404269513, "loss": 0.33130505681037903, "step": 12280 }, { "ce_loss": 0.061360765248537064, "epoch": 4.09606404269513, "step": 12280 }, { "distill_loss": 0.11928476393222809, "epoch": 4.09606404269513, "step": 12280 }, { "epoch": 4.09606404269513, "ref_ce_loss": 0.08986619114875793, "step": 12280 }, { "epoch": 4.09606404269513, "loss": 0.2806569039821625, "step": 12280 }, { "ce_loss": 0.0503183975815773, "epoch": 4.09606404269513, "step": 12280 }, { "distill_loss": 0.148472860455513, "epoch": 4.09606404269513, "step": 12280 }, { "epoch": 4.09606404269513, "ref_ce_loss": 0.08172249048948288, "step": 12280 }, { "epoch": 4.09606404269513, "loss": 0.39750438928604126, "step": 12280 }, { "ce_loss": 0.12096478044986725, "epoch": 4.09606404269513, "step": 12280 }, { "distill_loss": 0.13770119845867157, "epoch": 4.09606404269513, "step": 12280 }, { "epoch": 4.09606404269513, "ref_ce_loss": 0.08861135691404343, "step": 12280 }, { "epoch": 4.099399599733156, "loss": 0.4581, "step": 12290 }, { "epoch": 4.099399599733156, "grad_norm": 3.1234652996063232, "step": 12290 }, { "epoch": 4.099399599733156, "learning_rate": 0.00015125586615941873, "step": 12290 }, { "epoch": 4.099399599733156, "loss": 0.37716343998908997, "step": 12290 }, { "ce_loss": 0.10681670159101486, "epoch": 4.099399599733156, "step": 12290 }, { "distill_loss": 0.1452990025281906, "epoch": 4.099399599733156, "step": 12290 }, { "epoch": 4.099399599733156, "ref_ce_loss": 0.10262039303779602, "step": 12290 }, { "epoch": 4.099399599733156, "loss": 0.5069200992584229, "step": 12290 }, { "ce_loss": 0.11011409014463425, "epoch": 4.099399599733156, "step": 12290 }, { "distill_loss": 0.15488030016422272, "epoch": 4.099399599733156, "step": 12290 }, { "epoch": 4.099399599733156, "ref_ce_loss": 0.08841795474290848, "step": 12290 }, { "epoch": 4.099399599733156, "loss": 0.3167344927787781, "step": 12290 }, { "ce_loss": 0.05991573631763458, "epoch": 4.099399599733156, "step": 12290 }, { "distill_loss": 0.136915922164917, "epoch": 4.099399599733156, "step": 12290 }, { "epoch": 4.099399599733156, "ref_ce_loss": 0.07452834397554398, "step": 12290 }, { "epoch": 4.099399599733156, "loss": 0.3132794499397278, "step": 12290 }, { "ce_loss": 0.08132331073284149, "epoch": 4.099399599733156, "step": 12290 }, { "distill_loss": 0.09339018166065216, "epoch": 4.099399599733156, "step": 12290 }, { "epoch": 4.099399599733156, "ref_ce_loss": 0.09748751670122147, "step": 12290 }, { "epoch": 4.102735156771181, "loss": 0.4027, "step": 12300 }, { "epoch": 4.102735156771181, "grad_norm": 2.9308016300201416, "step": 12300 }, { "epoch": 4.102735156771181, "learning_rate": 0.00015105331075102103, "step": 12300 }, { "epoch": 4.102735156771181, "loss": 0.375118613243103, "step": 12300 }, { "ce_loss": 0.10460587590932846, "epoch": 4.102735156771181, "step": 12300 }, { "distill_loss": 0.14734624326229095, "epoch": 4.102735156771181, "step": 12300 }, { "epoch": 4.102735156771181, "ref_ce_loss": 0.12303036451339722, "step": 12300 }, { "epoch": 4.102735156771181, "loss": 0.2725878655910492, "step": 12300 }, { "ce_loss": 0.08057550340890884, "epoch": 4.102735156771181, "step": 12300 }, { "distill_loss": 0.1144259050488472, "epoch": 4.102735156771181, "step": 12300 }, { "epoch": 4.102735156771181, "ref_ce_loss": 0.07711077481508255, "step": 12300 }, { "epoch": 4.102735156771181, "loss": 0.2181396335363388, "step": 12300 }, { "ce_loss": 0.05747740715742111, "epoch": 4.102735156771181, "step": 12300 }, { "distill_loss": 0.09495794028043747, "epoch": 4.102735156771181, "step": 12300 }, { "epoch": 4.102735156771181, "ref_ce_loss": 0.06535113602876663, "step": 12300 }, { "epoch": 4.102735156771181, "loss": 0.30614638328552246, "step": 12300 }, { "ce_loss": 0.13755322992801666, "epoch": 4.102735156771181, "step": 12300 }, { "distill_loss": 0.09745433181524277, "epoch": 4.102735156771181, "step": 12300 }, { "epoch": 4.102735156771181, "ref_ce_loss": 0.0710759088397026, "step": 12300 }, { "epoch": 4.106070713809206, "loss": 0.379, "step": 12310 }, { "epoch": 4.106070713809206, "grad_norm": 9.995908737182617, "step": 12310 }, { "epoch": 4.106070713809206, "learning_rate": 0.0001508507534218, "step": 12310 }, { "epoch": 4.106070713809206, "loss": 0.28247153759002686, "step": 12310 }, { "ce_loss": 0.06919530034065247, "epoch": 4.106070713809206, "step": 12310 }, { "distill_loss": 0.1077154353260994, "epoch": 4.106070713809206, "step": 12310 }, { "epoch": 4.106070713809206, "ref_ce_loss": 0.06293756514787674, "step": 12310 }, { "epoch": 4.106070713809206, "loss": 0.3820110559463501, "step": 12310 }, { "ce_loss": 0.08053288608789444, "epoch": 4.106070713809206, "step": 12310 }, { "distill_loss": 0.10802694410085678, "epoch": 4.106070713809206, "step": 12310 }, { "epoch": 4.106070713809206, "ref_ce_loss": 0.0862024575471878, "step": 12310 }, { "epoch": 4.106070713809206, "loss": 0.6087753772735596, "step": 12310 }, { "ce_loss": 0.117802694439888, "epoch": 4.106070713809206, "step": 12310 }, { "distill_loss": 0.12614843249320984, "epoch": 4.106070713809206, "step": 12310 }, { "epoch": 4.106070713809206, "ref_ce_loss": 0.08279848843812943, "step": 12310 }, { "epoch": 4.106070713809206, "loss": 0.4322092533111572, "step": 12310 }, { "ce_loss": 0.13464225828647614, "epoch": 4.106070713809206, "step": 12310 }, { "distill_loss": 0.11292240023612976, "epoch": 4.106070713809206, "step": 12310 }, { "epoch": 4.106070713809206, "ref_ce_loss": 0.08117159456014633, "step": 12310 }, { "epoch": 4.109406270847232, "loss": 0.4053, "step": 12320 }, { "epoch": 4.109406270847232, "grad_norm": 3.473320245742798, "step": 12320 }, { "epoch": 4.109406270847232, "learning_rate": 0.00015064819454114033, "step": 12320 }, { "epoch": 4.109406270847232, "loss": 0.373969167470932, "step": 12320 }, { "ce_loss": 0.1063445508480072, "epoch": 4.109406270847232, "step": 12320 }, { "distill_loss": 0.15159161388874054, "epoch": 4.109406270847232, "step": 12320 }, { "epoch": 4.109406270847232, "ref_ce_loss": 0.05470012128353119, "step": 12320 }, { "epoch": 4.109406270847232, "loss": 0.4092685580253601, "step": 12320 }, { "ce_loss": 0.16364896297454834, "epoch": 4.109406270847232, "step": 12320 }, { "distill_loss": 0.1414794623851776, "epoch": 4.109406270847232, "step": 12320 }, { "epoch": 4.109406270847232, "ref_ce_loss": 0.10386661440134048, "step": 12320 }, { "epoch": 4.109406270847232, "loss": 0.2099723517894745, "step": 12320 }, { "ce_loss": 0.024279853329062462, "epoch": 4.109406270847232, "step": 12320 }, { "distill_loss": 0.07701626420021057, "epoch": 4.109406270847232, "step": 12320 }, { "epoch": 4.109406270847232, "ref_ce_loss": 0.05946137383580208, "step": 12320 }, { "epoch": 4.109406270847232, "loss": 0.6005130410194397, "step": 12320 }, { "ce_loss": 0.13691778481006622, "epoch": 4.109406270847232, "step": 12320 }, { "distill_loss": 0.10761863738298416, "epoch": 4.109406270847232, "step": 12320 }, { "epoch": 4.109406270847232, "ref_ce_loss": 0.17544463276863098, "step": 12320 }, { "epoch": 4.112741827885257, "loss": 0.4127, "step": 12330 }, { "epoch": 4.112741827885257, "grad_norm": 1.8041068315505981, "step": 12330 }, { "epoch": 4.112741827885257, "learning_rate": 0.0001504456344784295, "step": 12330 }, { "epoch": 4.112741827885257, "loss": 0.44366419315338135, "step": 12330 }, { "ce_loss": 0.16683217883110046, "epoch": 4.112741827885257, "step": 12330 }, { "distill_loss": 0.13771888613700867, "epoch": 4.112741827885257, "step": 12330 }, { "epoch": 4.112741827885257, "ref_ce_loss": 0.09939558058977127, "step": 12330 }, { "epoch": 4.112741827885257, "loss": 0.6878728270530701, "step": 12330 }, { "ce_loss": 0.0950092151761055, "epoch": 4.112741827885257, "step": 12330 }, { "distill_loss": 0.1335957795381546, "epoch": 4.112741827885257, "step": 12330 }, { "epoch": 4.112741827885257, "ref_ce_loss": 0.09792289137840271, "step": 12330 }, { "epoch": 4.112741827885257, "loss": 0.28441381454467773, "step": 12330 }, { "ce_loss": 0.1099325641989708, "epoch": 4.112741827885257, "step": 12330 }, { "distill_loss": 0.10999204218387604, "epoch": 4.112741827885257, "step": 12330 }, { "epoch": 4.112741827885257, "ref_ce_loss": 0.06433827430009842, "step": 12330 }, { "epoch": 4.112741827885257, "loss": 0.29116740822792053, "step": 12330 }, { "ce_loss": 0.0614168606698513, "epoch": 4.112741827885257, "step": 12330 }, { "distill_loss": 0.11611483991146088, "epoch": 4.112741827885257, "step": 12330 }, { "epoch": 4.112741827885257, "ref_ce_loss": 0.05571691691875458, "step": 12330 }, { "epoch": 4.116077384923282, "loss": 0.4103, "step": 12340 }, { "epoch": 4.116077384923282, "grad_norm": 3.1782829761505127, "step": 12340 }, { "epoch": 4.116077384923282, "learning_rate": 0.00015024307360305715, "step": 12340 }, { "epoch": 4.116077384923282, "loss": 0.4817643165588379, "step": 12340 }, { "ce_loss": 0.1433020681142807, "epoch": 4.116077384923282, "step": 12340 }, { "distill_loss": 0.15565498173236847, "epoch": 4.116077384923282, "step": 12340 }, { "epoch": 4.116077384923282, "ref_ce_loss": 0.08871158957481384, "step": 12340 }, { "epoch": 4.116077384923282, "loss": 0.4128016233444214, "step": 12340 }, { "ce_loss": 0.17788727581501007, "epoch": 4.116077384923282, "step": 12340 }, { "distill_loss": 0.13444003462791443, "epoch": 4.116077384923282, "step": 12340 }, { "epoch": 4.116077384923282, "ref_ce_loss": 0.10033713281154633, "step": 12340 }, { "epoch": 4.116077384923282, "loss": 0.31887951493263245, "step": 12340 }, { "ce_loss": 0.03353816643357277, "epoch": 4.116077384923282, "step": 12340 }, { "distill_loss": 0.13450832664966583, "epoch": 4.116077384923282, "step": 12340 }, { "epoch": 4.116077384923282, "ref_ce_loss": 0.10326790064573288, "step": 12340 }, { "epoch": 4.116077384923282, "loss": 0.248275026679039, "step": 12340 }, { "ce_loss": 0.07996595650911331, "epoch": 4.116077384923282, "step": 12340 }, { "distill_loss": 0.09831559658050537, "epoch": 4.116077384923282, "step": 12340 }, { "epoch": 4.116077384923282, "ref_ce_loss": 0.06948510557413101, "step": 12340 }, { "epoch": 4.119412941961308, "loss": 0.4763, "step": 12350 }, { "epoch": 4.119412941961308, "grad_norm": 1.4909628629684448, "step": 12350 }, { "epoch": 4.119412941961308, "learning_rate": 0.0001500405122844145, "step": 12350 }, { "epoch": 4.119412941961308, "loss": 0.22745607793331146, "step": 12350 }, { "ce_loss": 0.062056880444288254, "epoch": 4.119412941961308, "step": 12350 }, { "distill_loss": 0.09504853934049606, "epoch": 4.119412941961308, "step": 12350 }, { "epoch": 4.119412941961308, "ref_ce_loss": 0.049994029104709625, "step": 12350 }, { "epoch": 4.119412941961308, "loss": 0.44903331995010376, "step": 12350 }, { "ce_loss": 0.04053228721022606, "epoch": 4.119412941961308, "step": 12350 }, { "distill_loss": 0.12767958641052246, "epoch": 4.119412941961308, "step": 12350 }, { "epoch": 4.119412941961308, "ref_ce_loss": 0.06790989637374878, "step": 12350 }, { "epoch": 4.119412941961308, "loss": 0.4405551552772522, "step": 12350 }, { "ce_loss": 0.17078498005867004, "epoch": 4.119412941961308, "step": 12350 }, { "distill_loss": 0.16165587306022644, "epoch": 4.119412941961308, "step": 12350 }, { "epoch": 4.119412941961308, "ref_ce_loss": 0.10795851796865463, "step": 12350 }, { "epoch": 4.119412941961308, "loss": 0.23271147906780243, "step": 12350 }, { "ce_loss": 0.07316546142101288, "epoch": 4.119412941961308, "step": 12350 }, { "distill_loss": 0.11298181116580963, "epoch": 4.119412941961308, "step": 12350 }, { "epoch": 4.119412941961308, "ref_ce_loss": 0.04643390700221062, "step": 12350 }, { "epoch": 4.122748498999333, "loss": 0.4043, "step": 12360 }, { "epoch": 4.122748498999333, "grad_norm": 2.5978572368621826, "step": 12360 }, { "epoch": 4.122748498999333, "learning_rate": 0.00014983795089189335, "step": 12360 }, { "epoch": 4.122748498999333, "loss": 0.33180752396583557, "step": 12360 }, { "ce_loss": 0.06547296792268753, "epoch": 4.122748498999333, "step": 12360 }, { "distill_loss": 0.12039823830127716, "epoch": 4.122748498999333, "step": 12360 }, { "epoch": 4.122748498999333, "ref_ce_loss": 0.10022406280040741, "step": 12360 }, { "epoch": 4.122748498999333, "loss": 0.5192918181419373, "step": 12360 }, { "ce_loss": 0.12186074256896973, "epoch": 4.122748498999333, "step": 12360 }, { "distill_loss": 0.18573838472366333, "epoch": 4.122748498999333, "step": 12360 }, { "epoch": 4.122748498999333, "ref_ce_loss": 0.11628958582878113, "step": 12360 }, { "epoch": 4.122748498999333, "loss": 0.38019275665283203, "step": 12360 }, { "ce_loss": 0.1042938232421875, "epoch": 4.122748498999333, "step": 12360 }, { "distill_loss": 0.13982954621315002, "epoch": 4.122748498999333, "step": 12360 }, { "epoch": 4.122748498999333, "ref_ce_loss": 0.09626378118991852, "step": 12360 }, { "epoch": 4.122748498999333, "loss": 0.49562186002731323, "step": 12360 }, { "ce_loss": 0.1083085834980011, "epoch": 4.122748498999333, "step": 12360 }, { "distill_loss": 0.09659755975008011, "epoch": 4.122748498999333, "step": 12360 }, { "epoch": 4.122748498999333, "ref_ce_loss": 0.09821917116641998, "step": 12360 }, { "epoch": 4.126084056037358, "loss": 0.464, "step": 12370 }, { "epoch": 4.126084056037358, "grad_norm": 2.7247259616851807, "step": 12370 }, { "epoch": 4.126084056037358, "learning_rate": 0.0001496353897948859, "step": 12370 }, { "epoch": 4.126084056037358, "loss": 0.29314735531806946, "step": 12370 }, { "ce_loss": 0.07004120200872421, "epoch": 4.126084056037358, "step": 12370 }, { "distill_loss": 0.1270415484905243, "epoch": 4.126084056037358, "step": 12370 }, { "epoch": 4.126084056037358, "ref_ce_loss": 0.06225429102778435, "step": 12370 }, { "epoch": 4.126084056037358, "loss": 0.23097294569015503, "step": 12370 }, { "ce_loss": 0.04366536810994148, "epoch": 4.126084056037358, "step": 12370 }, { "distill_loss": 0.1317523717880249, "epoch": 4.126084056037358, "step": 12370 }, { "epoch": 4.126084056037358, "ref_ce_loss": 0.055458322167396545, "step": 12370 }, { "epoch": 4.126084056037358, "loss": 0.5454058647155762, "step": 12370 }, { "ce_loss": 0.18543444573879242, "epoch": 4.126084056037358, "step": 12370 }, { "distill_loss": 0.21535417437553406, "epoch": 4.126084056037358, "step": 12370 }, { "epoch": 4.126084056037358, "ref_ce_loss": 0.09900393337011337, "step": 12370 }, { "epoch": 4.126084056037358, "loss": 0.40011027455329895, "step": 12370 }, { "ce_loss": 0.06823138147592545, "epoch": 4.126084056037358, "step": 12370 }, { "distill_loss": 0.1889926940202713, "epoch": 4.126084056037358, "step": 12370 }, { "epoch": 4.126084056037358, "ref_ce_loss": 0.10016527026891708, "step": 12370 }, { "epoch": 4.129419613075384, "loss": 0.4531, "step": 12380 }, { "epoch": 4.129419613075384, "grad_norm": 4.133249759674072, "step": 12380 }, { "epoch": 4.129419613075384, "learning_rate": 0.00014943282936278365, "step": 12380 }, { "epoch": 4.129419613075384, "loss": 0.3787361979484558, "step": 12380 }, { "ce_loss": 0.14036217331886292, "epoch": 4.129419613075384, "step": 12380 }, { "distill_loss": 0.11300624161958694, "epoch": 4.129419613075384, "step": 12380 }, { "epoch": 4.129419613075384, "ref_ce_loss": 0.09571686387062073, "step": 12380 }, { "epoch": 4.129419613075384, "loss": 0.41098782420158386, "step": 12380 }, { "ce_loss": 0.0697634220123291, "epoch": 4.129419613075384, "step": 12380 }, { "distill_loss": 0.21002747118473053, "epoch": 4.129419613075384, "step": 12380 }, { "epoch": 4.129419613075384, "ref_ce_loss": 0.054851651191711426, "step": 12380 }, { "epoch": 4.129419613075384, "loss": 0.3206042945384979, "step": 12380 }, { "ce_loss": 0.09390709549188614, "epoch": 4.129419613075384, "step": 12380 }, { "distill_loss": 0.14140871167182922, "epoch": 4.129419613075384, "step": 12380 }, { "epoch": 4.129419613075384, "ref_ce_loss": 0.08524361997842789, "step": 12380 }, { "epoch": 4.129419613075384, "loss": 0.3636821508407593, "step": 12380 }, { "ce_loss": 0.09315887093544006, "epoch": 4.129419613075384, "step": 12380 }, { "distill_loss": 0.1522243171930313, "epoch": 4.129419613075384, "step": 12380 }, { "epoch": 4.129419613075384, "ref_ce_loss": 0.0804595798254013, "step": 12380 }, { "epoch": 4.132755170113409, "loss": 0.473, "step": 12390 }, { "epoch": 4.132755170113409, "grad_norm": 3.5067243576049805, "step": 12390 }, { "epoch": 4.132755170113409, "learning_rate": 0.00014923026996497684, "step": 12390 }, { "epoch": 4.132755170113409, "loss": 0.5671949982643127, "step": 12390 }, { "ce_loss": 0.1705818623304367, "epoch": 4.132755170113409, "step": 12390 }, { "distill_loss": 0.25043511390686035, "epoch": 4.132755170113409, "step": 12390 }, { "epoch": 4.132755170113409, "ref_ce_loss": 0.10165385156869888, "step": 12390 }, { "epoch": 4.132755170113409, "loss": 0.5505607724189758, "step": 12390 }, { "ce_loss": 0.15338660776615143, "epoch": 4.132755170113409, "step": 12390 }, { "distill_loss": 0.25098103284835815, "epoch": 4.132755170113409, "step": 12390 }, { "epoch": 4.132755170113409, "ref_ce_loss": 0.07809601724147797, "step": 12390 }, { "epoch": 4.132755170113409, "loss": 0.2750885784626007, "step": 12390 }, { "ce_loss": 0.04651748389005661, "epoch": 4.132755170113409, "step": 12390 }, { "distill_loss": 0.13873888552188873, "epoch": 4.132755170113409, "step": 12390 }, { "epoch": 4.132755170113409, "ref_ce_loss": 0.04965699091553688, "step": 12390 }, { "epoch": 4.132755170113409, "loss": 0.46758201718330383, "step": 12390 }, { "ce_loss": 0.09850476682186127, "epoch": 4.132755170113409, "step": 12390 }, { "distill_loss": 0.16520605981349945, "epoch": 4.132755170113409, "step": 12390 }, { "epoch": 4.132755170113409, "ref_ce_loss": 0.13813252747058868, "step": 12390 }, { "epoch": 4.136090727151434, "loss": 0.43, "step": 12400 }, { "epoch": 4.136090727151434, "grad_norm": 2.588744640350342, "step": 12400 }, { "epoch": 4.136090727151434, "learning_rate": 0.00014902771197085403, "step": 12400 }, { "epoch": 4.136090727151434, "loss": 0.43794015049934387, "step": 12400 }, { "ce_loss": 0.0985134094953537, "epoch": 4.136090727151434, "step": 12400 }, { "distill_loss": 0.1885952353477478, "epoch": 4.136090727151434, "step": 12400 }, { "epoch": 4.136090727151434, "ref_ce_loss": 0.1172669306397438, "step": 12400 }, { "epoch": 4.136090727151434, "loss": 0.3357079029083252, "step": 12400 }, { "ce_loss": 0.08318466693162918, "epoch": 4.136090727151434, "step": 12400 }, { "distill_loss": 0.12697243690490723, "epoch": 4.136090727151434, "step": 12400 }, { "epoch": 4.136090727151434, "ref_ce_loss": 0.0950666069984436, "step": 12400 }, { "epoch": 4.136090727151434, "loss": 0.2388584166765213, "step": 12400 }, { "ce_loss": 0.04853922128677368, "epoch": 4.136090727151434, "step": 12400 }, { "distill_loss": 0.10496456176042557, "epoch": 4.136090727151434, "step": 12400 }, { "epoch": 4.136090727151434, "ref_ce_loss": 0.06415881961584091, "step": 12400 }, { "epoch": 4.136090727151434, "loss": 0.31081292033195496, "step": 12400 }, { "ce_loss": 0.10069255530834198, "epoch": 4.136090727151434, "step": 12400 }, { "distill_loss": 0.1103476732969284, "epoch": 4.136090727151434, "step": 12400 }, { "epoch": 4.136090727151434, "ref_ce_loss": 0.07207563519477844, "step": 12400 }, { "epoch": 4.13942628418946, "loss": 0.4173, "step": 12410 }, { "epoch": 4.13942628418946, "grad_norm": 2.5426387786865234, "step": 12410 }, { "epoch": 4.13942628418946, "learning_rate": 0.00014882515574980108, "step": 12410 }, { "epoch": 4.13942628418946, "loss": 0.3717375099658966, "step": 12410 }, { "ce_loss": 0.06511876732110977, "epoch": 4.13942628418946, "step": 12410 }, { "distill_loss": 0.1163686066865921, "epoch": 4.13942628418946, "step": 12410 }, { "epoch": 4.13942628418946, "ref_ce_loss": 0.06961381435394287, "step": 12410 }, { "epoch": 4.13942628418946, "loss": 0.25051334500312805, "step": 12410 }, { "ce_loss": 0.05856660380959511, "epoch": 4.13942628418946, "step": 12410 }, { "distill_loss": 0.08555157482624054, "epoch": 4.13942628418946, "step": 12410 }, { "epoch": 4.13942628418946, "ref_ce_loss": 0.06662749499082565, "step": 12410 }, { "epoch": 4.13942628418946, "loss": 0.3159472644329071, "step": 12410 }, { "ce_loss": 0.026417195796966553, "epoch": 4.13942628418946, "step": 12410 }, { "distill_loss": 0.09527157992124557, "epoch": 4.13942628418946, "step": 12410 }, { "epoch": 4.13942628418946, "ref_ce_loss": 0.09266111254692078, "step": 12410 }, { "epoch": 4.13942628418946, "loss": 0.5626707673072815, "step": 12410 }, { "ce_loss": 0.14858536422252655, "epoch": 4.13942628418946, "step": 12410 }, { "distill_loss": 0.16457600891590118, "epoch": 4.13942628418946, "step": 12410 }, { "epoch": 4.13942628418946, "ref_ce_loss": 0.09589312970638275, "step": 12410 }, { "epoch": 4.142761841227485, "loss": 0.4934, "step": 12420 }, { "epoch": 4.142761841227485, "grad_norm": 3.4323041439056396, "step": 12420 }, { "epoch": 4.142761841227485, "learning_rate": 0.00014862260167120052, "step": 12420 }, { "epoch": 4.142761841227485, "loss": 0.3895532488822937, "step": 12420 }, { "ce_loss": 0.11411762237548828, "epoch": 4.142761841227485, "step": 12420 }, { "distill_loss": 0.18560034036636353, "epoch": 4.142761841227485, "step": 12420 }, { "epoch": 4.142761841227485, "ref_ce_loss": 0.08894186466932297, "step": 12420 }, { "epoch": 4.142761841227485, "loss": 0.5213585495948792, "step": 12420 }, { "ce_loss": 0.06978300213813782, "epoch": 4.142761841227485, "step": 12420 }, { "distill_loss": 0.2566857933998108, "epoch": 4.142761841227485, "step": 12420 }, { "epoch": 4.142761841227485, "ref_ce_loss": 0.10135982930660248, "step": 12420 }, { "epoch": 4.142761841227485, "loss": 0.5565406084060669, "step": 12420 }, { "ce_loss": 0.058990854769945145, "epoch": 4.142761841227485, "step": 12420 }, { "distill_loss": 0.19492602348327637, "epoch": 4.142761841227485, "step": 12420 }, { "epoch": 4.142761841227485, "ref_ce_loss": 0.07594392448663712, "step": 12420 }, { "epoch": 4.142761841227485, "loss": 0.2676648497581482, "step": 12420 }, { "ce_loss": 0.07127406448125839, "epoch": 4.142761841227485, "step": 12420 }, { "distill_loss": 0.1313502937555313, "epoch": 4.142761841227485, "step": 12420 }, { "epoch": 4.142761841227485, "ref_ce_loss": 0.06474132835865021, "step": 12420 }, { "epoch": 4.1460973982655105, "loss": 0.4454, "step": 12430 }, { "epoch": 4.1460973982655105, "grad_norm": 2.4196767807006836, "step": 12430 }, { "epoch": 4.1460973982655105, "learning_rate": 0.00014842005010443126, "step": 12430 }, { "epoch": 4.1460973982655105, "loss": 0.3624645471572876, "step": 12430 }, { "ce_loss": 0.09504459798336029, "epoch": 4.1460973982655105, "step": 12430 }, { "distill_loss": 0.09769954532384872, "epoch": 4.1460973982655105, "step": 12430 }, { "epoch": 4.1460973982655105, "ref_ce_loss": 0.09856883436441422, "step": 12430 }, { "epoch": 4.1460973982655105, "loss": 0.2978444993495941, "step": 12430 }, { "ce_loss": 0.07314880937337875, "epoch": 4.1460973982655105, "step": 12430 }, { "distill_loss": 0.11760088801383972, "epoch": 4.1460973982655105, "step": 12430 }, { "epoch": 4.1460973982655105, "ref_ce_loss": 0.1069493219256401, "step": 12430 }, { "epoch": 4.1460973982655105, "loss": 0.27242863178253174, "step": 12430 }, { "ce_loss": 0.06576551496982574, "epoch": 4.1460973982655105, "step": 12430 }, { "distill_loss": 0.09319284558296204, "epoch": 4.1460973982655105, "step": 12430 }, { "epoch": 4.1460973982655105, "ref_ce_loss": 0.07609212398529053, "step": 12430 }, { "epoch": 4.1460973982655105, "loss": 0.35750612616539, "step": 12430 }, { "ce_loss": 0.06778037548065186, "epoch": 4.1460973982655105, "step": 12430 }, { "distill_loss": 0.10166697949171066, "epoch": 4.1460973982655105, "step": 12430 }, { "epoch": 4.1460973982655105, "ref_ce_loss": 0.07472172379493713, "step": 12430 }, { "epoch": 4.149432955303536, "loss": 0.3868, "step": 12440 }, { "epoch": 4.149432955303536, "grad_norm": 2.221003532409668, "step": 12440 }, { "epoch": 4.149432955303536, "learning_rate": 0.0001482175014188673, "step": 12440 }, { "epoch": 4.149432955303536, "loss": 0.2614574730396271, "step": 12440 }, { "ce_loss": 0.031120847910642624, "epoch": 4.149432955303536, "step": 12440 }, { "distill_loss": 0.08347459137439728, "epoch": 4.149432955303536, "step": 12440 }, { "epoch": 4.149432955303536, "ref_ce_loss": 0.06839940696954727, "step": 12440 }, { "epoch": 4.149432955303536, "loss": 0.27395689487457275, "step": 12440 }, { "ce_loss": 0.061532970517873764, "epoch": 4.149432955303536, "step": 12440 }, { "distill_loss": 0.12716756761074066, "epoch": 4.149432955303536, "step": 12440 }, { "epoch": 4.149432955303536, "ref_ce_loss": 0.08494030684232712, "step": 12440 }, { "epoch": 4.149432955303536, "loss": 0.20453256368637085, "step": 12440 }, { "ce_loss": 0.03099118359386921, "epoch": 4.149432955303536, "step": 12440 }, { "distill_loss": 0.0905686765909195, "epoch": 4.149432955303536, "step": 12440 }, { "epoch": 4.149432955303536, "ref_ce_loss": 0.04775973781943321, "step": 12440 }, { "epoch": 4.149432955303536, "loss": 0.2593122124671936, "step": 12440 }, { "ce_loss": 0.027723398059606552, "epoch": 4.149432955303536, "step": 12440 }, { "distill_loss": 0.12843185663223267, "epoch": 4.149432955303536, "step": 12440 }, { "epoch": 4.149432955303536, "ref_ce_loss": 0.10249225795269012, "step": 12440 }, { "epoch": 4.152768512341561, "loss": 0.4185, "step": 12450 }, { "epoch": 4.152768512341561, "grad_norm": 3.3163435459136963, "step": 12450 }, { "epoch": 4.152768512341561, "learning_rate": 0.00014801495598387764, "step": 12450 }, { "epoch": 4.152768512341561, "loss": 0.3037762939929962, "step": 12450 }, { "ce_loss": 0.08418890833854675, "epoch": 4.152768512341561, "step": 12450 }, { "distill_loss": 0.13176003098487854, "epoch": 4.152768512341561, "step": 12450 }, { "epoch": 4.152768512341561, "ref_ce_loss": 0.08765073120594025, "step": 12450 }, { "epoch": 4.152768512341561, "loss": 0.3808235228061676, "step": 12450 }, { "ce_loss": 0.08875785768032074, "epoch": 4.152768512341561, "step": 12450 }, { "distill_loss": 0.1582205891609192, "epoch": 4.152768512341561, "step": 12450 }, { "epoch": 4.152768512341561, "ref_ce_loss": 0.08822675794363022, "step": 12450 }, { "epoch": 4.152768512341561, "loss": 0.803396999835968, "step": 12450 }, { "ce_loss": 0.12241890281438828, "epoch": 4.152768512341561, "step": 12450 }, { "distill_loss": 0.11734962463378906, "epoch": 4.152768512341561, "step": 12450 }, { "epoch": 4.152768512341561, "ref_ce_loss": 0.0981023758649826, "step": 12450 }, { "epoch": 4.152768512341561, "loss": 0.23134736716747284, "step": 12450 }, { "ce_loss": 0.04157764092087746, "epoch": 4.152768512341561, "step": 12450 }, { "distill_loss": 0.10362343490123749, "epoch": 4.152768512341561, "step": 12450 }, { "epoch": 4.152768512341561, "ref_ce_loss": 0.08595871925354004, "step": 12450 }, { "epoch": 4.1561040693795865, "loss": 0.4531, "step": 12460 }, { "epoch": 4.1561040693795865, "grad_norm": 3.0537631511688232, "step": 12460 }, { "epoch": 4.1561040693795865, "learning_rate": 0.00014781241416882525, "step": 12460 }, { "epoch": 4.1561040693795865, "loss": 0.46858227252960205, "step": 12460 }, { "ce_loss": 0.17086638510227203, "epoch": 4.1561040693795865, "step": 12460 }, { "distill_loss": 0.1451958417892456, "epoch": 4.1561040693795865, "step": 12460 }, { "epoch": 4.1561040693795865, "ref_ce_loss": 0.07415025681257248, "step": 12460 }, { "epoch": 4.1561040693795865, "loss": 0.4011555016040802, "step": 12460 }, { "ce_loss": 0.09203765541315079, "epoch": 4.1561040693795865, "step": 12460 }, { "distill_loss": 0.14013737440109253, "epoch": 4.1561040693795865, "step": 12460 }, { "epoch": 4.1561040693795865, "ref_ce_loss": 0.07352907210588455, "step": 12460 }, { "epoch": 4.1561040693795865, "loss": 0.5737204551696777, "step": 12460 }, { "ce_loss": 0.10794582217931747, "epoch": 4.1561040693795865, "step": 12460 }, { "distill_loss": 0.11963769048452377, "epoch": 4.1561040693795865, "step": 12460 }, { "epoch": 4.1561040693795865, "ref_ce_loss": 0.1095840260386467, "step": 12460 }, { "epoch": 4.1561040693795865, "loss": 0.43205690383911133, "step": 12460 }, { "ce_loss": 0.15435588359832764, "epoch": 4.1561040693795865, "step": 12460 }, { "distill_loss": 0.1649441421031952, "epoch": 4.1561040693795865, "step": 12460 }, { "epoch": 4.1561040693795865, "ref_ce_loss": 0.07601425796747208, "step": 12460 }, { "epoch": 4.159439626417612, "loss": 0.4305, "step": 12470 }, { "epoch": 4.159439626417612, "grad_norm": 3.452244520187378, "step": 12470 }, { "epoch": 4.159439626417612, "learning_rate": 0.00014760987634306646, "step": 12470 }, { "epoch": 4.159439626417612, "loss": 0.65595543384552, "step": 12470 }, { "ce_loss": 0.1943955272436142, "epoch": 4.159439626417612, "step": 12470 }, { "distill_loss": 0.13412387669086456, "epoch": 4.159439626417612, "step": 12470 }, { "epoch": 4.159439626417612, "ref_ce_loss": 0.09166346490383148, "step": 12470 }, { "epoch": 4.159439626417612, "loss": 0.47722163796424866, "step": 12470 }, { "ce_loss": 0.15214447677135468, "epoch": 4.159439626417612, "step": 12470 }, { "distill_loss": 0.21324175596237183, "epoch": 4.159439626417612, "step": 12470 }, { "epoch": 4.159439626417612, "ref_ce_loss": 0.09311570227146149, "step": 12470 }, { "epoch": 4.159439626417612, "loss": 0.6223658919334412, "step": 12470 }, { "ce_loss": 0.1359712928533554, "epoch": 4.159439626417612, "step": 12470 }, { "distill_loss": 0.18622928857803345, "epoch": 4.159439626417612, "step": 12470 }, { "epoch": 4.159439626417612, "ref_ce_loss": 0.14117476344108582, "step": 12470 }, { "epoch": 4.159439626417612, "loss": 0.45449307560920715, "step": 12470 }, { "ce_loss": 0.1701548993587494, "epoch": 4.159439626417612, "step": 12470 }, { "distill_loss": 0.1619931310415268, "epoch": 4.159439626417612, "step": 12470 }, { "epoch": 4.159439626417612, "ref_ce_loss": 0.1218772828578949, "step": 12470 }, { "epoch": 4.162775183455637, "loss": 0.4336, "step": 12480 }, { "epoch": 4.162775183455637, "grad_norm": 2.113687753677368, "step": 12480 }, { "epoch": 4.162775183455637, "learning_rate": 0.0001474073428759504, "step": 12480 }, { "epoch": 4.162775183455637, "loss": 0.40800178050994873, "step": 12480 }, { "ce_loss": 0.05434617027640343, "epoch": 4.162775183455637, "step": 12480 }, { "distill_loss": 0.13565826416015625, "epoch": 4.162775183455637, "step": 12480 }, { "epoch": 4.162775183455637, "ref_ce_loss": 0.08128561824560165, "step": 12480 }, { "epoch": 4.162775183455637, "loss": 0.32694897055625916, "step": 12480 }, { "ce_loss": 0.042434632778167725, "epoch": 4.162775183455637, "step": 12480 }, { "distill_loss": 0.10554373264312744, "epoch": 4.162775183455637, "step": 12480 }, { "epoch": 4.162775183455637, "ref_ce_loss": 0.08126919716596603, "step": 12480 }, { "epoch": 4.162775183455637, "loss": 0.1619693487882614, "step": 12480 }, { "ce_loss": 0.011762240901589394, "epoch": 4.162775183455637, "step": 12480 }, { "distill_loss": 0.06072559207677841, "epoch": 4.162775183455637, "step": 12480 }, { "epoch": 4.162775183455637, "ref_ce_loss": 0.046694181859493256, "step": 12480 }, { "epoch": 4.162775183455637, "loss": 0.46953803300857544, "step": 12480 }, { "ce_loss": 0.15216673910617828, "epoch": 4.162775183455637, "step": 12480 }, { "distill_loss": 0.15188661217689514, "epoch": 4.162775183455637, "step": 12480 }, { "epoch": 4.162775183455637, "ref_ce_loss": 0.09307637065649033, "step": 12480 }, { "epoch": 4.166110740493663, "loss": 0.3863, "step": 12490 }, { "epoch": 4.166110740493663, "grad_norm": 1.9610040187835693, "step": 12490 }, { "epoch": 4.166110740493663, "learning_rate": 0.0001472048141368182, "step": 12490 }, { "epoch": 4.166110740493663, "loss": 0.4812275767326355, "step": 12490 }, { "ce_loss": 0.16773614287376404, "epoch": 4.166110740493663, "step": 12490 }, { "distill_loss": 0.14370931684970856, "epoch": 4.166110740493663, "step": 12490 }, { "epoch": 4.166110740493663, "ref_ce_loss": 0.12923017144203186, "step": 12490 }, { "epoch": 4.166110740493663, "loss": 0.3812367618083954, "step": 12490 }, { "ce_loss": 0.10464449226856232, "epoch": 4.166110740493663, "step": 12490 }, { "distill_loss": 0.11006525903940201, "epoch": 4.166110740493663, "step": 12490 }, { "epoch": 4.166110740493663, "ref_ce_loss": 0.09356817603111267, "step": 12490 }, { "epoch": 4.166110740493663, "loss": 0.32714563608169556, "step": 12490 }, { "ce_loss": 0.07036662846803665, "epoch": 4.166110740493663, "step": 12490 }, { "distill_loss": 0.09512940049171448, "epoch": 4.166110740493663, "step": 12490 }, { "epoch": 4.166110740493663, "ref_ce_loss": 0.06031448766589165, "step": 12490 }, { "epoch": 4.166110740493663, "loss": 0.32152891159057617, "step": 12490 }, { "ce_loss": 0.11197475343942642, "epoch": 4.166110740493663, "step": 12490 }, { "distill_loss": 0.10625480115413666, "epoch": 4.166110740493663, "step": 12490 }, { "epoch": 4.166110740493663, "ref_ce_loss": 0.07325571775436401, "step": 12490 }, { "epoch": 4.169446297531688, "loss": 0.4025, "step": 12500 }, { "epoch": 4.169446297531688, "grad_norm": 4.8676300048828125, "step": 12500 }, { "epoch": 4.169446297531688, "learning_rate": 0.0001470022904950024, "step": 12500 }, { "epoch": 4.169446297531688, "loss": 0.34299716353416443, "step": 12500 }, { "ce_loss": 0.12802059948444366, "epoch": 4.169446297531688, "step": 12500 }, { "distill_loss": 0.09531907737255096, "epoch": 4.169446297531688, "step": 12500 }, { "epoch": 4.169446297531688, "ref_ce_loss": 0.07253427058458328, "step": 12500 }, { "epoch": 4.169446297531688, "loss": 0.5606489777565002, "step": 12500 }, { "ce_loss": 0.16046318411827087, "epoch": 4.169446297531688, "step": 12500 }, { "distill_loss": 0.2585448622703552, "epoch": 4.169446297531688, "step": 12500 }, { "epoch": 4.169446297531688, "ref_ce_loss": 0.14125660061836243, "step": 12500 }, { "epoch": 4.169446297531688, "loss": 0.2809467017650604, "step": 12500 }, { "ce_loss": 0.0834980458021164, "epoch": 4.169446297531688, "step": 12500 }, { "distill_loss": 0.13691724836826324, "epoch": 4.169446297531688, "step": 12500 }, { "epoch": 4.169446297531688, "ref_ce_loss": 0.06044924259185791, "step": 12500 }, { "epoch": 4.169446297531688, "loss": 0.29263341426849365, "step": 12500 }, { "ce_loss": 0.0704631358385086, "epoch": 4.169446297531688, "step": 12500 }, { "distill_loss": 0.11643597483634949, "epoch": 4.169446297531688, "step": 12500 }, { "epoch": 4.169446297531688, "ref_ce_loss": 0.10554829239845276, "step": 12500 }, { "epoch": 4.172781854569713, "loss": 0.4319, "step": 12510 }, { "epoch": 4.172781854569713, "grad_norm": 4.185899257659912, "step": 12510 }, { "epoch": 4.172781854569713, "learning_rate": 0.00014679977231982629, "step": 12510 }, { "epoch": 4.172781854569713, "loss": 0.30824387073516846, "step": 12510 }, { "ce_loss": 0.08948945254087448, "epoch": 4.172781854569713, "step": 12510 }, { "distill_loss": 0.12239973247051239, "epoch": 4.172781854569713, "step": 12510 }, { "epoch": 4.172781854569713, "ref_ce_loss": 0.06972790509462357, "step": 12510 }, { "epoch": 4.172781854569713, "loss": 0.2979793846607208, "step": 12510 }, { "ce_loss": 0.09574341028928757, "epoch": 4.172781854569713, "step": 12510 }, { "distill_loss": 0.09483419358730316, "epoch": 4.172781854569713, "step": 12510 }, { "epoch": 4.172781854569713, "ref_ce_loss": 0.07258374243974686, "step": 12510 }, { "epoch": 4.172781854569713, "loss": 0.20826782286167145, "step": 12510 }, { "ce_loss": 0.05446157604455948, "epoch": 4.172781854569713, "step": 12510 }, { "distill_loss": 0.08192051947116852, "epoch": 4.172781854569713, "step": 12510 }, { "epoch": 4.172781854569713, "ref_ce_loss": 0.07135691493749619, "step": 12510 }, { "epoch": 4.172781854569713, "loss": 0.43391022086143494, "step": 12510 }, { "ce_loss": 0.11679257452487946, "epoch": 4.172781854569713, "step": 12510 }, { "distill_loss": 0.15242500603199005, "epoch": 4.172781854569713, "step": 12510 }, { "epoch": 4.172781854569713, "ref_ce_loss": 0.11640404164791107, "step": 12510 }, { "epoch": 4.176117411607739, "loss": 0.3719, "step": 12520 }, { "epoch": 4.176117411607739, "grad_norm": 1.7702113389968872, "step": 12520 }, { "epoch": 4.176117411607739, "learning_rate": 0.000146597259980603, "step": 12520 }, { "epoch": 4.176117411607739, "loss": 0.3008236885070801, "step": 12520 }, { "ce_loss": 0.03070242889225483, "epoch": 4.176117411607739, "step": 12520 }, { "distill_loss": 0.1274239718914032, "epoch": 4.176117411607739, "step": 12520 }, { "epoch": 4.176117411607739, "ref_ce_loss": 0.08409994840621948, "step": 12520 }, { "epoch": 4.176117411607739, "loss": 0.3606027662754059, "step": 12520 }, { "ce_loss": 0.0853457897901535, "epoch": 4.176117411607739, "step": 12520 }, { "distill_loss": 0.09666283428668976, "epoch": 4.176117411607739, "step": 12520 }, { "epoch": 4.176117411607739, "ref_ce_loss": 0.11446385830640793, "step": 12520 }, { "epoch": 4.176117411607739, "loss": 0.3016287386417389, "step": 12520 }, { "ce_loss": 0.08930810540914536, "epoch": 4.176117411607739, "step": 12520 }, { "distill_loss": 0.10128715634346008, "epoch": 4.176117411607739, "step": 12520 }, { "epoch": 4.176117411607739, "ref_ce_loss": 0.07626650482416153, "step": 12520 }, { "epoch": 4.176117411607739, "loss": 0.26100292801856995, "step": 12520 }, { "ce_loss": 0.08020920306444168, "epoch": 4.176117411607739, "step": 12520 }, { "distill_loss": 0.10896246135234833, "epoch": 4.176117411607739, "step": 12520 }, { "epoch": 4.176117411607739, "ref_ce_loss": 0.07162782549858093, "step": 12520 }, { "epoch": 4.179452968645764, "loss": 0.349, "step": 12530 }, { "epoch": 4.179452968645764, "grad_norm": 2.14904522895813, "step": 12530 }, { "epoch": 4.179452968645764, "learning_rate": 0.00014639475384663528, "step": 12530 }, { "epoch": 4.179452968645764, "loss": 0.2830139100551605, "step": 12530 }, { "ce_loss": 0.0933670923113823, "epoch": 4.179452968645764, "step": 12530 }, { "distill_loss": 0.10775136947631836, "epoch": 4.179452968645764, "step": 12530 }, { "epoch": 4.179452968645764, "ref_ce_loss": 0.059976205229759216, "step": 12530 }, { "epoch": 4.179452968645764, "loss": 0.4265592396259308, "step": 12530 }, { "ce_loss": 0.08390668779611588, "epoch": 4.179452968645764, "step": 12530 }, { "distill_loss": 0.18817268311977386, "epoch": 4.179452968645764, "step": 12530 }, { "epoch": 4.179452968645764, "ref_ce_loss": 0.09286605566740036, "step": 12530 }, { "epoch": 4.179452968645764, "loss": 0.25192058086395264, "step": 12530 }, { "ce_loss": 0.07058953493833542, "epoch": 4.179452968645764, "step": 12530 }, { "distill_loss": 0.10347865521907806, "epoch": 4.179452968645764, "step": 12530 }, { "epoch": 4.179452968645764, "ref_ce_loss": 0.07778134942054749, "step": 12530 }, { "epoch": 4.179452968645764, "loss": 0.25412583351135254, "step": 12530 }, { "ce_loss": 0.062413640320301056, "epoch": 4.179452968645764, "step": 12530 }, { "distill_loss": 0.08789204806089401, "epoch": 4.179452968645764, "step": 12530 }, { "epoch": 4.179452968645764, "ref_ce_loss": 0.08234740793704987, "step": 12530 }, { "epoch": 4.182788525683789, "loss": 0.3877, "step": 12540 }, { "epoch": 4.182788525683789, "grad_norm": 2.911775588989258, "step": 12540 }, { "epoch": 4.182788525683789, "learning_rate": 0.0001461922542872144, "step": 12540 }, { "epoch": 4.182788525683789, "loss": 0.69565749168396, "step": 12540 }, { "ce_loss": 0.1180061474442482, "epoch": 4.182788525683789, "step": 12540 }, { "distill_loss": 0.1855594664812088, "epoch": 4.182788525683789, "step": 12540 }, { "epoch": 4.182788525683789, "ref_ce_loss": 0.10253113508224487, "step": 12540 }, { "epoch": 4.182788525683789, "loss": 0.24786855280399323, "step": 12540 }, { "ce_loss": 0.024127228185534477, "epoch": 4.182788525683789, "step": 12540 }, { "distill_loss": 0.13936744630336761, "epoch": 4.182788525683789, "step": 12540 }, { "epoch": 4.182788525683789, "ref_ce_loss": 0.06584622710943222, "step": 12540 }, { "epoch": 4.182788525683789, "loss": 0.29231300950050354, "step": 12540 }, { "ce_loss": 0.10617182403802872, "epoch": 4.182788525683789, "step": 12540 }, { "distill_loss": 0.11738552898168564, "epoch": 4.182788525683789, "step": 12540 }, { "epoch": 4.182788525683789, "ref_ce_loss": 0.06845077127218246, "step": 12540 }, { "epoch": 4.182788525683789, "loss": 0.248124897480011, "step": 12540 }, { "ce_loss": 0.03263659402728081, "epoch": 4.182788525683789, "step": 12540 }, { "distill_loss": 0.09977734833955765, "epoch": 4.182788525683789, "step": 12540 }, { "epoch": 4.182788525683789, "ref_ce_loss": 0.07695674896240234, "step": 12540 }, { "epoch": 4.186124082721815, "loss": 0.3926, "step": 12550 }, { "epoch": 4.186124082721815, "grad_norm": 7.9306321144104, "step": 12550 }, { "epoch": 4.186124082721815, "learning_rate": 0.00014598976167161964, "step": 12550 }, { "epoch": 4.186124082721815, "loss": 0.6581223011016846, "step": 12550 }, { "ce_loss": 0.12450020760297775, "epoch": 4.186124082721815, "step": 12550 }, { "distill_loss": 0.16247011721134186, "epoch": 4.186124082721815, "step": 12550 }, { "epoch": 4.186124082721815, "ref_ce_loss": 0.14108185470104218, "step": 12550 }, { "epoch": 4.186124082721815, "loss": 0.22678981721401215, "step": 12550 }, { "ce_loss": 0.05303997918963432, "epoch": 4.186124082721815, "step": 12550 }, { "distill_loss": 0.11390000581741333, "epoch": 4.186124082721815, "step": 12550 }, { "epoch": 4.186124082721815, "ref_ce_loss": 0.059662774205207825, "step": 12550 }, { "epoch": 4.186124082721815, "loss": 0.3068074584007263, "step": 12550 }, { "ce_loss": 0.11068800836801529, "epoch": 4.186124082721815, "step": 12550 }, { "distill_loss": 0.09961576014757156, "epoch": 4.186124082721815, "step": 12550 }, { "epoch": 4.186124082721815, "ref_ce_loss": 0.07074546813964844, "step": 12550 }, { "epoch": 4.186124082721815, "loss": 0.4554699957370758, "step": 12550 }, { "ce_loss": 0.13982751965522766, "epoch": 4.186124082721815, "step": 12550 }, { "distill_loss": 0.14476770162582397, "epoch": 4.186124082721815, "step": 12550 }, { "epoch": 4.186124082721815, "ref_ce_loss": 0.12475011497735977, "step": 12550 }, { "epoch": 4.18945963975984, "loss": 0.4378, "step": 12560 }, { "epoch": 4.18945963975984, "grad_norm": 2.6521835327148438, "step": 12560 }, { "epoch": 4.18945963975984, "learning_rate": 0.00014578727636911773, "step": 12560 }, { "epoch": 4.18945963975984, "loss": 0.4609094262123108, "step": 12560 }, { "ce_loss": 0.14568175375461578, "epoch": 4.18945963975984, "step": 12560 }, { "distill_loss": 0.16340190172195435, "epoch": 4.18945963975984, "step": 12560 }, { "epoch": 4.18945963975984, "ref_ce_loss": 0.10280240327119827, "step": 12560 }, { "epoch": 4.18945963975984, "loss": 0.22618383169174194, "step": 12560 }, { "ce_loss": 0.04781392216682434, "epoch": 4.18945963975984, "step": 12560 }, { "distill_loss": 0.09708165377378464, "epoch": 4.18945963975984, "step": 12560 }, { "epoch": 4.18945963975984, "ref_ce_loss": 0.05278674140572548, "step": 12560 }, { "epoch": 4.18945963975984, "loss": 0.3315080404281616, "step": 12560 }, { "ce_loss": 0.06448005139827728, "epoch": 4.18945963975984, "step": 12560 }, { "distill_loss": 0.12131848186254501, "epoch": 4.18945963975984, "step": 12560 }, { "epoch": 4.18945963975984, "ref_ce_loss": 0.07438777387142181, "step": 12560 }, { "epoch": 4.18945963975984, "loss": 0.6073876023292542, "step": 12560 }, { "ce_loss": 0.21663253009319305, "epoch": 4.18945963975984, "step": 12560 }, { "distill_loss": 0.18993952870368958, "epoch": 4.18945963975984, "step": 12560 }, { "epoch": 4.18945963975984, "ref_ce_loss": 0.13462689518928528, "step": 12560 }, { "epoch": 4.192795196797865, "loss": 0.4217, "step": 12570 }, { "epoch": 4.192795196797865, "grad_norm": 3.1643049716949463, "step": 12570 }, { "epoch": 4.192795196797865, "learning_rate": 0.0001455847987489619, "step": 12570 }, { "epoch": 4.192795196797865, "loss": 0.37981656193733215, "step": 12570 }, { "ce_loss": 0.07878326624631882, "epoch": 4.192795196797865, "step": 12570 }, { "distill_loss": 0.14073777198791504, "epoch": 4.192795196797865, "step": 12570 }, { "epoch": 4.192795196797865, "ref_ce_loss": 0.12595392763614655, "step": 12570 }, { "epoch": 4.192795196797865, "loss": 0.5180966854095459, "step": 12570 }, { "ce_loss": 0.12497790902853012, "epoch": 4.192795196797865, "step": 12570 }, { "distill_loss": 0.11337775737047195, "epoch": 4.192795196797865, "step": 12570 }, { "epoch": 4.192795196797865, "ref_ce_loss": 0.05922972410917282, "step": 12570 }, { "epoch": 4.192795196797865, "loss": 0.23794496059417725, "step": 12570 }, { "ce_loss": 0.046828463673591614, "epoch": 4.192795196797865, "step": 12570 }, { "distill_loss": 0.0907125473022461, "epoch": 4.192795196797865, "step": 12570 }, { "epoch": 4.192795196797865, "ref_ce_loss": 0.06597310304641724, "step": 12570 }, { "epoch": 4.192795196797865, "loss": 0.23115824162960052, "step": 12570 }, { "ce_loss": 0.04305850341916084, "epoch": 4.192795196797865, "step": 12570 }, { "distill_loss": 0.09254015237092972, "epoch": 4.192795196797865, "step": 12570 }, { "epoch": 4.192795196797865, "ref_ce_loss": 0.09520301222801208, "step": 12570 }, { "epoch": 4.196130753835891, "loss": 0.4201, "step": 12580 }, { "epoch": 4.196130753835891, "grad_norm": 2.1938881874084473, "step": 12580 }, { "epoch": 4.196130753835891, "learning_rate": 0.0001453823291803915, "step": 12580 }, { "epoch": 4.196130753835891, "loss": 0.3288840055465698, "step": 12580 }, { "ce_loss": 0.11204256117343903, "epoch": 4.196130753835891, "step": 12580 }, { "distill_loss": 0.10862526297569275, "epoch": 4.196130753835891, "step": 12580 }, { "epoch": 4.196130753835891, "ref_ce_loss": 0.07976261526346207, "step": 12580 }, { "epoch": 4.196130753835891, "loss": 0.4981669783592224, "step": 12580 }, { "ce_loss": 0.14517666399478912, "epoch": 4.196130753835891, "step": 12580 }, { "distill_loss": 0.1696842610836029, "epoch": 4.196130753835891, "step": 12580 }, { "epoch": 4.196130753835891, "ref_ce_loss": 0.10310443490743637, "step": 12580 }, { "epoch": 4.196130753835891, "loss": 0.3159114718437195, "step": 12580 }, { "ce_loss": 0.07832693308591843, "epoch": 4.196130753835891, "step": 12580 }, { "distill_loss": 0.09905324131250381, "epoch": 4.196130753835891, "step": 12580 }, { "epoch": 4.196130753835891, "ref_ce_loss": 0.07301445305347443, "step": 12580 }, { "epoch": 4.196130753835891, "loss": 0.3570947051048279, "step": 12580 }, { "ce_loss": 0.112045057117939, "epoch": 4.196130753835891, "step": 12580 }, { "distill_loss": 0.12575042247772217, "epoch": 4.196130753835891, "step": 12580 }, { "epoch": 4.196130753835891, "ref_ce_loss": 0.08894684910774231, "step": 12580 }, { "epoch": 4.199466310873916, "loss": 0.4287, "step": 12590 }, { "epoch": 4.199466310873916, "grad_norm": 2.1162467002868652, "step": 12590 }, { "epoch": 4.199466310873916, "learning_rate": 0.00014517986803263115, "step": 12590 }, { "epoch": 4.199466310873916, "loss": 0.3382636606693268, "step": 12590 }, { "ce_loss": 0.10414419323205948, "epoch": 4.199466310873916, "step": 12590 }, { "distill_loss": 0.14084231853485107, "epoch": 4.199466310873916, "step": 12590 }, { "epoch": 4.199466310873916, "ref_ce_loss": 0.09309055656194687, "step": 12590 }, { "epoch": 4.199466310873916, "loss": 0.41425347328186035, "step": 12590 }, { "ce_loss": 0.09124680608510971, "epoch": 4.199466310873916, "step": 12590 }, { "distill_loss": 0.11330849677324295, "epoch": 4.199466310873916, "step": 12590 }, { "epoch": 4.199466310873916, "ref_ce_loss": 0.09014124423265457, "step": 12590 }, { "epoch": 4.199466310873916, "loss": 0.6550116539001465, "step": 12590 }, { "ce_loss": 0.19489094614982605, "epoch": 4.199466310873916, "step": 12590 }, { "distill_loss": 0.2267588973045349, "epoch": 4.199466310873916, "step": 12590 }, { "epoch": 4.199466310873916, "ref_ce_loss": 0.08232640475034714, "step": 12590 }, { "epoch": 4.199466310873916, "loss": 0.5072152614593506, "step": 12590 }, { "ce_loss": 0.0739305317401886, "epoch": 4.199466310873916, "step": 12590 }, { "distill_loss": 0.1428247094154358, "epoch": 4.199466310873916, "step": 12590 }, { "epoch": 4.199466310873916, "ref_ce_loss": 0.08314472436904907, "step": 12590 }, { "epoch": 4.202801867911941, "loss": 0.4778, "step": 12600 }, { "epoch": 4.202801867911941, "grad_norm": 5.220403671264648, "step": 12600 }, { "epoch": 4.202801867911941, "learning_rate": 0.00014497741567489012, "step": 12600 }, { "epoch": 4.202801867911941, "loss": 0.4604666829109192, "step": 12600 }, { "ce_loss": 0.1026303842663765, "epoch": 4.202801867911941, "step": 12600 }, { "distill_loss": 0.12971565127372742, "epoch": 4.202801867911941, "step": 12600 }, { "epoch": 4.202801867911941, "ref_ce_loss": 0.09788954257965088, "step": 12600 }, { "epoch": 4.202801867911941, "loss": 0.3484407067298889, "step": 12600 }, { "ce_loss": 0.05491747707128525, "epoch": 4.202801867911941, "step": 12600 }, { "distill_loss": 0.12664100527763367, "epoch": 4.202801867911941, "step": 12600 }, { "epoch": 4.202801867911941, "ref_ce_loss": 0.08006755262613297, "step": 12600 }, { "epoch": 4.202801867911941, "loss": 0.28191614151000977, "step": 12600 }, { "ce_loss": 0.029443496838212013, "epoch": 4.202801867911941, "step": 12600 }, { "distill_loss": 0.10391257703304291, "epoch": 4.202801867911941, "step": 12600 }, { "epoch": 4.202801867911941, "ref_ce_loss": 0.10279714316129684, "step": 12600 }, { "epoch": 4.202801867911941, "loss": 0.4100857377052307, "step": 12600 }, { "ce_loss": 0.13263317942619324, "epoch": 4.202801867911941, "step": 12600 }, { "distill_loss": 0.12876605987548828, "epoch": 4.202801867911941, "step": 12600 }, { "epoch": 4.202801867911941, "ref_ce_loss": 0.10536639392375946, "step": 12600 }, { "epoch": 4.206137424949967, "loss": 0.3917, "step": 12610 }, { "epoch": 4.206137424949967, "grad_norm": 2.95518159866333, "step": 12610 }, { "epoch": 4.206137424949967, "learning_rate": 0.00014477497247636167, "step": 12610 }, { "epoch": 4.206137424949967, "loss": 0.33207955956459045, "step": 12610 }, { "ce_loss": 0.05851322412490845, "epoch": 4.206137424949967, "step": 12610 }, { "distill_loss": 0.12857557833194733, "epoch": 4.206137424949967, "step": 12610 }, { "epoch": 4.206137424949967, "ref_ce_loss": 0.04979289323091507, "step": 12610 }, { "epoch": 4.206137424949967, "loss": 0.3608654737472534, "step": 12610 }, { "ce_loss": 0.13061387836933136, "epoch": 4.206137424949967, "step": 12610 }, { "distill_loss": 0.10170701146125793, "epoch": 4.206137424949967, "step": 12610 }, { "epoch": 4.206137424949967, "ref_ce_loss": 0.064691923558712, "step": 12610 }, { "epoch": 4.206137424949967, "loss": 0.5522586703300476, "step": 12610 }, { "ce_loss": 0.12592393159866333, "epoch": 4.206137424949967, "step": 12610 }, { "distill_loss": 0.17105364799499512, "epoch": 4.206137424949967, "step": 12610 }, { "epoch": 4.206137424949967, "ref_ce_loss": 0.12697115540504456, "step": 12610 }, { "epoch": 4.206137424949967, "loss": 0.27305784821510315, "step": 12610 }, { "ce_loss": 0.06921637058258057, "epoch": 4.206137424949967, "step": 12610 }, { "distill_loss": 0.12910543382167816, "epoch": 4.206137424949967, "step": 12610 }, { "epoch": 4.206137424949967, "ref_ce_loss": 0.05369148030877113, "step": 12610 }, { "epoch": 4.209472981987992, "loss": 0.4288, "step": 12620 }, { "epoch": 4.209472981987992, "grad_norm": 2.8391435146331787, "step": 12620 }, { "epoch": 4.209472981987992, "learning_rate": 0.0001445725388062223, "step": 12620 }, { "epoch": 4.209472981987992, "loss": 0.586120069026947, "step": 12620 }, { "ce_loss": 0.13455958664417267, "epoch": 4.209472981987992, "step": 12620 }, { "distill_loss": 0.18541017174720764, "epoch": 4.209472981987992, "step": 12620 }, { "epoch": 4.209472981987992, "ref_ce_loss": 0.09259842336177826, "step": 12620 }, { "epoch": 4.209472981987992, "loss": 0.6589750051498413, "step": 12620 }, { "ce_loss": 0.19765350222587585, "epoch": 4.209472981987992, "step": 12620 }, { "distill_loss": 0.15606580674648285, "epoch": 4.209472981987992, "step": 12620 }, { "epoch": 4.209472981987992, "ref_ce_loss": 0.12804923951625824, "step": 12620 }, { "epoch": 4.209472981987992, "loss": 0.26120057702064514, "step": 12620 }, { "ce_loss": 0.08098047226667404, "epoch": 4.209472981987992, "step": 12620 }, { "distill_loss": 0.09746217727661133, "epoch": 4.209472981987992, "step": 12620 }, { "epoch": 4.209472981987992, "ref_ce_loss": 0.05925130099058151, "step": 12620 }, { "epoch": 4.209472981987992, "loss": 0.2098916471004486, "step": 12620 }, { "ce_loss": 0.03318024426698685, "epoch": 4.209472981987992, "step": 12620 }, { "distill_loss": 0.12412787973880768, "epoch": 4.209472981987992, "step": 12620 }, { "epoch": 4.209472981987992, "ref_ce_loss": 0.05251970887184143, "step": 12620 }, { "epoch": 4.2128085390260175, "loss": 0.427, "step": 12630 }, { "epoch": 4.2128085390260175, "grad_norm": 2.3983237743377686, "step": 12630 }, { "epoch": 4.2128085390260175, "learning_rate": 0.00014437011503363117, "step": 12630 }, { "epoch": 4.2128085390260175, "loss": 0.4466587007045746, "step": 12630 }, { "ce_loss": 0.12958595156669617, "epoch": 4.2128085390260175, "step": 12630 }, { "distill_loss": 0.15985801815986633, "epoch": 4.2128085390260175, "step": 12630 }, { "epoch": 4.2128085390260175, "ref_ce_loss": 0.09321147948503494, "step": 12630 }, { "epoch": 4.2128085390260175, "loss": 0.40337681770324707, "step": 12630 }, { "ce_loss": 0.1810450404882431, "epoch": 4.2128085390260175, "step": 12630 }, { "distill_loss": 0.1132136881351471, "epoch": 4.2128085390260175, "step": 12630 }, { "epoch": 4.2128085390260175, "ref_ce_loss": 0.08407022804021835, "step": 12630 }, { "epoch": 4.2128085390260175, "loss": 0.6117267608642578, "step": 12630 }, { "ce_loss": 0.18386036157608032, "epoch": 4.2128085390260175, "step": 12630 }, { "distill_loss": 0.1988612860441208, "epoch": 4.2128085390260175, "step": 12630 }, { "epoch": 4.2128085390260175, "ref_ce_loss": 0.12381020188331604, "step": 12630 }, { "epoch": 4.2128085390260175, "loss": 0.3889603018760681, "step": 12630 }, { "ce_loss": 0.11408506333827972, "epoch": 4.2128085390260175, "step": 12630 }, { "distill_loss": 0.17046092450618744, "epoch": 4.2128085390260175, "step": 12630 }, { "epoch": 4.2128085390260175, "ref_ce_loss": 0.10424398630857468, "step": 12630 }, { "epoch": 4.216144096064043, "loss": 0.4039, "step": 12640 }, { "epoch": 4.216144096064043, "grad_norm": 2.042799472808838, "step": 12640 }, { "epoch": 4.216144096064043, "learning_rate": 0.0001441677015277295, "step": 12640 }, { "epoch": 4.216144096064043, "loss": 0.5464071035385132, "step": 12640 }, { "ce_loss": 0.06709546595811844, "epoch": 4.216144096064043, "step": 12640 }, { "distill_loss": 0.15532134473323822, "epoch": 4.216144096064043, "step": 12640 }, { "epoch": 4.216144096064043, "ref_ce_loss": 0.0987556055188179, "step": 12640 }, { "epoch": 4.216144096064043, "loss": 0.35175639390945435, "step": 12640 }, { "ce_loss": 0.09659342467784882, "epoch": 4.216144096064043, "step": 12640 }, { "distill_loss": 0.12190660834312439, "epoch": 4.216144096064043, "step": 12640 }, { "epoch": 4.216144096064043, "ref_ce_loss": 0.11512437462806702, "step": 12640 }, { "epoch": 4.216144096064043, "loss": 0.37998417019844055, "step": 12640 }, { "ce_loss": 0.07071369886398315, "epoch": 4.216144096064043, "step": 12640 }, { "distill_loss": 0.14697347581386566, "epoch": 4.216144096064043, "step": 12640 }, { "epoch": 4.216144096064043, "ref_ce_loss": 0.1176176369190216, "step": 12640 }, { "epoch": 4.216144096064043, "loss": 0.28571704030036926, "step": 12640 }, { "ce_loss": 0.047390177845954895, "epoch": 4.216144096064043, "step": 12640 }, { "distill_loss": 0.10813229531049728, "epoch": 4.216144096064043, "step": 12640 }, { "epoch": 4.216144096064043, "ref_ce_loss": 0.08257561177015305, "step": 12640 }, { "epoch": 4.219479653102068, "loss": 0.3949, "step": 12650 }, { "epoch": 4.219479653102068, "grad_norm": 2.189134359359741, "step": 12650 }, { "epoch": 4.219479653102068, "learning_rate": 0.00014396529865763947, "step": 12650 }, { "epoch": 4.219479653102068, "loss": 0.2742716073989868, "step": 12650 }, { "ce_loss": 0.04334789887070656, "epoch": 4.219479653102068, "step": 12650 }, { "distill_loss": 0.10433655232191086, "epoch": 4.219479653102068, "step": 12650 }, { "epoch": 4.219479653102068, "ref_ce_loss": 0.07580353319644928, "step": 12650 }, { "epoch": 4.219479653102068, "loss": 0.39648038148880005, "step": 12650 }, { "ce_loss": 0.08383668214082718, "epoch": 4.219479653102068, "step": 12650 }, { "distill_loss": 0.12050480395555496, "epoch": 4.219479653102068, "step": 12650 }, { "epoch": 4.219479653102068, "ref_ce_loss": 0.09287303686141968, "step": 12650 }, { "epoch": 4.219479653102068, "loss": 0.46412718296051025, "step": 12650 }, { "ce_loss": 0.09492798149585724, "epoch": 4.219479653102068, "step": 12650 }, { "distill_loss": 0.16380393505096436, "epoch": 4.219479653102068, "step": 12650 }, { "epoch": 4.219479653102068, "ref_ce_loss": 0.08897554874420166, "step": 12650 }, { "epoch": 4.219479653102068, "loss": 0.36522769927978516, "step": 12650 }, { "ce_loss": 0.07890743762254715, "epoch": 4.219479653102068, "step": 12650 }, { "distill_loss": 0.1172899603843689, "epoch": 4.219479653102068, "step": 12650 }, { "epoch": 4.219479653102068, "ref_ce_loss": 0.07658599317073822, "step": 12650 }, { "epoch": 4.2228152101400935, "loss": 0.3507, "step": 12660 }, { "epoch": 4.2228152101400935, "grad_norm": 1.6064761877059937, "step": 12660 }, { "epoch": 4.2228152101400935, "learning_rate": 0.0001437629067924643, "step": 12660 }, { "epoch": 4.2228152101400935, "loss": 0.3317145109176636, "step": 12660 }, { "ce_loss": 0.07839033752679825, "epoch": 4.2228152101400935, "step": 12660 }, { "distill_loss": 0.10722742974758148, "epoch": 4.2228152101400935, "step": 12660 }, { "epoch": 4.2228152101400935, "ref_ce_loss": 0.11385882645845413, "step": 12660 }, { "epoch": 4.2228152101400935, "loss": 0.32069358229637146, "step": 12660 }, { "ce_loss": 0.10389983654022217, "epoch": 4.2228152101400935, "step": 12660 }, { "distill_loss": 0.14690223336219788, "epoch": 4.2228152101400935, "step": 12660 }, { "epoch": 4.2228152101400935, "ref_ce_loss": 0.06953489780426025, "step": 12660 }, { "epoch": 4.2228152101400935, "loss": 0.4394501745700836, "step": 12660 }, { "ce_loss": 0.17041996121406555, "epoch": 4.2228152101400935, "step": 12660 }, { "distill_loss": 0.11526565998792648, "epoch": 4.2228152101400935, "step": 12660 }, { "epoch": 4.2228152101400935, "ref_ce_loss": 0.09521762281656265, "step": 12660 }, { "epoch": 4.2228152101400935, "loss": 0.4944007396697998, "step": 12660 }, { "ce_loss": 0.14096632599830627, "epoch": 4.2228152101400935, "step": 12660 }, { "distill_loss": 0.12511637806892395, "epoch": 4.2228152101400935, "step": 12660 }, { "epoch": 4.2228152101400935, "ref_ce_loss": 0.09930511564016342, "step": 12660 }, { "epoch": 4.226150767178119, "loss": 0.4517, "step": 12670 }, { "epoch": 4.226150767178119, "grad_norm": 3.4801747798919678, "step": 12670 }, { "epoch": 4.226150767178119, "learning_rate": 0.00014356052630128675, "step": 12670 }, { "epoch": 4.226150767178119, "loss": 0.4144769310951233, "step": 12670 }, { "ce_loss": 0.09750276803970337, "epoch": 4.226150767178119, "step": 12670 }, { "distill_loss": 0.1400170475244522, "epoch": 4.226150767178119, "step": 12670 }, { "epoch": 4.226150767178119, "ref_ce_loss": 0.0747247189283371, "step": 12670 }, { "epoch": 4.226150767178119, "loss": 0.2999867796897888, "step": 12670 }, { "ce_loss": 0.05919264629483223, "epoch": 4.226150767178119, "step": 12670 }, { "distill_loss": 0.12513509392738342, "epoch": 4.226150767178119, "step": 12670 }, { "epoch": 4.226150767178119, "ref_ce_loss": 0.08204268664121628, "step": 12670 }, { "epoch": 4.226150767178119, "loss": 0.4107738435268402, "step": 12670 }, { "ce_loss": 0.1336008906364441, "epoch": 4.226150767178119, "step": 12670 }, { "distill_loss": 0.15651835501194, "epoch": 4.226150767178119, "step": 12670 }, { "epoch": 4.226150767178119, "ref_ce_loss": 0.08766976743936539, "step": 12670 }, { "epoch": 4.226150767178119, "loss": 0.43431609869003296, "step": 12670 }, { "ce_loss": 0.12974825501441956, "epoch": 4.226150767178119, "step": 12670 }, { "distill_loss": 0.11243186146020889, "epoch": 4.226150767178119, "step": 12670 }, { "epoch": 4.226150767178119, "ref_ce_loss": 0.09981807321310043, "step": 12670 }, { "epoch": 4.229486324216144, "loss": 0.3862, "step": 12680 }, { "epoch": 4.229486324216144, "grad_norm": 1.9466087818145752, "step": 12680 }, { "epoch": 4.229486324216144, "learning_rate": 0.00014335815755316903, "step": 12680 }, { "epoch": 4.229486324216144, "loss": 0.46811777353286743, "step": 12680 }, { "ce_loss": 0.10368335247039795, "epoch": 4.229486324216144, "step": 12680 }, { "distill_loss": 0.11334482580423355, "epoch": 4.229486324216144, "step": 12680 }, { "epoch": 4.229486324216144, "ref_ce_loss": 0.09854672104120255, "step": 12680 }, { "epoch": 4.229486324216144, "loss": 0.49574559926986694, "step": 12680 }, { "ce_loss": 0.0772734060883522, "epoch": 4.229486324216144, "step": 12680 }, { "distill_loss": 0.13109725713729858, "epoch": 4.229486324216144, "step": 12680 }, { "epoch": 4.229486324216144, "ref_ce_loss": 0.06632824242115021, "step": 12680 }, { "epoch": 4.229486324216144, "loss": 0.2777520418167114, "step": 12680 }, { "ce_loss": 0.04080289229750633, "epoch": 4.229486324216144, "step": 12680 }, { "distill_loss": 0.09627961367368698, "epoch": 4.229486324216144, "step": 12680 }, { "epoch": 4.229486324216144, "ref_ce_loss": 0.05981730669736862, "step": 12680 }, { "epoch": 4.229486324216144, "loss": 0.43288472294807434, "step": 12680 }, { "ce_loss": 0.1106693223118782, "epoch": 4.229486324216144, "step": 12680 }, { "distill_loss": 0.1380157321691513, "epoch": 4.229486324216144, "step": 12680 }, { "epoch": 4.229486324216144, "ref_ce_loss": 0.12508343160152435, "step": 12680 }, { "epoch": 4.23282188125417, "loss": 0.3997, "step": 12690 }, { "epoch": 4.23282188125417, "grad_norm": 2.4736568927764893, "step": 12690 }, { "epoch": 4.23282188125417, "learning_rate": 0.00014315580091715202, "step": 12690 }, { "epoch": 4.23282188125417, "loss": 0.3552623391151428, "step": 12690 }, { "ce_loss": 0.08149318397045135, "epoch": 4.23282188125417, "step": 12690 }, { "distill_loss": 0.1482114940881729, "epoch": 4.23282188125417, "step": 12690 }, { "epoch": 4.23282188125417, "ref_ce_loss": 0.08624199777841568, "step": 12690 }, { "epoch": 4.23282188125417, "loss": 0.3837560713291168, "step": 12690 }, { "ce_loss": 0.12796638906002045, "epoch": 4.23282188125417, "step": 12690 }, { "distill_loss": 0.1531001329421997, "epoch": 4.23282188125417, "step": 12690 }, { "epoch": 4.23282188125417, "ref_ce_loss": 0.07761276513338089, "step": 12690 }, { "epoch": 4.23282188125417, "loss": 0.4810293912887573, "step": 12690 }, { "ce_loss": 0.16699600219726562, "epoch": 4.23282188125417, "step": 12690 }, { "distill_loss": 0.12791535258293152, "epoch": 4.23282188125417, "step": 12690 }, { "epoch": 4.23282188125417, "ref_ce_loss": 0.15135811269283295, "step": 12690 }, { "epoch": 4.23282188125417, "loss": 0.5207319855690002, "step": 12690 }, { "ce_loss": 0.1772138774394989, "epoch": 4.23282188125417, "step": 12690 }, { "distill_loss": 0.16876177489757538, "epoch": 4.23282188125417, "step": 12690 }, { "epoch": 4.23282188125417, "ref_ce_loss": 0.06250439584255219, "step": 12690 }, { "epoch": 4.236157438292195, "loss": 0.4372, "step": 12700 }, { "epoch": 4.236157438292195, "grad_norm": 3.4386374950408936, "step": 12700 }, { "epoch": 4.236157438292195, "learning_rate": 0.00014295345676225427, "step": 12700 }, { "epoch": 4.236157438292195, "loss": 0.6116504669189453, "step": 12700 }, { "ce_loss": 0.12445548921823502, "epoch": 4.236157438292195, "step": 12700 }, { "distill_loss": 0.18399251997470856, "epoch": 4.236157438292195, "step": 12700 }, { "epoch": 4.236157438292195, "ref_ce_loss": 0.10828568041324615, "step": 12700 }, { "epoch": 4.236157438292195, "loss": 0.41157230734825134, "step": 12700 }, { "ce_loss": 0.058100927621126175, "epoch": 4.236157438292195, "step": 12700 }, { "distill_loss": 0.12735587358474731, "epoch": 4.236157438292195, "step": 12700 }, { "epoch": 4.236157438292195, "ref_ce_loss": 0.08694470673799515, "step": 12700 }, { "epoch": 4.236157438292195, "loss": 0.29641038179397583, "step": 12700 }, { "ce_loss": 0.05376739799976349, "epoch": 4.236157438292195, "step": 12700 }, { "distill_loss": 0.15917444229125977, "epoch": 4.236157438292195, "step": 12700 }, { "epoch": 4.236157438292195, "ref_ce_loss": 0.05772160366177559, "step": 12700 }, { "epoch": 4.236157438292195, "loss": 0.37515804171562195, "step": 12700 }, { "ce_loss": 0.08279716223478317, "epoch": 4.236157438292195, "step": 12700 }, { "distill_loss": 0.15383553504943848, "epoch": 4.236157438292195, "step": 12700 }, { "epoch": 4.236157438292195, "ref_ce_loss": 0.11051376909017563, "step": 12700 }, { "epoch": 4.23949299533022, "loss": 0.4558, "step": 12710 }, { "epoch": 4.23949299533022, "grad_norm": 2.184640884399414, "step": 12710 }, { "epoch": 4.23949299533022, "learning_rate": 0.0001427511254574717, "step": 12710 }, { "epoch": 4.23949299533022, "loss": 0.5931957960128784, "step": 12710 }, { "ce_loss": 0.16643624007701874, "epoch": 4.23949299533022, "step": 12710 }, { "distill_loss": 0.168610081076622, "epoch": 4.23949299533022, "step": 12710 }, { "epoch": 4.23949299533022, "ref_ce_loss": 0.07215865701436996, "step": 12710 }, { "epoch": 4.23949299533022, "loss": 0.4027775526046753, "step": 12710 }, { "ce_loss": 0.12669742107391357, "epoch": 4.23949299533022, "step": 12710 }, { "distill_loss": 0.16142867505550385, "epoch": 4.23949299533022, "step": 12710 }, { "epoch": 4.23949299533022, "ref_ce_loss": 0.09073889255523682, "step": 12710 }, { "epoch": 4.23949299533022, "loss": 0.5018163323402405, "step": 12710 }, { "ce_loss": 0.15854166448116302, "epoch": 4.23949299533022, "step": 12710 }, { "distill_loss": 0.18602553009986877, "epoch": 4.23949299533022, "step": 12710 }, { "epoch": 4.23949299533022, "ref_ce_loss": 0.11441418528556824, "step": 12710 }, { "epoch": 4.23949299533022, "loss": 0.3939116299152374, "step": 12710 }, { "ce_loss": 0.06274707615375519, "epoch": 4.23949299533022, "step": 12710 }, { "distill_loss": 0.11968748271465302, "epoch": 4.23949299533022, "step": 12710 }, { "epoch": 4.23949299533022, "ref_ce_loss": 0.12243720889091492, "step": 12710 }, { "epoch": 4.242828552368246, "loss": 0.4039, "step": 12720 }, { "epoch": 4.242828552368246, "grad_norm": 1.5773797035217285, "step": 12720 }, { "epoch": 4.242828552368246, "learning_rate": 0.00014254880737177696, "step": 12720 }, { "epoch": 4.242828552368246, "loss": 0.750359296798706, "step": 12720 }, { "ce_loss": 0.11326040327548981, "epoch": 4.242828552368246, "step": 12720 }, { "distill_loss": 0.1426267921924591, "epoch": 4.242828552368246, "step": 12720 }, { "epoch": 4.242828552368246, "ref_ce_loss": 0.08958389610052109, "step": 12720 }, { "epoch": 4.242828552368246, "loss": 0.8291692733764648, "step": 12720 }, { "ce_loss": 0.10521968454122543, "epoch": 4.242828552368246, "step": 12720 }, { "distill_loss": 0.15522602200508118, "epoch": 4.242828552368246, "step": 12720 }, { "epoch": 4.242828552368246, "ref_ce_loss": 0.10163272172212601, "step": 12720 }, { "epoch": 4.242828552368246, "loss": 0.3573615849018097, "step": 12720 }, { "ce_loss": 0.08448813855648041, "epoch": 4.242828552368246, "step": 12720 }, { "distill_loss": 0.1453476995229721, "epoch": 4.242828552368246, "step": 12720 }, { "epoch": 4.242828552368246, "ref_ce_loss": 0.08841101080179214, "step": 12720 }, { "epoch": 4.242828552368246, "loss": 0.3858444094657898, "step": 12720 }, { "ce_loss": 0.1092870756983757, "epoch": 4.242828552368246, "step": 12720 }, { "distill_loss": 0.1326705813407898, "epoch": 4.242828552368246, "step": 12720 }, { "epoch": 4.242828552368246, "ref_ce_loss": 0.11946248263120651, "step": 12720 }, { "epoch": 4.246164109406271, "loss": 0.4509, "step": 12730 }, { "epoch": 4.246164109406271, "grad_norm": 2.041639804840088, "step": 12730 }, { "epoch": 4.246164109406271, "learning_rate": 0.00014234650287411825, "step": 12730 }, { "epoch": 4.246164109406271, "loss": 0.5689536929130554, "step": 12730 }, { "ce_loss": 0.05791715532541275, "epoch": 4.246164109406271, "step": 12730 }, { "distill_loss": 0.09933005273342133, "epoch": 4.246164109406271, "step": 12730 }, { "epoch": 4.246164109406271, "ref_ce_loss": 0.05065986141562462, "step": 12730 }, { "epoch": 4.246164109406271, "loss": 0.4074205160140991, "step": 12730 }, { "ce_loss": 0.13159328699111938, "epoch": 4.246164109406271, "step": 12730 }, { "distill_loss": 0.18942898511886597, "epoch": 4.246164109406271, "step": 12730 }, { "epoch": 4.246164109406271, "ref_ce_loss": 0.06139923259615898, "step": 12730 }, { "epoch": 4.246164109406271, "loss": 0.28600767254829407, "step": 12730 }, { "ce_loss": 0.0682460144162178, "epoch": 4.246164109406271, "step": 12730 }, { "distill_loss": 0.11950859427452087, "epoch": 4.246164109406271, "step": 12730 }, { "epoch": 4.246164109406271, "ref_ce_loss": 0.05969545617699623, "step": 12730 }, { "epoch": 4.246164109406271, "loss": 0.2821713984012604, "step": 12730 }, { "ce_loss": 0.08232744038105011, "epoch": 4.246164109406271, "step": 12730 }, { "distill_loss": 0.10216699540615082, "epoch": 4.246164109406271, "step": 12730 }, { "epoch": 4.246164109406271, "ref_ce_loss": 0.0690549984574318, "step": 12730 }, { "epoch": 4.249499666444296, "loss": 0.4278, "step": 12740 }, { "epoch": 4.249499666444296, "grad_norm": 2.6584384441375732, "step": 12740 }, { "epoch": 4.249499666444296, "learning_rate": 0.00014214421233341927, "step": 12740 }, { "epoch": 4.249499666444296, "loss": 0.38530316948890686, "step": 12740 }, { "ce_loss": 0.06119230389595032, "epoch": 4.249499666444296, "step": 12740 }, { "distill_loss": 0.13711406290531158, "epoch": 4.249499666444296, "step": 12740 }, { "epoch": 4.249499666444296, "ref_ce_loss": 0.09056949615478516, "step": 12740 }, { "epoch": 4.249499666444296, "loss": 0.3610149919986725, "step": 12740 }, { "ce_loss": 0.1303795576095581, "epoch": 4.249499666444296, "step": 12740 }, { "distill_loss": 0.10823221504688263, "epoch": 4.249499666444296, "step": 12740 }, { "epoch": 4.249499666444296, "ref_ce_loss": 0.12236493825912476, "step": 12740 }, { "epoch": 4.249499666444296, "loss": 0.37968602776527405, "step": 12740 }, { "ce_loss": 0.11002121865749359, "epoch": 4.249499666444296, "step": 12740 }, { "distill_loss": 0.15049538016319275, "epoch": 4.249499666444296, "step": 12740 }, { "epoch": 4.249499666444296, "ref_ce_loss": 0.08402389287948608, "step": 12740 }, { "epoch": 4.249499666444296, "loss": 0.30963027477264404, "step": 12740 }, { "ce_loss": 0.09638215601444244, "epoch": 4.249499666444296, "step": 12740 }, { "distill_loss": 0.10256923735141754, "epoch": 4.249499666444296, "step": 12740 }, { "epoch": 4.249499666444296, "ref_ce_loss": 0.08142927289009094, "step": 12740 }, { "epoch": 4.252835223482322, "loss": 0.4419, "step": 12750 }, { "epoch": 4.252835223482322, "grad_norm": 2.8495850563049316, "step": 12750 }, { "epoch": 4.252835223482322, "learning_rate": 0.0001419419361185781, "step": 12750 }, { "epoch": 4.252835223482322, "loss": 0.49968162178993225, "step": 12750 }, { "ce_loss": 0.11410761624574661, "epoch": 4.252835223482322, "step": 12750 }, { "distill_loss": 0.2119593620300293, "epoch": 4.252835223482322, "step": 12750 }, { "epoch": 4.252835223482322, "ref_ce_loss": 0.09284225106239319, "step": 12750 }, { "epoch": 4.252835223482322, "loss": 0.49200567603111267, "step": 12750 }, { "ce_loss": 0.18147218227386475, "epoch": 4.252835223482322, "step": 12750 }, { "distill_loss": 0.16002869606018066, "epoch": 4.252835223482322, "step": 12750 }, { "epoch": 4.252835223482322, "ref_ce_loss": 0.057403866201639175, "step": 12750 }, { "epoch": 4.252835223482322, "loss": 0.32698220014572144, "step": 12750 }, { "ce_loss": 0.08053304255008698, "epoch": 4.252835223482322, "step": 12750 }, { "distill_loss": 0.16222812235355377, "epoch": 4.252835223482322, "step": 12750 }, { "epoch": 4.252835223482322, "ref_ce_loss": 0.08418859541416168, "step": 12750 }, { "epoch": 4.252835223482322, "loss": 0.37825435400009155, "step": 12750 }, { "ce_loss": 0.11178170144557953, "epoch": 4.252835223482322, "step": 12750 }, { "distill_loss": 0.13081073760986328, "epoch": 4.252835223482322, "step": 12750 }, { "epoch": 4.252835223482322, "ref_ce_loss": 0.09388069063425064, "step": 12750 }, { "epoch": 4.256170780520347, "loss": 0.4337, "step": 12760 }, { "epoch": 4.256170780520347, "grad_norm": 3.041410207748413, "step": 12760 }, { "epoch": 4.256170780520347, "learning_rate": 0.00014173967459846684, "step": 12760 }, { "epoch": 4.256170780520347, "loss": 0.4082252085208893, "step": 12760 }, { "ce_loss": 0.07274634391069412, "epoch": 4.256170780520347, "step": 12760 }, { "distill_loss": 0.18472713232040405, "epoch": 4.256170780520347, "step": 12760 }, { "epoch": 4.256170780520347, "ref_ce_loss": 0.08851490914821625, "step": 12760 }, { "epoch": 4.256170780520347, "loss": 0.4447764456272125, "step": 12760 }, { "ce_loss": 0.15090270340442657, "epoch": 4.256170780520347, "step": 12760 }, { "distill_loss": 0.11900592595338821, "epoch": 4.256170780520347, "step": 12760 }, { "epoch": 4.256170780520347, "ref_ce_loss": 0.0788751021027565, "step": 12760 }, { "epoch": 4.256170780520347, "loss": 0.502668023109436, "step": 12760 }, { "ce_loss": 0.09444031119346619, "epoch": 4.256170780520347, "step": 12760 }, { "distill_loss": 0.1498761624097824, "epoch": 4.256170780520347, "step": 12760 }, { "epoch": 4.256170780520347, "ref_ce_loss": 0.10147394984960556, "step": 12760 }, { "epoch": 4.256170780520347, "loss": 0.7868169546127319, "step": 12760 }, { "ce_loss": 0.14243212342262268, "epoch": 4.256170780520347, "step": 12760 }, { "distill_loss": 0.11945310980081558, "epoch": 4.256170780520347, "step": 12760 }, { "epoch": 4.256170780520347, "ref_ce_loss": 0.04398519545793533, "step": 12760 }, { "epoch": 4.259506337558372, "loss": 0.441, "step": 12770 }, { "epoch": 4.259506337558372, "grad_norm": 2.3769431114196777, "step": 12770 }, { "epoch": 4.259506337558372, "learning_rate": 0.00014153742814193066, "step": 12770 }, { "epoch": 4.259506337558372, "loss": 0.23193150758743286, "step": 12770 }, { "ce_loss": 0.018042435869574547, "epoch": 4.259506337558372, "step": 12770 }, { "distill_loss": 0.11690559983253479, "epoch": 4.259506337558372, "step": 12770 }, { "epoch": 4.259506337558372, "ref_ce_loss": 0.09686476737260818, "step": 12770 }, { "epoch": 4.259506337558372, "loss": 0.38023316860198975, "step": 12770 }, { "ce_loss": 0.14664292335510254, "epoch": 4.259506337558372, "step": 12770 }, { "distill_loss": 0.11862847208976746, "epoch": 4.259506337558372, "step": 12770 }, { "epoch": 4.259506337558372, "ref_ce_loss": 0.09226536750793457, "step": 12770 }, { "epoch": 4.259506337558372, "loss": 0.3144304156303406, "step": 12770 }, { "ce_loss": 0.07538256794214249, "epoch": 4.259506337558372, "step": 12770 }, { "distill_loss": 0.13220840692520142, "epoch": 4.259506337558372, "step": 12770 }, { "epoch": 4.259506337558372, "ref_ce_loss": 0.0619903989136219, "step": 12770 }, { "epoch": 4.259506337558372, "loss": 0.5431100130081177, "step": 12770 }, { "ce_loss": 0.11705781519412994, "epoch": 4.259506337558372, "step": 12770 }, { "distill_loss": 0.13160298764705658, "epoch": 4.259506337558372, "step": 12770 }, { "epoch": 4.259506337558372, "ref_ce_loss": 0.1292382925748825, "step": 12770 }, { "epoch": 4.262841894596398, "loss": 0.4342, "step": 12780 }, { "epoch": 4.262841894596398, "grad_norm": 4.000396728515625, "step": 12780 }, { "epoch": 4.262841894596398, "learning_rate": 0.00014133519711778734, "step": 12780 }, { "epoch": 4.262841894596398, "loss": 0.608391523361206, "step": 12780 }, { "ce_loss": 0.1167433112859726, "epoch": 4.262841894596398, "step": 12780 }, { "distill_loss": 0.16401061415672302, "epoch": 4.262841894596398, "step": 12780 }, { "epoch": 4.262841894596398, "ref_ce_loss": 0.09299744665622711, "step": 12780 }, { "epoch": 4.262841894596398, "loss": 0.4936920404434204, "step": 12780 }, { "ce_loss": 0.11663345247507095, "epoch": 4.262841894596398, "step": 12780 }, { "distill_loss": 0.15554045140743256, "epoch": 4.262841894596398, "step": 12780 }, { "epoch": 4.262841894596398, "ref_ce_loss": 0.0900701954960823, "step": 12780 }, { "epoch": 4.262841894596398, "loss": 0.29712164402008057, "step": 12780 }, { "ce_loss": 0.10573622584342957, "epoch": 4.262841894596398, "step": 12780 }, { "distill_loss": 0.0938921868801117, "epoch": 4.262841894596398, "step": 12780 }, { "epoch": 4.262841894596398, "ref_ce_loss": 0.07227256894111633, "step": 12780 }, { "epoch": 4.262841894596398, "loss": 0.2216934859752655, "step": 12780 }, { "ce_loss": 0.037140414118766785, "epoch": 4.262841894596398, "step": 12780 }, { "distill_loss": 0.11072830855846405, "epoch": 4.262841894596398, "step": 12780 }, { "epoch": 4.262841894596398, "ref_ce_loss": 0.05503278598189354, "step": 12780 }, { "epoch": 4.266177451634423, "loss": 0.4581, "step": 12790 }, { "epoch": 4.266177451634423, "grad_norm": 2.528873920440674, "step": 12790 }, { "epoch": 4.266177451634423, "learning_rate": 0.00014113298189482652, "step": 12790 }, { "epoch": 4.266177451634423, "loss": 0.29387056827545166, "step": 12790 }, { "ce_loss": 0.0636768564581871, "epoch": 4.266177451634423, "step": 12790 }, { "distill_loss": 0.12197493016719818, "epoch": 4.266177451634423, "step": 12790 }, { "epoch": 4.266177451634423, "ref_ce_loss": 0.08008244633674622, "step": 12790 }, { "epoch": 4.266177451634423, "loss": 0.35066884756088257, "step": 12790 }, { "ce_loss": 0.08643355220556259, "epoch": 4.266177451634423, "step": 12790 }, { "distill_loss": 0.1131490170955658, "epoch": 4.266177451634423, "step": 12790 }, { "epoch": 4.266177451634423, "ref_ce_loss": 0.11212163418531418, "step": 12790 }, { "epoch": 4.266177451634423, "loss": 0.5954700708389282, "step": 12790 }, { "ce_loss": 0.1914316564798355, "epoch": 4.266177451634423, "step": 12790 }, { "distill_loss": 0.1783093959093094, "epoch": 4.266177451634423, "step": 12790 }, { "epoch": 4.266177451634423, "ref_ce_loss": 0.1309003233909607, "step": 12790 }, { "epoch": 4.266177451634423, "loss": 0.2525702714920044, "step": 12790 }, { "ce_loss": 0.05564585328102112, "epoch": 4.266177451634423, "step": 12790 }, { "distill_loss": 0.11780757457017899, "epoch": 4.266177451634423, "step": 12790 }, { "epoch": 4.266177451634423, "ref_ce_loss": 0.07907214760780334, "step": 12790 }, { "epoch": 4.269513008672448, "loss": 0.4115, "step": 12800 }, { "epoch": 4.269513008672448, "grad_norm": 3.688870906829834, "step": 12800 }, { "epoch": 4.269513008672448, "learning_rate": 0.00014093078284180892, "step": 12800 }, { "epoch": 4.269513008672448, "loss": 0.2441685050725937, "step": 12800 }, { "ce_loss": 0.07012347877025604, "epoch": 4.269513008672448, "step": 12800 }, { "distill_loss": 0.0923108235001564, "epoch": 4.269513008672448, "step": 12800 }, { "epoch": 4.269513008672448, "ref_ce_loss": 0.06695084273815155, "step": 12800 }, { "epoch": 4.269513008672448, "loss": 0.42757582664489746, "step": 12800 }, { "ce_loss": 0.13388575613498688, "epoch": 4.269513008672448, "step": 12800 }, { "distill_loss": 0.1347452849149704, "epoch": 4.269513008672448, "step": 12800 }, { "epoch": 4.269513008672448, "ref_ce_loss": 0.09412025660276413, "step": 12800 }, { "epoch": 4.269513008672448, "loss": 0.42638054490089417, "step": 12800 }, { "ce_loss": 0.16536660492420197, "epoch": 4.269513008672448, "step": 12800 }, { "distill_loss": 0.18622267246246338, "epoch": 4.269513008672448, "step": 12800 }, { "epoch": 4.269513008672448, "ref_ce_loss": 0.07472360134124756, "step": 12800 }, { "epoch": 4.269513008672448, "loss": 0.42994916439056396, "step": 12800 }, { "ce_loss": 0.1114339604973793, "epoch": 4.269513008672448, "step": 12800 }, { "distill_loss": 0.13885322213172913, "epoch": 4.269513008672448, "step": 12800 }, { "epoch": 4.269513008672448, "ref_ce_loss": 0.1167064681649208, "step": 12800 }, { "epoch": 4.272848565710474, "loss": 0.4355, "step": 12810 }, { "epoch": 4.272848565710474, "grad_norm": 2.8595855236053467, "step": 12810 }, { "epoch": 4.272848565710474, "learning_rate": 0.00014072860032746592, "step": 12810 }, { "epoch": 4.272848565710474, "loss": 0.5688683390617371, "step": 12810 }, { "ce_loss": 0.1201975867152214, "epoch": 4.272848565710474, "step": 12810 }, { "distill_loss": 0.1736508160829544, "epoch": 4.272848565710474, "step": 12810 }, { "epoch": 4.272848565710474, "ref_ce_loss": 0.08065781742334366, "step": 12810 }, { "epoch": 4.272848565710474, "loss": 0.5388529300689697, "step": 12810 }, { "ce_loss": 0.13207614421844482, "epoch": 4.272848565710474, "step": 12810 }, { "distill_loss": 0.22794324159622192, "epoch": 4.272848565710474, "step": 12810 }, { "epoch": 4.272848565710474, "ref_ce_loss": 0.10184848308563232, "step": 12810 }, { "epoch": 4.272848565710474, "loss": 0.39517244696617126, "step": 12810 }, { "ce_loss": 0.059914980083703995, "epoch": 4.272848565710474, "step": 12810 }, { "distill_loss": 0.16114996373653412, "epoch": 4.272848565710474, "step": 12810 }, { "epoch": 4.272848565710474, "ref_ce_loss": 0.08862312138080597, "step": 12810 }, { "epoch": 4.272848565710474, "loss": 0.4757397472858429, "step": 12810 }, { "ce_loss": 0.12424880266189575, "epoch": 4.272848565710474, "step": 12810 }, { "distill_loss": 0.15512457489967346, "epoch": 4.272848565710474, "step": 12810 }, { "epoch": 4.272848565710474, "ref_ce_loss": 0.07396658509969711, "step": 12810 }, { "epoch": 4.276184122748499, "loss": 0.4181, "step": 12820 }, { "epoch": 4.276184122748499, "grad_norm": 3.4736826419830322, "step": 12820 }, { "epoch": 4.276184122748499, "learning_rate": 0.0001405264347204987, "step": 12820 }, { "epoch": 4.276184122748499, "loss": 0.4792317748069763, "step": 12820 }, { "ce_loss": 0.07258214801549911, "epoch": 4.276184122748499, "step": 12820 }, { "distill_loss": 0.16453388333320618, "epoch": 4.276184122748499, "step": 12820 }, { "epoch": 4.276184122748499, "ref_ce_loss": 0.08609256148338318, "step": 12820 }, { "epoch": 4.276184122748499, "loss": 0.32401522994041443, "step": 12820 }, { "ce_loss": 0.10857275873422623, "epoch": 4.276184122748499, "step": 12820 }, { "distill_loss": 0.11418082565069199, "epoch": 4.276184122748499, "step": 12820 }, { "epoch": 4.276184122748499, "ref_ce_loss": 0.08071093261241913, "step": 12820 }, { "epoch": 4.276184122748499, "loss": 0.309609055519104, "step": 12820 }, { "ce_loss": 0.07577096670866013, "epoch": 4.276184122748499, "step": 12820 }, { "distill_loss": 0.1278049200773239, "epoch": 4.276184122748499, "step": 12820 }, { "epoch": 4.276184122748499, "ref_ce_loss": 0.075919009745121, "step": 12820 }, { "epoch": 4.276184122748499, "loss": 0.39440685510635376, "step": 12820 }, { "ce_loss": 0.1170484647154808, "epoch": 4.276184122748499, "step": 12820 }, { "distill_loss": 0.14061780273914337, "epoch": 4.276184122748499, "step": 12820 }, { "epoch": 4.276184122748499, "ref_ce_loss": 0.08952930569648743, "step": 12820 }, { "epoch": 4.2795196797865245, "loss": 0.4295, "step": 12830 }, { "epoch": 4.2795196797865245, "grad_norm": 1.8937972784042358, "step": 12830 }, { "epoch": 4.2795196797865245, "learning_rate": 0.00014032428638957747, "step": 12830 }, { "epoch": 4.2795196797865245, "loss": 0.4438021779060364, "step": 12830 }, { "ce_loss": 0.1217060387134552, "epoch": 4.2795196797865245, "step": 12830 }, { "distill_loss": 0.1110411062836647, "epoch": 4.2795196797865245, "step": 12830 }, { "epoch": 4.2795196797865245, "ref_ce_loss": 0.14042538404464722, "step": 12830 }, { "epoch": 4.2795196797865245, "loss": 0.30124080181121826, "step": 12830 }, { "ce_loss": 0.05258672684431076, "epoch": 4.2795196797865245, "step": 12830 }, { "distill_loss": 0.09894955903291702, "epoch": 4.2795196797865245, "step": 12830 }, { "epoch": 4.2795196797865245, "ref_ce_loss": 0.08771368116140366, "step": 12830 }, { "epoch": 4.2795196797865245, "loss": 0.7485233545303345, "step": 12830 }, { "ce_loss": 0.046000298112630844, "epoch": 4.2795196797865245, "step": 12830 }, { "distill_loss": 0.10718851536512375, "epoch": 4.2795196797865245, "step": 12830 }, { "epoch": 4.2795196797865245, "ref_ce_loss": 0.08277367800474167, "step": 12830 }, { "epoch": 4.2795196797865245, "loss": 0.4868257939815521, "step": 12830 }, { "ce_loss": 0.15199749171733856, "epoch": 4.2795196797865245, "step": 12830 }, { "distill_loss": 0.15022709965705872, "epoch": 4.2795196797865245, "step": 12830 }, { "epoch": 4.2795196797865245, "ref_ce_loss": 0.06878019124269485, "step": 12830 }, { "epoch": 4.28285523682455, "loss": 0.3795, "step": 12840 }, { "epoch": 4.28285523682455, "grad_norm": 2.2695956230163574, "step": 12840 }, { "epoch": 4.28285523682455, "learning_rate": 0.0001401221557033411, "step": 12840 }, { "epoch": 4.28285523682455, "loss": 0.4565892815589905, "step": 12840 }, { "ce_loss": 0.10048975050449371, "epoch": 4.28285523682455, "step": 12840 }, { "distill_loss": 0.12798023223876953, "epoch": 4.28285523682455, "step": 12840 }, { "epoch": 4.28285523682455, "ref_ce_loss": 0.11618823558092117, "step": 12840 }, { "epoch": 4.28285523682455, "loss": 0.36068567633628845, "step": 12840 }, { "ce_loss": 0.09661982953548431, "epoch": 4.28285523682455, "step": 12840 }, { "distill_loss": 0.15107427537441254, "epoch": 4.28285523682455, "step": 12840 }, { "epoch": 4.28285523682455, "ref_ce_loss": 0.08204665035009384, "step": 12840 }, { "epoch": 4.28285523682455, "loss": 0.3397977948188782, "step": 12840 }, { "ce_loss": 0.08543965220451355, "epoch": 4.28285523682455, "step": 12840 }, { "distill_loss": 0.15523698925971985, "epoch": 4.28285523682455, "step": 12840 }, { "epoch": 4.28285523682455, "ref_ce_loss": 0.07969725877046585, "step": 12840 }, { "epoch": 4.28285523682455, "loss": 0.49361568689346313, "step": 12840 }, { "ce_loss": 0.1492895632982254, "epoch": 4.28285523682455, "step": 12840 }, { "distill_loss": 0.18394288420677185, "epoch": 4.28285523682455, "step": 12840 }, { "epoch": 4.28285523682455, "ref_ce_loss": 0.1353003978729248, "step": 12840 }, { "epoch": 4.286190793862575, "loss": 0.4424, "step": 12850 }, { "epoch": 4.286190793862575, "grad_norm": 2.4746813774108887, "step": 12850 }, { "epoch": 4.286190793862575, "learning_rate": 0.0001399200430303963, "step": 12850 }, { "epoch": 4.286190793862575, "loss": 0.6018202900886536, "step": 12850 }, { "ce_loss": 0.16130967438220978, "epoch": 4.286190793862575, "step": 12850 }, { "distill_loss": 0.22548778355121613, "epoch": 4.286190793862575, "step": 12850 }, { "epoch": 4.286190793862575, "ref_ce_loss": 0.11540760099887848, "step": 12850 }, { "epoch": 4.286190793862575, "loss": 0.5410385727882385, "step": 12850 }, { "ce_loss": 0.13062411546707153, "epoch": 4.286190793862575, "step": 12850 }, { "distill_loss": 0.17460712790489197, "epoch": 4.286190793862575, "step": 12850 }, { "epoch": 4.286190793862575, "ref_ce_loss": 0.1254759281873703, "step": 12850 }, { "epoch": 4.286190793862575, "loss": 0.7663313150405884, "step": 12850 }, { "ce_loss": 0.07273073494434357, "epoch": 4.286190793862575, "step": 12850 }, { "distill_loss": 0.11530451476573944, "epoch": 4.286190793862575, "step": 12850 }, { "epoch": 4.286190793862575, "ref_ce_loss": 0.058275263756513596, "step": 12850 }, { "epoch": 4.286190793862575, "loss": 0.5315554141998291, "step": 12850 }, { "ce_loss": 0.1627359837293625, "epoch": 4.286190793862575, "step": 12850 }, { "distill_loss": 0.204071044921875, "epoch": 4.286190793862575, "step": 12850 }, { "epoch": 4.286190793862575, "ref_ce_loss": 0.130750373005867, "step": 12850 }, { "epoch": 4.2895263509006005, "loss": 0.4477, "step": 12860 }, { "epoch": 4.2895263509006005, "grad_norm": 2.307910203933716, "step": 12860 }, { "epoch": 4.2895263509006005, "learning_rate": 0.00013971794873931674, "step": 12860 }, { "epoch": 4.2895263509006005, "loss": 0.47329601645469666, "step": 12860 }, { "ce_loss": 0.16464509069919586, "epoch": 4.2895263509006005, "step": 12860 }, { "distill_loss": 0.1366921365261078, "epoch": 4.2895263509006005, "step": 12860 }, { "epoch": 4.2895263509006005, "ref_ce_loss": 0.1367800086736679, "step": 12860 }, { "epoch": 4.2895263509006005, "loss": 0.7709516286849976, "step": 12860 }, { "ce_loss": 0.09364837408065796, "epoch": 4.2895263509006005, "step": 12860 }, { "distill_loss": 0.12354176491498947, "epoch": 4.2895263509006005, "step": 12860 }, { "epoch": 4.2895263509006005, "ref_ce_loss": 0.07555168122053146, "step": 12860 }, { "epoch": 4.2895263509006005, "loss": 0.4722943902015686, "step": 12860 }, { "ce_loss": 0.12364023178815842, "epoch": 4.2895263509006005, "step": 12860 }, { "distill_loss": 0.11006634682416916, "epoch": 4.2895263509006005, "step": 12860 }, { "epoch": 4.2895263509006005, "ref_ce_loss": 0.12410124391317368, "step": 12860 }, { "epoch": 4.2895263509006005, "loss": 0.44431930780410767, "step": 12860 }, { "ce_loss": 0.1067247986793518, "epoch": 4.2895263509006005, "step": 12860 }, { "distill_loss": 0.14513981342315674, "epoch": 4.2895263509006005, "step": 12860 }, { "epoch": 4.2895263509006005, "ref_ce_loss": 0.1249135211110115, "step": 12860 }, { "epoch": 4.292861907938626, "loss": 0.4131, "step": 12870 }, { "epoch": 4.292861907938626, "grad_norm": 2.4325857162475586, "step": 12870 }, { "epoch": 4.292861907938626, "learning_rate": 0.0001395158731986428, "step": 12870 }, { "epoch": 4.292861907938626, "loss": 0.5351008772850037, "step": 12870 }, { "ce_loss": 0.16410230100154877, "epoch": 4.292861907938626, "step": 12870 }, { "distill_loss": 0.19365465641021729, "epoch": 4.292861907938626, "step": 12870 }, { "epoch": 4.292861907938626, "ref_ce_loss": 0.08920037001371384, "step": 12870 }, { "epoch": 4.292861907938626, "loss": 0.4966471791267395, "step": 12870 }, { "ce_loss": 0.16271260380744934, "epoch": 4.292861907938626, "step": 12870 }, { "distill_loss": 0.11182349920272827, "epoch": 4.292861907938626, "step": 12870 }, { "epoch": 4.292861907938626, "ref_ce_loss": 0.10847348719835281, "step": 12870 }, { "epoch": 4.292861907938626, "loss": 0.3838697075843811, "step": 12870 }, { "ce_loss": 0.0985030084848404, "epoch": 4.292861907938626, "step": 12870 }, { "distill_loss": 0.10047741234302521, "epoch": 4.292861907938626, "step": 12870 }, { "epoch": 4.292861907938626, "ref_ce_loss": 0.0935835912823677, "step": 12870 }, { "epoch": 4.292861907938626, "loss": 0.3411202132701874, "step": 12870 }, { "ce_loss": 0.08071853220462799, "epoch": 4.292861907938626, "step": 12870 }, { "distill_loss": 0.16016238927841187, "epoch": 4.292861907938626, "step": 12870 }, { "epoch": 4.292861907938626, "ref_ce_loss": 0.05927430838346481, "step": 12870 }, { "epoch": 4.296197464976651, "loss": 0.3782, "step": 12880 }, { "epoch": 4.296197464976651, "grad_norm": 2.5318892002105713, "step": 12880 }, { "epoch": 4.296197464976651, "learning_rate": 0.00013931381677688044, "step": 12880 }, { "epoch": 4.296197464976651, "loss": 0.2524895966053009, "step": 12880 }, { "ce_loss": 0.024324163794517517, "epoch": 4.296197464976651, "step": 12880 }, { "distill_loss": 0.14109152555465698, "epoch": 4.296197464976651, "step": 12880 }, { "epoch": 4.296197464976651, "ref_ce_loss": 0.08681479096412659, "step": 12880 }, { "epoch": 4.296197464976651, "loss": 0.2724023759365082, "step": 12880 }, { "ce_loss": 0.04453759267926216, "epoch": 4.296197464976651, "step": 12880 }, { "distill_loss": 0.1311105340719223, "epoch": 4.296197464976651, "step": 12880 }, { "epoch": 4.296197464976651, "ref_ce_loss": 0.09659942239522934, "step": 12880 }, { "epoch": 4.296197464976651, "loss": 0.5654150247573853, "step": 12880 }, { "ce_loss": 0.10163300484418869, "epoch": 4.296197464976651, "step": 12880 }, { "distill_loss": 0.10299927741289139, "epoch": 4.296197464976651, "step": 12880 }, { "epoch": 4.296197464976651, "ref_ce_loss": 0.07143683731555939, "step": 12880 }, { "epoch": 4.296197464976651, "loss": 0.29135653376579285, "step": 12880 }, { "ce_loss": 0.04858585447072983, "epoch": 4.296197464976651, "step": 12880 }, { "distill_loss": 0.14956815540790558, "epoch": 4.296197464976651, "step": 12880 }, { "epoch": 4.296197464976651, "ref_ce_loss": 0.09311891347169876, "step": 12880 }, { "epoch": 4.299533022014677, "loss": 0.4558, "step": 12890 }, { "epoch": 4.299533022014677, "grad_norm": 3.607006072998047, "step": 12890 }, { "epoch": 4.299533022014677, "learning_rate": 0.0001391117798425009, "step": 12890 }, { "epoch": 4.299533022014677, "loss": 0.588657021522522, "step": 12890 }, { "ce_loss": 0.05568239837884903, "epoch": 4.299533022014677, "step": 12890 }, { "distill_loss": 0.09379260241985321, "epoch": 4.299533022014677, "step": 12890 }, { "epoch": 4.299533022014677, "ref_ce_loss": 0.10455752909183502, "step": 12890 }, { "epoch": 4.299533022014677, "loss": 0.29446786642074585, "step": 12890 }, { "ce_loss": 0.07457101345062256, "epoch": 4.299533022014677, "step": 12890 }, { "distill_loss": 0.13319659233093262, "epoch": 4.299533022014677, "step": 12890 }, { "epoch": 4.299533022014677, "ref_ce_loss": 0.06350753456354141, "step": 12890 }, { "epoch": 4.299533022014677, "loss": 0.4449309706687927, "step": 12890 }, { "ce_loss": 0.0714930072426796, "epoch": 4.299533022014677, "step": 12890 }, { "distill_loss": 0.149851456284523, "epoch": 4.299533022014677, "step": 12890 }, { "epoch": 4.299533022014677, "ref_ce_loss": 0.08158783614635468, "step": 12890 }, { "epoch": 4.299533022014677, "loss": 0.5386632084846497, "step": 12890 }, { "ce_loss": 0.22696617245674133, "epoch": 4.299533022014677, "step": 12890 }, { "distill_loss": 0.15260447561740875, "epoch": 4.299533022014677, "step": 12890 }, { "epoch": 4.299533022014677, "ref_ce_loss": 0.11920776963233948, "step": 12890 }, { "epoch": 4.302868579052702, "loss": 0.4182, "step": 12900 }, { "epoch": 4.302868579052702, "grad_norm": 1.9943865537643433, "step": 12900 }, { "epoch": 4.302868579052702, "learning_rate": 0.00013890976276393998, "step": 12900 }, { "epoch": 4.302868579052702, "loss": 0.6233004331588745, "step": 12900 }, { "ce_loss": 0.1952550858259201, "epoch": 4.302868579052702, "step": 12900 }, { "distill_loss": 0.16152338683605194, "epoch": 4.302868579052702, "step": 12900 }, { "epoch": 4.302868579052702, "ref_ce_loss": 0.11507376283407211, "step": 12900 }, { "epoch": 4.302868579052702, "loss": 0.3819960355758667, "step": 12900 }, { "ce_loss": 0.0760917067527771, "epoch": 4.302868579052702, "step": 12900 }, { "distill_loss": 0.12423792481422424, "epoch": 4.302868579052702, "step": 12900 }, { "epoch": 4.302868579052702, "ref_ce_loss": 0.13592945039272308, "step": 12900 }, { "epoch": 4.302868579052702, "loss": 0.32491952180862427, "step": 12900 }, { "ce_loss": 0.12789982557296753, "epoch": 4.302868579052702, "step": 12900 }, { "distill_loss": 0.10568737983703613, "epoch": 4.302868579052702, "step": 12900 }, { "epoch": 4.302868579052702, "ref_ce_loss": 0.09125398099422455, "step": 12900 }, { "epoch": 4.302868579052702, "loss": 0.38200074434280396, "step": 12900 }, { "ce_loss": 0.130323588848114, "epoch": 4.302868579052702, "step": 12900 }, { "distill_loss": 0.11945810168981552, "epoch": 4.302868579052702, "step": 12900 }, { "epoch": 4.302868579052702, "ref_ce_loss": 0.07663031667470932, "step": 12900 }, { "epoch": 4.306204136090727, "loss": 0.3974, "step": 12910 }, { "epoch": 4.306204136090727, "grad_norm": 2.2518882751464844, "step": 12910 }, { "epoch": 4.306204136090727, "learning_rate": 0.00013870776590959693, "step": 12910 }, { "epoch": 4.306204136090727, "loss": 0.3219544291496277, "step": 12910 }, { "ce_loss": 0.1147737130522728, "epoch": 4.306204136090727, "step": 12910 }, { "distill_loss": 0.1198507696390152, "epoch": 4.306204136090727, "step": 12910 }, { "epoch": 4.306204136090727, "ref_ce_loss": 0.08728796243667603, "step": 12910 }, { "epoch": 4.306204136090727, "loss": 0.40332797169685364, "step": 12910 }, { "ce_loss": 0.103706955909729, "epoch": 4.306204136090727, "step": 12910 }, { "distill_loss": 0.10040242224931717, "epoch": 4.306204136090727, "step": 12910 }, { "epoch": 4.306204136090727, "ref_ce_loss": 0.07252658158540726, "step": 12910 }, { "epoch": 4.306204136090727, "loss": 0.3360239863395691, "step": 12910 }, { "ce_loss": 0.1121392697095871, "epoch": 4.306204136090727, "step": 12910 }, { "distill_loss": 0.11188937723636627, "epoch": 4.306204136090727, "step": 12910 }, { "epoch": 4.306204136090727, "ref_ce_loss": 0.08032336086034775, "step": 12910 }, { "epoch": 4.306204136090727, "loss": 0.27747413516044617, "step": 12910 }, { "ce_loss": 0.06619500368833542, "epoch": 4.306204136090727, "step": 12910 }, { "distill_loss": 0.10412861406803131, "epoch": 4.306204136090727, "step": 12910 }, { "epoch": 4.306204136090727, "ref_ce_loss": 0.08311747759580612, "step": 12910 }, { "epoch": 4.309539693128753, "loss": 0.3994, "step": 12920 }, { "epoch": 4.309539693128753, "grad_norm": 2.2300546169281006, "step": 12920 }, { "epoch": 4.309539693128753, "learning_rate": 0.00013850578964783454, "step": 12920 }, { "epoch": 4.309539693128753, "loss": 0.4618908762931824, "step": 12920 }, { "ce_loss": 0.14769093692302704, "epoch": 4.309539693128753, "step": 12920 }, { "distill_loss": 0.12974810600280762, "epoch": 4.309539693128753, "step": 12920 }, { "epoch": 4.309539693128753, "ref_ce_loss": 0.08218572288751602, "step": 12920 }, { "epoch": 4.309539693128753, "loss": 0.36354151368141174, "step": 12920 }, { "ce_loss": 0.10538803040981293, "epoch": 4.309539693128753, "step": 12920 }, { "distill_loss": 0.12680771946907043, "epoch": 4.309539693128753, "step": 12920 }, { "epoch": 4.309539693128753, "ref_ce_loss": 0.09596758335828781, "step": 12920 }, { "epoch": 4.309539693128753, "loss": 0.3615802526473999, "step": 12920 }, { "ce_loss": 0.14971128106117249, "epoch": 4.309539693128753, "step": 12920 }, { "distill_loss": 0.12290134280920029, "epoch": 4.309539693128753, "step": 12920 }, { "epoch": 4.309539693128753, "ref_ce_loss": 0.07071920484304428, "step": 12920 }, { "epoch": 4.309539693128753, "loss": 0.1990204006433487, "step": 12920 }, { "ce_loss": 0.026068618521094322, "epoch": 4.309539693128753, "step": 12920 }, { "distill_loss": 0.1140662431716919, "epoch": 4.309539693128753, "step": 12920 }, { "epoch": 4.309539693128753, "ref_ce_loss": 0.05882774293422699, "step": 12920 }, { "epoch": 4.312875250166778, "loss": 0.4269, "step": 12930 }, { "epoch": 4.312875250166778, "grad_norm": 2.498473882675171, "step": 12930 }, { "epoch": 4.312875250166778, "learning_rate": 0.00013830383434697765, "step": 12930 }, { "epoch": 4.312875250166778, "loss": 0.9417632818222046, "step": 12930 }, { "ce_loss": 0.07766778767108917, "epoch": 4.312875250166778, "step": 12930 }, { "distill_loss": 0.12822362780570984, "epoch": 4.312875250166778, "step": 12930 }, { "epoch": 4.312875250166778, "ref_ce_loss": 0.10774123668670654, "step": 12930 }, { "epoch": 4.312875250166778, "loss": 0.3450748920440674, "step": 12930 }, { "ce_loss": 0.07935471087694168, "epoch": 4.312875250166778, "step": 12930 }, { "distill_loss": 0.10800100117921829, "epoch": 4.312875250166778, "step": 12930 }, { "epoch": 4.312875250166778, "ref_ce_loss": 0.08134301751852036, "step": 12930 }, { "epoch": 4.312875250166778, "loss": 0.4334663152694702, "step": 12930 }, { "ce_loss": 0.15563234686851501, "epoch": 4.312875250166778, "step": 12930 }, { "distill_loss": 0.1411948800086975, "epoch": 4.312875250166778, "step": 12930 }, { "epoch": 4.312875250166778, "ref_ce_loss": 0.10971379280090332, "step": 12930 }, { "epoch": 4.312875250166778, "loss": 0.8446991443634033, "step": 12930 }, { "ce_loss": 0.09985993802547455, "epoch": 4.312875250166778, "step": 12930 }, { "distill_loss": 0.20070119202136993, "epoch": 4.312875250166778, "step": 12930 }, { "epoch": 4.312875250166778, "ref_ce_loss": 0.11902379244565964, "step": 12930 }, { "epoch": 4.316210807204803, "loss": 0.415, "step": 12940 }, { "epoch": 4.316210807204803, "grad_norm": 3.0529587268829346, "step": 12940 }, { "epoch": 4.316210807204803, "learning_rate": 0.00013810190037531314, "step": 12940 }, { "epoch": 4.316210807204803, "loss": 0.33493930101394653, "step": 12940 }, { "ce_loss": 0.10959524661302567, "epoch": 4.316210807204803, "step": 12940 }, { "distill_loss": 0.13235542178153992, "epoch": 4.316210807204803, "step": 12940 }, { "epoch": 4.316210807204803, "ref_ce_loss": 0.09294150024652481, "step": 12940 }, { "epoch": 4.316210807204803, "loss": 0.43003711104393005, "step": 12940 }, { "ce_loss": 0.1794055849313736, "epoch": 4.316210807204803, "step": 12940 }, { "distill_loss": 0.11357323825359344, "epoch": 4.316210807204803, "step": 12940 }, { "epoch": 4.316210807204803, "ref_ce_loss": 0.13700471818447113, "step": 12940 }, { "epoch": 4.316210807204803, "loss": 0.306405633687973, "step": 12940 }, { "ce_loss": 0.09713051468133926, "epoch": 4.316210807204803, "step": 12940 }, { "distill_loss": 0.11166144907474518, "epoch": 4.316210807204803, "step": 12940 }, { "epoch": 4.316210807204803, "ref_ce_loss": 0.05975410342216492, "step": 12940 }, { "epoch": 4.316210807204803, "loss": 0.4291016161441803, "step": 12940 }, { "ce_loss": 0.09967274218797684, "epoch": 4.316210807204803, "step": 12940 }, { "distill_loss": 0.14119568467140198, "epoch": 4.316210807204803, "step": 12940 }, { "epoch": 4.316210807204803, "ref_ce_loss": 0.12897689640522003, "step": 12940 }, { "epoch": 4.319546364242829, "loss": 0.4561, "step": 12950 }, { "epoch": 4.319546364242829, "grad_norm": 4.655219078063965, "step": 12950 }, { "epoch": 4.319546364242829, "learning_rate": 0.00013789998810108904, "step": 12950 }, { "epoch": 4.319546364242829, "loss": 0.34342288970947266, "step": 12950 }, { "ce_loss": 0.046129222959280014, "epoch": 4.319546364242829, "step": 12950 }, { "distill_loss": 0.10202468931674957, "epoch": 4.319546364242829, "step": 12950 }, { "epoch": 4.319546364242829, "ref_ce_loss": 0.071707583963871, "step": 12950 }, { "epoch": 4.319546364242829, "loss": 0.5038535594940186, "step": 12950 }, { "ce_loss": 0.11776093393564224, "epoch": 4.319546364242829, "step": 12950 }, { "distill_loss": 0.10977289825677872, "epoch": 4.319546364242829, "step": 12950 }, { "epoch": 4.319546364242829, "ref_ce_loss": 0.09411168843507767, "step": 12950 }, { "epoch": 4.319546364242829, "loss": 0.3241851329803467, "step": 12950 }, { "ce_loss": 0.08705181628465652, "epoch": 4.319546364242829, "step": 12950 }, { "distill_loss": 0.11426226794719696, "epoch": 4.319546364242829, "step": 12950 }, { "epoch": 4.319546364242829, "ref_ce_loss": 0.08718805015087128, "step": 12950 }, { "epoch": 4.319546364242829, "loss": 0.316148966550827, "step": 12950 }, { "ce_loss": 0.0800526961684227, "epoch": 4.319546364242829, "step": 12950 }, { "distill_loss": 0.11344519257545471, "epoch": 4.319546364242829, "step": 12950 }, { "epoch": 4.319546364242829, "ref_ce_loss": 0.0967073068022728, "step": 12950 }, { "epoch": 4.322881921280854, "loss": 0.4104, "step": 12960 }, { "epoch": 4.322881921280854, "grad_norm": 2.311840057373047, "step": 12960 }, { "epoch": 4.322881921280854, "learning_rate": 0.00013769809789251347, "step": 12960 }, { "epoch": 4.322881921280854, "loss": 0.7348376512527466, "step": 12960 }, { "ce_loss": 0.08843173086643219, "epoch": 4.322881921280854, "step": 12960 }, { "distill_loss": 0.09694448858499527, "epoch": 4.322881921280854, "step": 12960 }, { "epoch": 4.322881921280854, "ref_ce_loss": 0.08761128038167953, "step": 12960 }, { "epoch": 4.322881921280854, "loss": 0.36715108156204224, "step": 12960 }, { "ce_loss": 0.11807632446289062, "epoch": 4.322881921280854, "step": 12960 }, { "distill_loss": 0.13147501647472382, "epoch": 4.322881921280854, "step": 12960 }, { "epoch": 4.322881921280854, "ref_ce_loss": 0.09746716916561127, "step": 12960 }, { "epoch": 4.322881921280854, "loss": 0.4032805562019348, "step": 12960 }, { "ce_loss": 0.10035624355077744, "epoch": 4.322881921280854, "step": 12960 }, { "distill_loss": 0.16447895765304565, "epoch": 4.322881921280854, "step": 12960 }, { "epoch": 4.322881921280854, "ref_ce_loss": 0.11302655190229416, "step": 12960 }, { "epoch": 4.322881921280854, "loss": 0.5197752714157104, "step": 12960 }, { "ce_loss": 0.10778123140335083, "epoch": 4.322881921280854, "step": 12960 }, { "distill_loss": 0.1056332215666771, "epoch": 4.322881921280854, "step": 12960 }, { "epoch": 4.322881921280854, "ref_ce_loss": 0.060031503438949585, "step": 12960 }, { "epoch": 4.326217478318879, "loss": 0.4357, "step": 12970 }, { "epoch": 4.326217478318879, "grad_norm": 2.313013792037964, "step": 12970 }, { "epoch": 4.326217478318879, "learning_rate": 0.00013749623011775463, "step": 12970 }, { "epoch": 4.326217478318879, "loss": 0.34826603531837463, "step": 12970 }, { "ce_loss": 0.07239992171525955, "epoch": 4.326217478318879, "step": 12970 }, { "distill_loss": 0.15117846429347992, "epoch": 4.326217478318879, "step": 12970 }, { "epoch": 4.326217478318879, "ref_ce_loss": 0.07360713928937912, "step": 12970 }, { "epoch": 4.326217478318879, "loss": 0.35273313522338867, "step": 12970 }, { "ce_loss": 0.057199910283088684, "epoch": 4.326217478318879, "step": 12970 }, { "distill_loss": 0.13013851642608643, "epoch": 4.326217478318879, "step": 12970 }, { "epoch": 4.326217478318879, "ref_ce_loss": 0.12024141103029251, "step": 12970 }, { "epoch": 4.326217478318879, "loss": 0.5760383605957031, "step": 12970 }, { "ce_loss": 0.1058734655380249, "epoch": 4.326217478318879, "step": 12970 }, { "distill_loss": 0.09850041568279266, "epoch": 4.326217478318879, "step": 12970 }, { "epoch": 4.326217478318879, "ref_ce_loss": 0.12359046190977097, "step": 12970 }, { "epoch": 4.326217478318879, "loss": 0.32047203183174133, "step": 12970 }, { "ce_loss": 0.07591608166694641, "epoch": 4.326217478318879, "step": 12970 }, { "distill_loss": 0.1338208019733429, "epoch": 4.326217478318879, "step": 12970 }, { "epoch": 4.326217478318879, "ref_ce_loss": 0.07232265919446945, "step": 12970 }, { "epoch": 4.329553035356905, "loss": 0.4049, "step": 12980 }, { "epoch": 4.329553035356905, "grad_norm": 4.8108649253845215, "step": 12980 }, { "epoch": 4.329553035356905, "learning_rate": 0.00013729438514493983, "step": 12980 }, { "epoch": 4.329553035356905, "loss": 0.7760747671127319, "step": 12980 }, { "ce_loss": 0.1079782098531723, "epoch": 4.329553035356905, "step": 12980 }, { "distill_loss": 0.17459788918495178, "epoch": 4.329553035356905, "step": 12980 }, { "epoch": 4.329553035356905, "ref_ce_loss": 0.08016688376665115, "step": 12980 }, { "epoch": 4.329553035356905, "loss": 0.4666213393211365, "step": 12980 }, { "ce_loss": 0.09899456799030304, "epoch": 4.329553035356905, "step": 12980 }, { "distill_loss": 0.18947945535182953, "epoch": 4.329553035356905, "step": 12980 }, { "epoch": 4.329553035356905, "ref_ce_loss": 0.1227281391620636, "step": 12980 }, { "epoch": 4.329553035356905, "loss": 0.4355705976486206, "step": 12980 }, { "ce_loss": 0.11216352880001068, "epoch": 4.329553035356905, "step": 12980 }, { "distill_loss": 0.13032612204551697, "epoch": 4.329553035356905, "step": 12980 }, { "epoch": 4.329553035356905, "ref_ce_loss": 0.10035806149244308, "step": 12980 }, { "epoch": 4.329553035356905, "loss": 0.4230186343193054, "step": 12980 }, { "ce_loss": 0.04993223026394844, "epoch": 4.329553035356905, "step": 12980 }, { "distill_loss": 0.2290044128894806, "epoch": 4.329553035356905, "step": 12980 }, { "epoch": 4.329553035356905, "ref_ce_loss": 0.09435595571994781, "step": 12980 }, { "epoch": 4.33288859239493, "loss": 0.4533, "step": 12990 }, { "epoch": 4.33288859239493, "grad_norm": 4.167677879333496, "step": 12990 }, { "epoch": 4.33288859239493, "learning_rate": 0.00013709256334215445, "step": 12990 }, { "epoch": 4.33288859239493, "loss": 0.9822889566421509, "step": 12990 }, { "ce_loss": 0.14912807941436768, "epoch": 4.33288859239493, "step": 12990 }, { "distill_loss": 0.15161332488059998, "epoch": 4.33288859239493, "step": 12990 }, { "epoch": 4.33288859239493, "ref_ce_loss": 0.08670554310083389, "step": 12990 }, { "epoch": 4.33288859239493, "loss": 0.38593828678131104, "step": 12990 }, { "ce_loss": 0.08408203721046448, "epoch": 4.33288859239493, "step": 12990 }, { "distill_loss": 0.15504911541938782, "epoch": 4.33288859239493, "step": 12990 }, { "epoch": 4.33288859239493, "ref_ce_loss": 0.08915705233812332, "step": 12990 }, { "epoch": 4.33288859239493, "loss": 0.3300465941429138, "step": 12990 }, { "ce_loss": 0.05601300671696663, "epoch": 4.33288859239493, "step": 12990 }, { "distill_loss": 0.0853685736656189, "epoch": 4.33288859239493, "step": 12990 }, { "epoch": 4.33288859239493, "ref_ce_loss": 0.06712325662374496, "step": 12990 }, { "epoch": 4.33288859239493, "loss": 0.3273487687110901, "step": 12990 }, { "ce_loss": 0.07666195929050446, "epoch": 4.33288859239493, "step": 12990 }, { "distill_loss": 0.13456887006759644, "epoch": 4.33288859239493, "step": 12990 }, { "epoch": 4.33288859239493, "ref_ce_loss": 0.08263428509235382, "step": 12990 }, { "epoch": 4.336224149432955, "loss": 0.4599, "step": 13000 }, { "epoch": 4.336224149432955, "grad_norm": 3.2254204750061035, "step": 13000 }, { "epoch": 4.336224149432955, "learning_rate": 0.00013689076507744207, "step": 13000 }, { "epoch": 4.336224149432955, "loss": 0.27640801668167114, "step": 13000 }, { "ce_loss": 0.06200258433818817, "epoch": 4.336224149432955, "step": 13000 }, { "distill_loss": 0.11888018995523453, "epoch": 4.336224149432955, "step": 13000 }, { "epoch": 4.336224149432955, "ref_ce_loss": 0.07222095876932144, "step": 13000 }, { "epoch": 4.336224149432955, "loss": 0.2827991843223572, "step": 13000 }, { "ce_loss": 0.07215210050344467, "epoch": 4.336224149432955, "step": 13000 }, { "distill_loss": 0.133390411734581, "epoch": 4.336224149432955, "step": 13000 }, { "epoch": 4.336224149432955, "ref_ce_loss": 0.0737648531794548, "step": 13000 }, { "epoch": 4.336224149432955, "loss": 0.3525514304637909, "step": 13000 }, { "ce_loss": 0.08922962844371796, "epoch": 4.336224149432955, "step": 13000 }, { "distill_loss": 0.11442573368549347, "epoch": 4.336224149432955, "step": 13000 }, { "epoch": 4.336224149432955, "ref_ce_loss": 0.0543455071747303, "step": 13000 }, { "epoch": 4.336224149432955, "loss": 0.27908873558044434, "step": 13000 }, { "ce_loss": 0.05551055073738098, "epoch": 4.336224149432955, "step": 13000 }, { "distill_loss": 0.11368223279714584, "epoch": 4.336224149432955, "step": 13000 }, { "epoch": 4.336224149432955, "ref_ce_loss": 0.10890942811965942, "step": 13000 }, { "epoch": 4.339559706470981, "loss": 0.3712, "step": 13010 }, { "epoch": 4.339559706470981, "grad_norm": 3.2621657848358154, "step": 13010 }, { "epoch": 4.339559706470981, "learning_rate": 0.00013668899071880297, "step": 13010 }, { "epoch": 4.339559706470981, "loss": 0.27522847056388855, "step": 13010 }, { "ce_loss": 0.07751738280057907, "epoch": 4.339559706470981, "step": 13010 }, { "distill_loss": 0.13736335933208466, "epoch": 4.339559706470981, "step": 13010 }, { "epoch": 4.339559706470981, "ref_ce_loss": 0.06012954190373421, "step": 13010 }, { "epoch": 4.339559706470981, "loss": 0.7454414367675781, "step": 13010 }, { "ce_loss": 0.12859775125980377, "epoch": 4.339559706470981, "step": 13010 }, { "distill_loss": 0.17422287166118622, "epoch": 4.339559706470981, "step": 13010 }, { "epoch": 4.339559706470981, "ref_ce_loss": 0.12988649308681488, "step": 13010 }, { "epoch": 4.339559706470981, "loss": 0.4908011555671692, "step": 13010 }, { "ce_loss": 0.15541724860668182, "epoch": 4.339559706470981, "step": 13010 }, { "distill_loss": 0.15849357843399048, "epoch": 4.339559706470981, "step": 13010 }, { "epoch": 4.339559706470981, "ref_ce_loss": 0.10041403025388718, "step": 13010 }, { "epoch": 4.339559706470981, "loss": 0.3782275319099426, "step": 13010 }, { "ce_loss": 0.11592970043420792, "epoch": 4.339559706470981, "step": 13010 }, { "distill_loss": 0.19228774309158325, "epoch": 4.339559706470981, "step": 13010 }, { "epoch": 4.339559706470981, "ref_ce_loss": 0.06995546817779541, "step": 13010 }, { "epoch": 4.342895263509006, "loss": 0.4515, "step": 13020 }, { "epoch": 4.342895263509006, "grad_norm": 2.6184566020965576, "step": 13020 }, { "epoch": 4.342895263509006, "learning_rate": 0.000136487240634194, "step": 13020 }, { "epoch": 4.342895263509006, "loss": 0.453509122133255, "step": 13020 }, { "ce_loss": 0.10892651230096817, "epoch": 4.342895263509006, "step": 13020 }, { "distill_loss": 0.2049221247434616, "epoch": 4.342895263509006, "step": 13020 }, { "epoch": 4.342895263509006, "ref_ce_loss": 0.1100277379155159, "step": 13020 }, { "epoch": 4.342895263509006, "loss": 0.42655685544013977, "step": 13020 }, { "ce_loss": 0.06593813747167587, "epoch": 4.342895263509006, "step": 13020 }, { "distill_loss": 0.18975017964839935, "epoch": 4.342895263509006, "step": 13020 }, { "epoch": 4.342895263509006, "ref_ce_loss": 0.10121740400791168, "step": 13020 }, { "epoch": 4.342895263509006, "loss": 0.3206443786621094, "step": 13020 }, { "ce_loss": 0.11419174075126648, "epoch": 4.342895263509006, "step": 13020 }, { "distill_loss": 0.13377337157726288, "epoch": 4.342895263509006, "step": 13020 }, { "epoch": 4.342895263509006, "ref_ce_loss": 0.056987863034009933, "step": 13020 }, { "epoch": 4.342895263509006, "loss": 0.34542882442474365, "step": 13020 }, { "ce_loss": 0.11716454476118088, "epoch": 4.342895263509006, "step": 13020 }, { "distill_loss": 0.11346367001533508, "epoch": 4.342895263509006, "step": 13020 }, { "epoch": 4.342895263509006, "ref_ce_loss": 0.11472050100564957, "step": 13020 }, { "epoch": 4.3462308205470315, "loss": 0.4454, "step": 13030 }, { "epoch": 4.3462308205470315, "grad_norm": 2.3557121753692627, "step": 13030 }, { "epoch": 4.3462308205470315, "learning_rate": 0.00013628551519152783, "step": 13030 }, { "epoch": 4.3462308205470315, "loss": 0.3510717451572418, "step": 13030 }, { "ce_loss": 0.07808232307434082, "epoch": 4.3462308205470315, "step": 13030 }, { "distill_loss": 0.11191411316394806, "epoch": 4.3462308205470315, "step": 13030 }, { "epoch": 4.3462308205470315, "ref_ce_loss": 0.0939103364944458, "step": 13030 }, { "epoch": 4.3462308205470315, "loss": 0.39723458886146545, "step": 13030 }, { "ce_loss": 0.14701463282108307, "epoch": 4.3462308205470315, "step": 13030 }, { "distill_loss": 0.1371280550956726, "epoch": 4.3462308205470315, "step": 13030 }, { "epoch": 4.3462308205470315, "ref_ce_loss": 0.0885378047823906, "step": 13030 }, { "epoch": 4.3462308205470315, "loss": 0.5219001770019531, "step": 13030 }, { "ce_loss": 0.20793606340885162, "epoch": 4.3462308205470315, "step": 13030 }, { "distill_loss": 0.2547592520713806, "epoch": 4.3462308205470315, "step": 13030 }, { "epoch": 4.3462308205470315, "ref_ce_loss": 0.0590997040271759, "step": 13030 }, { "epoch": 4.3462308205470315, "loss": 0.2735883593559265, "step": 13030 }, { "ce_loss": 0.03217329457402229, "epoch": 4.3462308205470315, "step": 13030 }, { "distill_loss": 0.07507725805044174, "epoch": 4.3462308205470315, "step": 13030 }, { "epoch": 4.3462308205470315, "ref_ce_loss": 0.06571685522794724, "step": 13030 }, { "epoch": 4.349566377585057, "loss": 0.4346, "step": 13040 }, { "epoch": 4.349566377585057, "grad_norm": 1.9382163286209106, "step": 13040 }, { "epoch": 4.349566377585057, "learning_rate": 0.000136083814758672, "step": 13040 }, { "epoch": 4.349566377585057, "loss": 0.34804975986480713, "step": 13040 }, { "ce_loss": 0.11048239469528198, "epoch": 4.349566377585057, "step": 13040 }, { "distill_loss": 0.17029768228530884, "epoch": 4.349566377585057, "step": 13040 }, { "epoch": 4.349566377585057, "ref_ce_loss": 0.0672643780708313, "step": 13040 }, { "epoch": 4.349566377585057, "loss": 0.3496224284172058, "step": 13040 }, { "ce_loss": 0.05408291146159172, "epoch": 4.349566377585057, "step": 13040 }, { "distill_loss": 0.18122805655002594, "epoch": 4.349566377585057, "step": 13040 }, { "epoch": 4.349566377585057, "ref_ce_loss": 0.08082221448421478, "step": 13040 }, { "epoch": 4.349566377585057, "loss": 0.7453316450119019, "step": 13040 }, { "ce_loss": 0.14862661063671112, "epoch": 4.349566377585057, "step": 13040 }, { "distill_loss": 0.24175487458705902, "epoch": 4.349566377585057, "step": 13040 }, { "epoch": 4.349566377585057, "ref_ce_loss": 0.09618813544511795, "step": 13040 }, { "epoch": 4.349566377585057, "loss": 0.46699488162994385, "step": 13040 }, { "ce_loss": 0.14248669147491455, "epoch": 4.349566377585057, "step": 13040 }, { "distill_loss": 0.15172013640403748, "epoch": 4.349566377585057, "step": 13040 }, { "epoch": 4.349566377585057, "ref_ce_loss": 0.11561673134565353, "step": 13040 }, { "epoch": 4.352901934623082, "loss": 0.4355, "step": 13050 }, { "epoch": 4.352901934623082, "grad_norm": 2.936624050140381, "step": 13050 }, { "epoch": 4.352901934623082, "learning_rate": 0.00013588213970344855, "step": 13050 }, { "epoch": 4.352901934623082, "loss": 0.44997403025627136, "step": 13050 }, { "ce_loss": 0.13845403492450714, "epoch": 4.352901934623082, "step": 13050 }, { "distill_loss": 0.1162261962890625, "epoch": 4.352901934623082, "step": 13050 }, { "epoch": 4.352901934623082, "ref_ce_loss": 0.10550343990325928, "step": 13050 }, { "epoch": 4.352901934623082, "loss": 0.6536597609519958, "step": 13050 }, { "ce_loss": 0.1402038037776947, "epoch": 4.352901934623082, "step": 13050 }, { "distill_loss": 0.18752452731132507, "epoch": 4.352901934623082, "step": 13050 }, { "epoch": 4.352901934623082, "ref_ce_loss": 0.08539095520973206, "step": 13050 }, { "epoch": 4.352901934623082, "loss": 0.35335773229599, "step": 13050 }, { "ce_loss": 0.09392538666725159, "epoch": 4.352901934623082, "step": 13050 }, { "distill_loss": 0.13803143799304962, "epoch": 4.352901934623082, "step": 13050 }, { "epoch": 4.352901934623082, "ref_ce_loss": 0.09819263964891434, "step": 13050 }, { "epoch": 4.352901934623082, "loss": 0.29352426528930664, "step": 13050 }, { "ce_loss": 0.05404730513691902, "epoch": 4.352901934623082, "step": 13050 }, { "distill_loss": 0.1509397327899933, "epoch": 4.352901934623082, "step": 13050 }, { "epoch": 4.352901934623082, "ref_ce_loss": 0.08849099278450012, "step": 13050 }, { "epoch": 4.3562374916611075, "loss": 0.401, "step": 13060 }, { "epoch": 4.3562374916611075, "grad_norm": 2.245464324951172, "step": 13060 }, { "epoch": 4.3562374916611075, "learning_rate": 0.00013568049039363326, "step": 13060 }, { "epoch": 4.3562374916611075, "loss": 0.34581252932548523, "step": 13060 }, { "ce_loss": 0.07167209684848785, "epoch": 4.3562374916611075, "step": 13060 }, { "distill_loss": 0.13952210545539856, "epoch": 4.3562374916611075, "step": 13060 }, { "epoch": 4.3562374916611075, "ref_ce_loss": 0.06178533658385277, "step": 13060 }, { "epoch": 4.3562374916611075, "loss": 0.3579484522342682, "step": 13060 }, { "ce_loss": 0.07104761153459549, "epoch": 4.3562374916611075, "step": 13060 }, { "distill_loss": 0.19464844465255737, "epoch": 4.3562374916611075, "step": 13060 }, { "epoch": 4.3562374916611075, "ref_ce_loss": 0.06834821403026581, "step": 13060 }, { "epoch": 4.3562374916611075, "loss": 0.47544968128204346, "step": 13060 }, { "ce_loss": 0.1545322984457016, "epoch": 4.3562374916611075, "step": 13060 }, { "distill_loss": 0.16924047470092773, "epoch": 4.3562374916611075, "step": 13060 }, { "epoch": 4.3562374916611075, "ref_ce_loss": 0.09331899136304855, "step": 13060 }, { "epoch": 4.3562374916611075, "loss": 0.27666276693344116, "step": 13060 }, { "ce_loss": 0.099562868475914, "epoch": 4.3562374916611075, "step": 13060 }, { "distill_loss": 0.11031017452478409, "epoch": 4.3562374916611075, "step": 13060 }, { "epoch": 4.3562374916611075, "ref_ce_loss": 0.06676922738552094, "step": 13060 }, { "epoch": 4.359573048699133, "loss": 0.4247, "step": 13070 }, { "epoch": 4.359573048699133, "grad_norm": 2.441117286682129, "step": 13070 }, { "epoch": 4.359573048699133, "learning_rate": 0.00013547886719695486, "step": 13070 }, { "epoch": 4.359573048699133, "loss": 0.4055763781070709, "step": 13070 }, { "ce_loss": 0.1165999099612236, "epoch": 4.359573048699133, "step": 13070 }, { "distill_loss": 0.13579559326171875, "epoch": 4.359573048699133, "step": 13070 }, { "epoch": 4.359573048699133, "ref_ce_loss": 0.10974972695112228, "step": 13070 }, { "epoch": 4.359573048699133, "loss": 0.31318339705467224, "step": 13070 }, { "ce_loss": 0.10058171302080154, "epoch": 4.359573048699133, "step": 13070 }, { "distill_loss": 0.09889546036720276, "epoch": 4.359573048699133, "step": 13070 }, { "epoch": 4.359573048699133, "ref_ce_loss": 0.06917301565408707, "step": 13070 }, { "epoch": 4.359573048699133, "loss": 0.3399220108985901, "step": 13070 }, { "ce_loss": 0.07428082078695297, "epoch": 4.359573048699133, "step": 13070 }, { "distill_loss": 0.17034077644348145, "epoch": 4.359573048699133, "step": 13070 }, { "epoch": 4.359573048699133, "ref_ce_loss": 0.0660679042339325, "step": 13070 }, { "epoch": 4.359573048699133, "loss": 0.9781274795532227, "step": 13070 }, { "ce_loss": 0.14656391739845276, "epoch": 4.359573048699133, "step": 13070 }, { "distill_loss": 0.12748153507709503, "epoch": 4.359573048699133, "step": 13070 }, { "epoch": 4.359573048699133, "ref_ce_loss": 0.09130503237247467, "step": 13070 }, { "epoch": 4.362908605737158, "loss": 0.4427, "step": 13080 }, { "epoch": 4.362908605737158, "grad_norm": 3.1027774810791016, "step": 13080 }, { "epoch": 4.362908605737158, "learning_rate": 0.00013527727048109463, "step": 13080 }, { "epoch": 4.362908605737158, "loss": 0.4412672817707062, "step": 13080 }, { "ce_loss": 0.1664845198392868, "epoch": 4.362908605737158, "step": 13080 }, { "distill_loss": 0.1434018909931183, "epoch": 4.362908605737158, "step": 13080 }, { "epoch": 4.362908605737158, "ref_ce_loss": 0.1312694400548935, "step": 13080 }, { "epoch": 4.362908605737158, "loss": 0.3231005072593689, "step": 13080 }, { "ce_loss": 0.10845252871513367, "epoch": 4.362908605737158, "step": 13080 }, { "distill_loss": 0.140834778547287, "epoch": 4.362908605737158, "step": 13080 }, { "epoch": 4.362908605737158, "ref_ce_loss": 0.07371904700994492, "step": 13080 }, { "epoch": 4.362908605737158, "loss": 0.3200206756591797, "step": 13080 }, { "ce_loss": 0.06258603185415268, "epoch": 4.362908605737158, "step": 13080 }, { "distill_loss": 0.13723817467689514, "epoch": 4.362908605737158, "step": 13080 }, { "epoch": 4.362908605737158, "ref_ce_loss": 0.06365492939949036, "step": 13080 }, { "epoch": 4.362908605737158, "loss": 0.23603607714176178, "step": 13080 }, { "ce_loss": 0.05964049696922302, "epoch": 4.362908605737158, "step": 13080 }, { "distill_loss": 0.11777057498693466, "epoch": 4.362908605737158, "step": 13080 }, { "epoch": 4.362908605737158, "ref_ce_loss": 0.057929519563913345, "step": 13080 }, { "epoch": 4.366244162775184, "loss": 0.417, "step": 13090 }, { "epoch": 4.366244162775184, "grad_norm": 2.3897628784179688, "step": 13090 }, { "epoch": 4.366244162775184, "learning_rate": 0.00013507570061368536, "step": 13090 }, { "epoch": 4.366244162775184, "loss": 0.5743356943130493, "step": 13090 }, { "ce_loss": 0.15212753415107727, "epoch": 4.366244162775184, "step": 13090 }, { "distill_loss": 0.19665196537971497, "epoch": 4.366244162775184, "step": 13090 }, { "epoch": 4.366244162775184, "ref_ce_loss": 0.11735095083713531, "step": 13090 }, { "epoch": 4.366244162775184, "loss": 0.9558845162391663, "step": 13090 }, { "ce_loss": 0.12546709179878235, "epoch": 4.366244162775184, "step": 13090 }, { "distill_loss": 0.17301815748214722, "epoch": 4.366244162775184, "step": 13090 }, { "epoch": 4.366244162775184, "ref_ce_loss": 0.08531199395656586, "step": 13090 }, { "epoch": 4.366244162775184, "loss": 0.4186132550239563, "step": 13090 }, { "ce_loss": 0.1030435711145401, "epoch": 4.366244162775184, "step": 13090 }, { "distill_loss": 0.15109947323799133, "epoch": 4.366244162775184, "step": 13090 }, { "epoch": 4.366244162775184, "ref_ce_loss": 0.09883557260036469, "step": 13090 }, { "epoch": 4.366244162775184, "loss": 0.49957624077796936, "step": 13090 }, { "ce_loss": 0.12739510834217072, "epoch": 4.366244162775184, "step": 13090 }, { "distill_loss": 0.22322194278240204, "epoch": 4.366244162775184, "step": 13090 }, { "epoch": 4.366244162775184, "ref_ce_loss": 0.1483670175075531, "step": 13090 }, { "epoch": 4.369579719813209, "loss": 0.424, "step": 13100 }, { "epoch": 4.369579719813209, "grad_norm": 2.3537068367004395, "step": 13100 }, { "epoch": 4.369579719813209, "learning_rate": 0.00013487415796231103, "step": 13100 }, { "epoch": 4.369579719813209, "loss": 0.382670521736145, "step": 13100 }, { "ce_loss": 0.11593491584062576, "epoch": 4.369579719813209, "step": 13100 }, { "distill_loss": 0.15580274164676666, "epoch": 4.369579719813209, "step": 13100 }, { "epoch": 4.369579719813209, "ref_ce_loss": 0.1107337549328804, "step": 13100 }, { "epoch": 4.369579719813209, "loss": 0.6211612820625305, "step": 13100 }, { "ce_loss": 0.23382166028022766, "epoch": 4.369579719813209, "step": 13100 }, { "distill_loss": 0.20548737049102783, "epoch": 4.369579719813209, "step": 13100 }, { "epoch": 4.369579719813209, "ref_ce_loss": 0.13101726770401, "step": 13100 }, { "epoch": 4.369579719813209, "loss": 0.34104016423225403, "step": 13100 }, { "ce_loss": 0.08137911558151245, "epoch": 4.369579719813209, "step": 13100 }, { "distill_loss": 0.12536190450191498, "epoch": 4.369579719813209, "step": 13100 }, { "epoch": 4.369579719813209, "ref_ce_loss": 0.1014823243021965, "step": 13100 }, { "epoch": 4.369579719813209, "loss": 0.31775692105293274, "step": 13100 }, { "ce_loss": 0.08146476745605469, "epoch": 4.369579719813209, "step": 13100 }, { "distill_loss": 0.11398107558488846, "epoch": 4.369579719813209, "step": 13100 }, { "epoch": 4.369579719813209, "ref_ce_loss": 0.07950994372367859, "step": 13100 }, { "epoch": 4.372915276851234, "loss": 0.4287, "step": 13110 }, { "epoch": 4.372915276851234, "grad_norm": 1.9185400009155273, "step": 13110 }, { "epoch": 4.372915276851234, "learning_rate": 0.00013467264289450593, "step": 13110 }, { "epoch": 4.372915276851234, "loss": 0.3162509500980377, "step": 13110 }, { "ce_loss": 0.1067291647195816, "epoch": 4.372915276851234, "step": 13110 }, { "distill_loss": 0.13923035562038422, "epoch": 4.372915276851234, "step": 13110 }, { "epoch": 4.372915276851234, "ref_ce_loss": 0.07007686793804169, "step": 13110 }, { "epoch": 4.372915276851234, "loss": 0.3750307857990265, "step": 13110 }, { "ce_loss": 0.13682420551776886, "epoch": 4.372915276851234, "step": 13110 }, { "distill_loss": 0.15495645999908447, "epoch": 4.372915276851234, "step": 13110 }, { "epoch": 4.372915276851234, "ref_ce_loss": 0.08312554657459259, "step": 13110 }, { "epoch": 4.372915276851234, "loss": 0.631348192691803, "step": 13110 }, { "ce_loss": 0.13241538405418396, "epoch": 4.372915276851234, "step": 13110 }, { "distill_loss": 0.1382109820842743, "epoch": 4.372915276851234, "step": 13110 }, { "epoch": 4.372915276851234, "ref_ce_loss": 0.09631854295730591, "step": 13110 }, { "epoch": 4.372915276851234, "loss": 0.3689813017845154, "step": 13110 }, { "ce_loss": 0.09740588068962097, "epoch": 4.372915276851234, "step": 13110 }, { "distill_loss": 0.13242462277412415, "epoch": 4.372915276851234, "step": 13110 }, { "epoch": 4.372915276851234, "ref_ce_loss": 0.0812930315732956, "step": 13110 }, { "epoch": 4.37625083388926, "loss": 0.4738, "step": 13120 }, { "epoch": 4.37625083388926, "grad_norm": 2.93460750579834, "step": 13120 }, { "epoch": 4.37625083388926, "learning_rate": 0.00013447115577775403, "step": 13120 }, { "epoch": 4.37625083388926, "loss": 0.2833386957645416, "step": 13120 }, { "ce_loss": 0.03885704651474953, "epoch": 4.37625083388926, "step": 13120 }, { "distill_loss": 0.12924322485923767, "epoch": 4.37625083388926, "step": 13120 }, { "epoch": 4.37625083388926, "ref_ce_loss": 0.07931698113679886, "step": 13120 }, { "epoch": 4.37625083388926, "loss": 0.2440667301416397, "step": 13120 }, { "ce_loss": 0.048960719257593155, "epoch": 4.37625083388926, "step": 13120 }, { "distill_loss": 0.09508942067623138, "epoch": 4.37625083388926, "step": 13120 }, { "epoch": 4.37625083388926, "ref_ce_loss": 0.07707788050174713, "step": 13120 }, { "epoch": 4.37625083388926, "loss": 0.6507571935653687, "step": 13120 }, { "ce_loss": 0.08221402764320374, "epoch": 4.37625083388926, "step": 13120 }, { "distill_loss": 0.11556510627269745, "epoch": 4.37625083388926, "step": 13120 }, { "epoch": 4.37625083388926, "ref_ce_loss": 0.08824213594198227, "step": 13120 }, { "epoch": 4.37625083388926, "loss": 0.3309069275856018, "step": 13120 }, { "ce_loss": 0.07076641917228699, "epoch": 4.37625083388926, "step": 13120 }, { "distill_loss": 0.15475329756736755, "epoch": 4.37625083388926, "step": 13120 }, { "epoch": 4.37625083388926, "ref_ce_loss": 0.10515636205673218, "step": 13120 }, { "epoch": 4.379586390927285, "loss": 0.4077, "step": 13130 }, { "epoch": 4.379586390927285, "grad_norm": 2.9743335247039795, "step": 13130 }, { "epoch": 4.379586390927285, "learning_rate": 0.00013426969697948838, "step": 13130 }, { "epoch": 4.379586390927285, "loss": 0.30378925800323486, "step": 13130 }, { "ce_loss": 0.061307426542043686, "epoch": 4.379586390927285, "step": 13130 }, { "distill_loss": 0.10912270098924637, "epoch": 4.379586390927285, "step": 13130 }, { "epoch": 4.379586390927285, "ref_ce_loss": 0.05688157305121422, "step": 13130 }, { "epoch": 4.379586390927285, "loss": 0.4710403382778168, "step": 13130 }, { "ce_loss": 0.06571760028600693, "epoch": 4.379586390927285, "step": 13130 }, { "distill_loss": 0.210685133934021, "epoch": 4.379586390927285, "step": 13130 }, { "epoch": 4.379586390927285, "ref_ce_loss": 0.060083936899900436, "step": 13130 }, { "epoch": 4.379586390927285, "loss": 0.4687569737434387, "step": 13130 }, { "ce_loss": 0.1300654113292694, "epoch": 4.379586390927285, "step": 13130 }, { "distill_loss": 0.21737553179264069, "epoch": 4.379586390927285, "step": 13130 }, { "epoch": 4.379586390927285, "ref_ce_loss": 0.12073502689599991, "step": 13130 }, { "epoch": 4.379586390927285, "loss": 0.5804852247238159, "step": 13130 }, { "ce_loss": 0.05941832438111305, "epoch": 4.379586390927285, "step": 13130 }, { "distill_loss": 0.13962823152542114, "epoch": 4.379586390927285, "step": 13130 }, { "epoch": 4.379586390927285, "ref_ce_loss": 0.08975160866975784, "step": 13130 }, { "epoch": 4.38292194796531, "loss": 0.4298, "step": 13140 }, { "epoch": 4.38292194796531, "grad_norm": 2.263075113296509, "step": 13140 }, { "epoch": 4.38292194796531, "learning_rate": 0.00013406826686709032, "step": 13140 }, { "epoch": 4.38292194796531, "loss": 0.39446863532066345, "step": 13140 }, { "ce_loss": 0.08332235366106033, "epoch": 4.38292194796531, "step": 13140 }, { "distill_loss": 0.1692182719707489, "epoch": 4.38292194796531, "step": 13140 }, { "epoch": 4.38292194796531, "ref_ce_loss": 0.09426987171173096, "step": 13140 }, { "epoch": 4.38292194796531, "loss": 0.8630024194717407, "step": 13140 }, { "ce_loss": 0.10032361000776291, "epoch": 4.38292194796531, "step": 13140 }, { "distill_loss": 0.16708731651306152, "epoch": 4.38292194796531, "step": 13140 }, { "epoch": 4.38292194796531, "ref_ce_loss": 0.12849758565425873, "step": 13140 }, { "epoch": 4.38292194796531, "loss": 0.4019727408885956, "step": 13140 }, { "ce_loss": 0.14358335733413696, "epoch": 4.38292194796531, "step": 13140 }, { "distill_loss": 0.1606614589691162, "epoch": 4.38292194796531, "step": 13140 }, { "epoch": 4.38292194796531, "ref_ce_loss": 0.06312292814254761, "step": 13140 }, { "epoch": 4.38292194796531, "loss": 0.3633531630039215, "step": 13140 }, { "ce_loss": 0.09154992550611496, "epoch": 4.38292194796531, "step": 13140 }, { "distill_loss": 0.15102434158325195, "epoch": 4.38292194796531, "step": 13140 }, { "epoch": 4.38292194796531, "ref_ce_loss": 0.09260568767786026, "step": 13140 }, { "epoch": 4.386257505003336, "loss": 0.4414, "step": 13150 }, { "epoch": 4.386257505003336, "grad_norm": 4.649960517883301, "step": 13150 }, { "epoch": 4.386257505003336, "learning_rate": 0.00013386686580788893, "step": 13150 }, { "epoch": 4.386257505003336, "loss": 0.3555895686149597, "step": 13150 }, { "ce_loss": 0.11861802637577057, "epoch": 4.386257505003336, "step": 13150 }, { "distill_loss": 0.13332310318946838, "epoch": 4.386257505003336, "step": 13150 }, { "epoch": 4.386257505003336, "ref_ce_loss": 0.08811759948730469, "step": 13150 }, { "epoch": 4.386257505003336, "loss": 0.6896891593933105, "step": 13150 }, { "ce_loss": 0.14115719497203827, "epoch": 4.386257505003336, "step": 13150 }, { "distill_loss": 0.15127448737621307, "epoch": 4.386257505003336, "step": 13150 }, { "epoch": 4.386257505003336, "ref_ce_loss": 0.10953246802091599, "step": 13150 }, { "epoch": 4.386257505003336, "loss": 0.5004947185516357, "step": 13150 }, { "ce_loss": 0.11395592987537384, "epoch": 4.386257505003336, "step": 13150 }, { "distill_loss": 0.17832502722740173, "epoch": 4.386257505003336, "step": 13150 }, { "epoch": 4.386257505003336, "ref_ce_loss": 0.12350393831729889, "step": 13150 }, { "epoch": 4.386257505003336, "loss": 0.45451653003692627, "step": 13150 }, { "ce_loss": 0.17620813846588135, "epoch": 4.386257505003336, "step": 13150 }, { "distill_loss": 0.18706226348876953, "epoch": 4.386257505003336, "step": 13150 }, { "epoch": 4.386257505003336, "ref_ce_loss": 0.06283406913280487, "step": 13150 }, { "epoch": 4.389593062041361, "loss": 0.4374, "step": 13160 }, { "epoch": 4.389593062041361, "grad_norm": 3.5954205989837646, "step": 13160 }, { "epoch": 4.389593062041361, "learning_rate": 0.00013366549416916033, "step": 13160 }, { "epoch": 4.389593062041361, "loss": 0.7484768629074097, "step": 13160 }, { "ce_loss": 0.06970355659723282, "epoch": 4.389593062041361, "step": 13160 }, { "distill_loss": 0.13633976876735687, "epoch": 4.389593062041361, "step": 13160 }, { "epoch": 4.389593062041361, "ref_ce_loss": 0.11723722517490387, "step": 13160 }, { "epoch": 4.389593062041361, "loss": 0.25856563448905945, "step": 13160 }, { "ce_loss": 0.05419513210654259, "epoch": 4.389593062041361, "step": 13160 }, { "distill_loss": 0.12057643383741379, "epoch": 4.389593062041361, "step": 13160 }, { "epoch": 4.389593062041361, "ref_ce_loss": 0.083645299077034, "step": 13160 }, { "epoch": 4.389593062041361, "loss": 0.33344385027885437, "step": 13160 }, { "ce_loss": 0.10062061250209808, "epoch": 4.389593062041361, "step": 13160 }, { "distill_loss": 0.13595770299434662, "epoch": 4.389593062041361, "step": 13160 }, { "epoch": 4.389593062041361, "ref_ce_loss": 0.0702580064535141, "step": 13160 }, { "epoch": 4.389593062041361, "loss": 0.44337448477745056, "step": 13160 }, { "ce_loss": 0.1366061568260193, "epoch": 4.389593062041361, "step": 13160 }, { "distill_loss": 0.19141413271427155, "epoch": 4.389593062041361, "step": 13160 }, { "epoch": 4.389593062041361, "ref_ce_loss": 0.11482523381710052, "step": 13160 }, { "epoch": 4.392928619079386, "loss": 0.4569, "step": 13170 }, { "epoch": 4.392928619079386, "grad_norm": 4.291663646697998, "step": 13170 }, { "epoch": 4.392928619079386, "learning_rate": 0.0001334641523181269, "step": 13170 }, { "epoch": 4.392928619079386, "loss": 0.3736570477485657, "step": 13170 }, { "ce_loss": 0.12922805547714233, "epoch": 4.392928619079386, "step": 13170 }, { "distill_loss": 0.1475725919008255, "epoch": 4.392928619079386, "step": 13170 }, { "epoch": 4.392928619079386, "ref_ce_loss": 0.09670910239219666, "step": 13170 }, { "epoch": 4.392928619079386, "loss": 0.43183663487434387, "step": 13170 }, { "ce_loss": 0.13625575602054596, "epoch": 4.392928619079386, "step": 13170 }, { "distill_loss": 0.1850631982088089, "epoch": 4.392928619079386, "step": 13170 }, { "epoch": 4.392928619079386, "ref_ce_loss": 0.1103607639670372, "step": 13170 }, { "epoch": 4.392928619079386, "loss": 0.41522231698036194, "step": 13170 }, { "ce_loss": 0.08343395590782166, "epoch": 4.392928619079386, "step": 13170 }, { "distill_loss": 0.17501504719257355, "epoch": 4.392928619079386, "step": 13170 }, { "epoch": 4.392928619079386, "ref_ce_loss": 0.1140434592962265, "step": 13170 }, { "epoch": 4.392928619079386, "loss": 0.42343854904174805, "step": 13170 }, { "ce_loss": 0.14579157531261444, "epoch": 4.392928619079386, "step": 13170 }, { "distill_loss": 0.13567650318145752, "epoch": 4.392928619079386, "step": 13170 }, { "epoch": 4.392928619079386, "ref_ce_loss": 0.09917942434549332, "step": 13170 }, { "epoch": 4.396264176117412, "loss": 0.4089, "step": 13180 }, { "epoch": 4.396264176117412, "grad_norm": 7.946816921234131, "step": 13180 }, { "epoch": 4.396264176117412, "learning_rate": 0.00013326284062195682, "step": 13180 }, { "epoch": 4.396264176117412, "loss": 0.2218421846628189, "step": 13180 }, { "ce_loss": 0.054962776601314545, "epoch": 4.396264176117412, "step": 13180 }, { "distill_loss": 0.11377930641174316, "epoch": 4.396264176117412, "step": 13180 }, { "epoch": 4.396264176117412, "ref_ce_loss": 0.052917636930942535, "step": 13180 }, { "epoch": 4.396264176117412, "loss": 0.3381150960922241, "step": 13180 }, { "ce_loss": 0.03516875579953194, "epoch": 4.396264176117412, "step": 13180 }, { "distill_loss": 0.18441802263259888, "epoch": 4.396264176117412, "step": 13180 }, { "epoch": 4.396264176117412, "ref_ce_loss": 0.07337495684623718, "step": 13180 }, { "epoch": 4.396264176117412, "loss": 0.427213579416275, "step": 13180 }, { "ce_loss": 0.13373126089572906, "epoch": 4.396264176117412, "step": 13180 }, { "distill_loss": 0.16650520265102386, "epoch": 4.396264176117412, "step": 13180 }, { "epoch": 4.396264176117412, "ref_ce_loss": 0.07328952848911285, "step": 13180 }, { "epoch": 4.396264176117412, "loss": 0.8363127708435059, "step": 13180 }, { "ce_loss": 0.07693137973546982, "epoch": 4.396264176117412, "step": 13180 }, { "distill_loss": 0.17669403553009033, "epoch": 4.396264176117412, "step": 13180 }, { "epoch": 4.396264176117412, "ref_ce_loss": 0.08033239841461182, "step": 13180 }, { "epoch": 4.399599733155437, "loss": 0.445, "step": 13190 }, { "epoch": 4.399599733155437, "grad_norm": 2.264517068862915, "step": 13190 }, { "epoch": 4.399599733155437, "learning_rate": 0.00013306155944776315, "step": 13190 }, { "epoch": 4.399599733155437, "loss": 0.45710739493370056, "step": 13190 }, { "ce_loss": 0.1478605568408966, "epoch": 4.399599733155437, "step": 13190 }, { "distill_loss": 0.13630427420139313, "epoch": 4.399599733155437, "step": 13190 }, { "epoch": 4.399599733155437, "ref_ce_loss": 0.0982251912355423, "step": 13190 }, { "epoch": 4.399599733155437, "loss": 0.4475752115249634, "step": 13190 }, { "ce_loss": 0.12199734896421432, "epoch": 4.399599733155437, "step": 13190 }, { "distill_loss": 0.15550018846988678, "epoch": 4.399599733155437, "step": 13190 }, { "epoch": 4.399599733155437, "ref_ce_loss": 0.10780463367700577, "step": 13190 }, { "epoch": 4.399599733155437, "loss": 0.34469008445739746, "step": 13190 }, { "ce_loss": 0.1141442283987999, "epoch": 4.399599733155437, "step": 13190 }, { "distill_loss": 0.13208875060081482, "epoch": 4.399599733155437, "step": 13190 }, { "epoch": 4.399599733155437, "ref_ce_loss": 0.06683417409658432, "step": 13190 }, { "epoch": 4.399599733155437, "loss": 0.37320512533187866, "step": 13190 }, { "ce_loss": 0.12317366153001785, "epoch": 4.399599733155437, "step": 13190 }, { "distill_loss": 0.19788286089897156, "epoch": 4.399599733155437, "step": 13190 }, { "epoch": 4.399599733155437, "ref_ce_loss": 0.051998071372509, "step": 13190 }, { "epoch": 4.402935290193462, "loss": 0.4291, "step": 13200 }, { "epoch": 4.402935290193462, "grad_norm": 2.0251078605651855, "step": 13200 }, { "epoch": 4.402935290193462, "learning_rate": 0.00013286030916260337, "step": 13200 }, { "epoch": 4.402935290193462, "loss": 0.4113975167274475, "step": 13200 }, { "ce_loss": 0.10742149502038956, "epoch": 4.402935290193462, "step": 13200 }, { "distill_loss": 0.14522676169872284, "epoch": 4.402935290193462, "step": 13200 }, { "epoch": 4.402935290193462, "ref_ce_loss": 0.10619600117206573, "step": 13200 }, { "epoch": 4.402935290193462, "loss": 0.3976299464702606, "step": 13200 }, { "ce_loss": 0.1575118750333786, "epoch": 4.402935290193462, "step": 13200 }, { "distill_loss": 0.10850942879915237, "epoch": 4.402935290193462, "step": 13200 }, { "epoch": 4.402935290193462, "ref_ce_loss": 0.0715140551328659, "step": 13200 }, { "epoch": 4.402935290193462, "loss": 0.5364969968795776, "step": 13200 }, { "ce_loss": 0.1614045798778534, "epoch": 4.402935290193462, "step": 13200 }, { "distill_loss": 0.23162581026554108, "epoch": 4.402935290193462, "step": 13200 }, { "epoch": 4.402935290193462, "ref_ce_loss": 0.10570058971643448, "step": 13200 }, { "epoch": 4.402935290193462, "loss": 0.3077254891395569, "step": 13200 }, { "ce_loss": 0.09167087078094482, "epoch": 4.402935290193462, "step": 13200 }, { "distill_loss": 0.1461600661277771, "epoch": 4.402935290193462, "step": 13200 }, { "epoch": 4.402935290193462, "ref_ce_loss": 0.0452951155602932, "step": 13200 }, { "epoch": 4.406270847231488, "loss": 0.4201, "step": 13210 }, { "epoch": 4.406270847231488, "grad_norm": 3.6564624309539795, "step": 13210 }, { "epoch": 4.406270847231488, "learning_rate": 0.00013265909013347865, "step": 13210 }, { "epoch": 4.406270847231488, "loss": 0.6841806769371033, "step": 13210 }, { "ce_loss": 0.10823642462491989, "epoch": 4.406270847231488, "step": 13210 }, { "distill_loss": 0.18694089353084564, "epoch": 4.406270847231488, "step": 13210 }, { "epoch": 4.406270847231488, "ref_ce_loss": 0.0933864563703537, "step": 13210 }, { "epoch": 4.406270847231488, "loss": 0.3473528325557709, "step": 13210 }, { "ce_loss": 0.10555067658424377, "epoch": 4.406270847231488, "step": 13210 }, { "distill_loss": 0.13975630700588226, "epoch": 4.406270847231488, "step": 13210 }, { "epoch": 4.406270847231488, "ref_ce_loss": 0.050673503428697586, "step": 13210 }, { "epoch": 4.406270847231488, "loss": 0.6095040440559387, "step": 13210 }, { "ce_loss": 0.15313434600830078, "epoch": 4.406270847231488, "step": 13210 }, { "distill_loss": 0.1529056876897812, "epoch": 4.406270847231488, "step": 13210 }, { "epoch": 4.406270847231488, "ref_ce_loss": 0.07480410486459732, "step": 13210 }, { "epoch": 4.406270847231488, "loss": 0.48279547691345215, "step": 13210 }, { "ce_loss": 0.1067587211728096, "epoch": 4.406270847231488, "step": 13210 }, { "distill_loss": 0.18704378604888916, "epoch": 4.406270847231488, "step": 13210 }, { "epoch": 4.406270847231488, "ref_ce_loss": 0.09729069471359253, "step": 13210 }, { "epoch": 4.409606404269513, "loss": 0.4266, "step": 13220 }, { "epoch": 4.409606404269513, "grad_norm": 2.6186203956604004, "step": 13220 }, { "epoch": 4.409606404269513, "learning_rate": 0.00013245790272733307, "step": 13220 }, { "epoch": 4.409606404269513, "loss": 0.3716428279876709, "step": 13220 }, { "ce_loss": 0.07271506637334824, "epoch": 4.409606404269513, "step": 13220 }, { "distill_loss": 0.14049409329891205, "epoch": 4.409606404269513, "step": 13220 }, { "epoch": 4.409606404269513, "ref_ce_loss": 0.0727582648396492, "step": 13220 }, { "epoch": 4.409606404269513, "loss": 0.3323671519756317, "step": 13220 }, { "ce_loss": 0.08981771767139435, "epoch": 4.409606404269513, "step": 13220 }, { "distill_loss": 0.14637064933776855, "epoch": 4.409606404269513, "step": 13220 }, { "epoch": 4.409606404269513, "ref_ce_loss": 0.06951261311769485, "step": 13220 }, { "epoch": 4.409606404269513, "loss": 0.38633036613464355, "step": 13220 }, { "ce_loss": 0.10376311838626862, "epoch": 4.409606404269513, "step": 13220 }, { "distill_loss": 0.1843695342540741, "epoch": 4.409606404269513, "step": 13220 }, { "epoch": 4.409606404269513, "ref_ce_loss": 0.068430595099926, "step": 13220 }, { "epoch": 4.409606404269513, "loss": 0.3019222319126129, "step": 13220 }, { "ce_loss": 0.015285542234778404, "epoch": 4.409606404269513, "step": 13220 }, { "distill_loss": 0.09861694276332855, "epoch": 4.409606404269513, "step": 13220 }, { "epoch": 4.409606404269513, "ref_ce_loss": 0.06827165186405182, "step": 13220 }, { "epoch": 4.4129419613075385, "loss": 0.4154, "step": 13230 }, { "epoch": 4.4129419613075385, "grad_norm": 2.5071351528167725, "step": 13230 }, { "epoch": 4.4129419613075385, "learning_rate": 0.00013225674731105318, "step": 13230 }, { "epoch": 4.4129419613075385, "loss": 0.5132056474685669, "step": 13230 }, { "ce_loss": 0.17748892307281494, "epoch": 4.4129419613075385, "step": 13230 }, { "distill_loss": 0.16994822025299072, "epoch": 4.4129419613075385, "step": 13230 }, { "epoch": 4.4129419613075385, "ref_ce_loss": 0.08495409041643143, "step": 13230 }, { "epoch": 4.4129419613075385, "loss": 0.34527939558029175, "step": 13230 }, { "ce_loss": 0.10870076715946198, "epoch": 4.4129419613075385, "step": 13230 }, { "distill_loss": 0.1736239790916443, "epoch": 4.4129419613075385, "step": 13230 }, { "epoch": 4.4129419613075385, "ref_ce_loss": 0.0629189983010292, "step": 13230 }, { "epoch": 4.4129419613075385, "loss": 0.7003483176231384, "step": 13230 }, { "ce_loss": 0.24649113416671753, "epoch": 4.4129419613075385, "step": 13230 }, { "distill_loss": 0.21717987954616547, "epoch": 4.4129419613075385, "step": 13230 }, { "epoch": 4.4129419613075385, "ref_ce_loss": 0.10224221646785736, "step": 13230 }, { "epoch": 4.4129419613075385, "loss": 0.4945812523365021, "step": 13230 }, { "ce_loss": 0.11560438573360443, "epoch": 4.4129419613075385, "step": 13230 }, { "distill_loss": 0.13503828644752502, "epoch": 4.4129419613075385, "step": 13230 }, { "epoch": 4.4129419613075385, "ref_ce_loss": 0.10286214202642441, "step": 13230 }, { "epoch": 4.416277518345564, "loss": 0.4726, "step": 13240 }, { "epoch": 4.416277518345564, "grad_norm": 4.089062690734863, "step": 13240 }, { "epoch": 4.416277518345564, "learning_rate": 0.00013205562425146696, "step": 13240 }, { "epoch": 4.416277518345564, "loss": 0.3657887578010559, "step": 13240 }, { "ce_loss": 0.08184022456407547, "epoch": 4.416277518345564, "step": 13240 }, { "distill_loss": 0.1520671248435974, "epoch": 4.416277518345564, "step": 13240 }, { "epoch": 4.416277518345564, "ref_ce_loss": 0.1031172052025795, "step": 13240 }, { "epoch": 4.416277518345564, "loss": 0.29702356457710266, "step": 13240 }, { "ce_loss": 0.09236989170312881, "epoch": 4.416277518345564, "step": 13240 }, { "distill_loss": 0.12858924269676208, "epoch": 4.416277518345564, "step": 13240 }, { "epoch": 4.416277518345564, "ref_ce_loss": 0.048820894211530685, "step": 13240 }, { "epoch": 4.416277518345564, "loss": 0.6049207448959351, "step": 13240 }, { "ce_loss": 0.07722629606723785, "epoch": 4.416277518345564, "step": 13240 }, { "distill_loss": 0.10887705534696579, "epoch": 4.416277518345564, "step": 13240 }, { "epoch": 4.416277518345564, "ref_ce_loss": 0.06840761005878448, "step": 13240 }, { "epoch": 4.416277518345564, "loss": 0.37058472633361816, "step": 13240 }, { "ce_loss": 0.10921172052621841, "epoch": 4.416277518345564, "step": 13240 }, { "distill_loss": 0.13606517016887665, "epoch": 4.416277518345564, "step": 13240 }, { "epoch": 4.416277518345564, "ref_ce_loss": 0.12518516182899475, "step": 13240 }, { "epoch": 4.419613075383589, "loss": 0.4452, "step": 13250 }, { "epoch": 4.419613075383589, "grad_norm": 2.7039458751678467, "step": 13250 }, { "epoch": 4.419613075383589, "learning_rate": 0.00013185453391534365, "step": 13250 }, { "epoch": 4.419613075383589, "loss": 0.45535069704055786, "step": 13250 }, { "ce_loss": 0.1358918398618698, "epoch": 4.419613075383589, "step": 13250 }, { "distill_loss": 0.25594010949134827, "epoch": 4.419613075383589, "step": 13250 }, { "epoch": 4.419613075383589, "ref_ce_loss": 0.06339307874441147, "step": 13250 }, { "epoch": 4.419613075383589, "loss": 0.4872303009033203, "step": 13250 }, { "ce_loss": 0.11887913197278976, "epoch": 4.419613075383589, "step": 13250 }, { "distill_loss": 0.20435214042663574, "epoch": 4.419613075383589, "step": 13250 }, { "epoch": 4.419613075383589, "ref_ce_loss": 0.07590153813362122, "step": 13250 }, { "epoch": 4.419613075383589, "loss": 0.286593496799469, "step": 13250 }, { "ce_loss": 0.09262045472860336, "epoch": 4.419613075383589, "step": 13250 }, { "distill_loss": 0.13021138310432434, "epoch": 4.419613075383589, "step": 13250 }, { "epoch": 4.419613075383589, "ref_ce_loss": 0.04283803701400757, "step": 13250 }, { "epoch": 4.419613075383589, "loss": 0.3654177486896515, "step": 13250 }, { "ce_loss": 0.11240356415510178, "epoch": 4.419613075383589, "step": 13250 }, { "distill_loss": 0.1893204152584076, "epoch": 4.419613075383589, "step": 13250 }, { "epoch": 4.419613075383589, "ref_ce_loss": 0.06355581432580948, "step": 13250 }, { "epoch": 4.4229486324216145, "loss": 0.4019, "step": 13260 }, { "epoch": 4.4229486324216145, "grad_norm": 3.225231885910034, "step": 13260 }, { "epoch": 4.4229486324216145, "learning_rate": 0.00013165347666939275, "step": 13260 }, { "epoch": 4.4229486324216145, "loss": 0.5111039876937866, "step": 13260 }, { "ce_loss": 0.17488041520118713, "epoch": 4.4229486324216145, "step": 13260 }, { "distill_loss": 0.2363886833190918, "epoch": 4.4229486324216145, "step": 13260 }, { "epoch": 4.4229486324216145, "ref_ce_loss": 0.09943842887878418, "step": 13260 }, { "epoch": 4.4229486324216145, "loss": 0.3635881245136261, "step": 13260 }, { "ce_loss": 0.12603534758090973, "epoch": 4.4229486324216145, "step": 13260 }, { "distill_loss": 0.13557103276252747, "epoch": 4.4229486324216145, "step": 13260 }, { "epoch": 4.4229486324216145, "ref_ce_loss": 0.0772860124707222, "step": 13260 }, { "epoch": 4.4229486324216145, "loss": 0.366436630487442, "step": 13260 }, { "ce_loss": 0.034161556512117386, "epoch": 4.4229486324216145, "step": 13260 }, { "distill_loss": 0.14038299024105072, "epoch": 4.4229486324216145, "step": 13260 }, { "epoch": 4.4229486324216145, "ref_ce_loss": 0.08084021508693695, "step": 13260 }, { "epoch": 4.4229486324216145, "loss": 0.3443140387535095, "step": 13260 }, { "ce_loss": 0.10151468962430954, "epoch": 4.4229486324216145, "step": 13260 }, { "distill_loss": 0.13597281277179718, "epoch": 4.4229486324216145, "step": 13260 }, { "epoch": 4.4229486324216145, "ref_ce_loss": 0.07605358213186264, "step": 13260 }, { "epoch": 4.42628418945964, "loss": 0.4883, "step": 13270 }, { "epoch": 4.42628418945964, "grad_norm": 2.648609161376953, "step": 13270 }, { "epoch": 4.42628418945964, "learning_rate": 0.00013145245288026319, "step": 13270 }, { "epoch": 4.42628418945964, "loss": 0.4274841547012329, "step": 13270 }, { "ce_loss": 0.05348929390311241, "epoch": 4.42628418945964, "step": 13270 }, { "distill_loss": 0.1469852775335312, "epoch": 4.42628418945964, "step": 13270 }, { "epoch": 4.42628418945964, "ref_ce_loss": 0.09373314678668976, "step": 13270 }, { "epoch": 4.42628418945964, "loss": 0.9421871900558472, "step": 13270 }, { "ce_loss": 0.11481419950723648, "epoch": 4.42628418945964, "step": 13270 }, { "distill_loss": 0.17199678719043732, "epoch": 4.42628418945964, "step": 13270 }, { "epoch": 4.42628418945964, "ref_ce_loss": 0.13030129671096802, "step": 13270 }, { "epoch": 4.42628418945964, "loss": 0.5865803956985474, "step": 13270 }, { "ce_loss": 0.15226690471172333, "epoch": 4.42628418945964, "step": 13270 }, { "distill_loss": 0.20426751673221588, "epoch": 4.42628418945964, "step": 13270 }, { "epoch": 4.42628418945964, "ref_ce_loss": 0.08978847414255142, "step": 13270 }, { "epoch": 4.42628418945964, "loss": 0.41231030225753784, "step": 13270 }, { "ce_loss": 0.1027587354183197, "epoch": 4.42628418945964, "step": 13270 }, { "distill_loss": 0.09782750904560089, "epoch": 4.42628418945964, "step": 13270 }, { "epoch": 4.42628418945964, "ref_ce_loss": 0.0866297110915184, "step": 13270 }, { "epoch": 4.429619746497665, "loss": 0.4565, "step": 13280 }, { "epoch": 4.429619746497665, "grad_norm": 2.3905386924743652, "step": 13280 }, { "epoch": 4.429619746497665, "learning_rate": 0.0001312514629145432, "step": 13280 }, { "epoch": 4.429619746497665, "loss": 0.3989601731300354, "step": 13280 }, { "ce_loss": 0.08613447844982147, "epoch": 4.429619746497665, "step": 13280 }, { "distill_loss": 0.13968902826309204, "epoch": 4.429619746497665, "step": 13280 }, { "epoch": 4.429619746497665, "ref_ce_loss": 0.08468887954950333, "step": 13280 }, { "epoch": 4.429619746497665, "loss": 0.4234456419944763, "step": 13280 }, { "ce_loss": 0.11148975044488907, "epoch": 4.429619746497665, "step": 13280 }, { "distill_loss": 0.15834802389144897, "epoch": 4.429619746497665, "step": 13280 }, { "epoch": 4.429619746497665, "ref_ce_loss": 0.10697125643491745, "step": 13280 }, { "epoch": 4.429619746497665, "loss": 0.32392358779907227, "step": 13280 }, { "ce_loss": 0.06255452334880829, "epoch": 4.429619746497665, "step": 13280 }, { "distill_loss": 0.14786508679389954, "epoch": 4.429619746497665, "step": 13280 }, { "epoch": 4.429619746497665, "ref_ce_loss": 0.07415209710597992, "step": 13280 }, { "epoch": 4.429619746497665, "loss": 0.47612956166267395, "step": 13280 }, { "ce_loss": 0.16316884756088257, "epoch": 4.429619746497665, "step": 13280 }, { "distill_loss": 0.1871202141046524, "epoch": 4.429619746497665, "step": 13280 }, { "epoch": 4.429619746497665, "ref_ce_loss": 0.10071472823619843, "step": 13280 }, { "epoch": 4.432955303535691, "loss": 0.4265, "step": 13290 }, { "epoch": 4.432955303535691, "grad_norm": 2.2215828895568848, "step": 13290 }, { "epoch": 4.432955303535691, "learning_rate": 0.00013105050713875922, "step": 13290 }, { "epoch": 4.432955303535691, "loss": 0.5471583604812622, "step": 13290 }, { "ce_loss": 0.11144928634166718, "epoch": 4.432955303535691, "step": 13290 }, { "distill_loss": 0.16252776980400085, "epoch": 4.432955303535691, "step": 13290 }, { "epoch": 4.432955303535691, "ref_ce_loss": 0.06755536794662476, "step": 13290 }, { "epoch": 4.432955303535691, "loss": 0.2610267102718353, "step": 13290 }, { "ce_loss": 0.09459121525287628, "epoch": 4.432955303535691, "step": 13290 }, { "distill_loss": 0.09221311658620834, "epoch": 4.432955303535691, "step": 13290 }, { "epoch": 4.432955303535691, "ref_ce_loss": 0.07398366928100586, "step": 13290 }, { "epoch": 4.432955303535691, "loss": 0.5348576307296753, "step": 13290 }, { "ce_loss": 0.0725354477763176, "epoch": 4.432955303535691, "step": 13290 }, { "distill_loss": 0.13648444414138794, "epoch": 4.432955303535691, "step": 13290 }, { "epoch": 4.432955303535691, "ref_ce_loss": 0.10237973183393478, "step": 13290 }, { "epoch": 4.432955303535691, "loss": 0.33219125866889954, "step": 13290 }, { "ce_loss": 0.11223422735929489, "epoch": 4.432955303535691, "step": 13290 }, { "distill_loss": 0.15365713834762573, "epoch": 4.432955303535691, "step": 13290 }, { "epoch": 4.432955303535691, "ref_ce_loss": 0.06554263085126877, "step": 13290 }, { "epoch": 4.436290860573716, "loss": 0.4689, "step": 13300 }, { "epoch": 4.436290860573716, "grad_norm": 1.8762089014053345, "step": 13300 }, { "epoch": 4.436290860573716, "learning_rate": 0.00013084958591937519, "step": 13300 }, { "epoch": 4.436290860573716, "loss": 0.3735318183898926, "step": 13300 }, { "ce_loss": 0.05010557547211647, "epoch": 4.436290860573716, "step": 13300 }, { "distill_loss": 0.19283129274845123, "epoch": 4.436290860573716, "step": 13300 }, { "epoch": 4.436290860573716, "ref_ce_loss": 0.07497648149728775, "step": 13300 }, { "epoch": 4.436290860573716, "loss": 0.4607619643211365, "step": 13300 }, { "ce_loss": 0.1650754064321518, "epoch": 4.436290860573716, "step": 13300 }, { "distill_loss": 0.13137763738632202, "epoch": 4.436290860573716, "step": 13300 }, { "epoch": 4.436290860573716, "ref_ce_loss": 0.0980844497680664, "step": 13300 }, { "epoch": 4.436290860573716, "loss": 0.36769622564315796, "step": 13300 }, { "ce_loss": 0.08664768934249878, "epoch": 4.436290860573716, "step": 13300 }, { "distill_loss": 0.11689133197069168, "epoch": 4.436290860573716, "step": 13300 }, { "epoch": 4.436290860573716, "ref_ce_loss": 0.0674903467297554, "step": 13300 }, { "epoch": 4.436290860573716, "loss": 0.41448602080345154, "step": 13300 }, { "ce_loss": 0.118854820728302, "epoch": 4.436290860573716, "step": 13300 }, { "distill_loss": 0.156342551112175, "epoch": 4.436290860573716, "step": 13300 }, { "epoch": 4.436290860573716, "ref_ce_loss": 0.08365678787231445, "step": 13300 }, { "epoch": 4.439626417611741, "loss": 0.4327, "step": 13310 }, { "epoch": 4.439626417611741, "grad_norm": 2.736565351486206, "step": 13310 }, { "epoch": 4.439626417611741, "learning_rate": 0.00013064869962279226, "step": 13310 }, { "epoch": 4.439626417611741, "loss": 0.47164031863212585, "step": 13310 }, { "ce_loss": 0.13483786582946777, "epoch": 4.439626417611741, "step": 13310 }, { "distill_loss": 0.20212770998477936, "epoch": 4.439626417611741, "step": 13310 }, { "epoch": 4.439626417611741, "ref_ce_loss": 0.09424245357513428, "step": 13310 }, { "epoch": 4.439626417611741, "loss": 0.4842525124549866, "step": 13310 }, { "ce_loss": 0.1050151139497757, "epoch": 4.439626417611741, "step": 13310 }, { "distill_loss": 0.12448824197053909, "epoch": 4.439626417611741, "step": 13310 }, { "epoch": 4.439626417611741, "ref_ce_loss": 0.10238910466432571, "step": 13310 }, { "epoch": 4.439626417611741, "loss": 0.3458286225795746, "step": 13310 }, { "ce_loss": 0.07503864914178848, "epoch": 4.439626417611741, "step": 13310 }, { "distill_loss": 0.14710615575313568, "epoch": 4.439626417611741, "step": 13310 }, { "epoch": 4.439626417611741, "ref_ce_loss": 0.08717601001262665, "step": 13310 }, { "epoch": 4.439626417611741, "loss": 0.4406740367412567, "step": 13310 }, { "ce_loss": 0.11572980880737305, "epoch": 4.439626417611741, "step": 13310 }, { "distill_loss": 0.1492346227169037, "epoch": 4.439626417611741, "step": 13310 }, { "epoch": 4.439626417611741, "ref_ce_loss": 0.07425940036773682, "step": 13310 }, { "epoch": 4.442961974649767, "loss": 0.4801, "step": 13320 }, { "epoch": 4.442961974649767, "grad_norm": 3.5770788192749023, "step": 13320 }, { "epoch": 4.442961974649767, "learning_rate": 0.00013044784861534773, "step": 13320 }, { "epoch": 4.442961974649767, "loss": 0.3639149069786072, "step": 13320 }, { "ce_loss": 0.06203765794634819, "epoch": 4.442961974649767, "step": 13320 }, { "distill_loss": 0.13513824343681335, "epoch": 4.442961974649767, "step": 13320 }, { "epoch": 4.442961974649767, "ref_ce_loss": 0.045562874525785446, "step": 13320 }, { "epoch": 4.442961974649767, "loss": 0.4618871808052063, "step": 13320 }, { "ce_loss": 0.07080277055501938, "epoch": 4.442961974649767, "step": 13320 }, { "distill_loss": 0.17618733644485474, "epoch": 4.442961974649767, "step": 13320 }, { "epoch": 4.442961974649767, "ref_ce_loss": 0.08771321922540665, "step": 13320 }, { "epoch": 4.442961974649767, "loss": 0.3985617756843567, "step": 13320 }, { "ce_loss": 0.11235746741294861, "epoch": 4.442961974649767, "step": 13320 }, { "distill_loss": 0.14052695035934448, "epoch": 4.442961974649767, "step": 13320 }, { "epoch": 4.442961974649767, "ref_ce_loss": 0.1089642271399498, "step": 13320 }, { "epoch": 4.442961974649767, "loss": 0.3456037640571594, "step": 13320 }, { "ce_loss": 0.03526025265455246, "epoch": 4.442961974649767, "step": 13320 }, { "distill_loss": 0.1443566381931305, "epoch": 4.442961974649767, "step": 13320 }, { "epoch": 4.442961974649767, "ref_ce_loss": 0.07900136709213257, "step": 13320 }, { "epoch": 4.446297531687792, "loss": 0.3867, "step": 13330 }, { "epoch": 4.446297531687792, "grad_norm": 2.4664969444274902, "step": 13330 }, { "epoch": 4.446297531687792, "learning_rate": 0.0001302470332633146, "step": 13330 }, { "epoch": 4.446297531687792, "loss": 0.6854321360588074, "step": 13330 }, { "ce_loss": 0.10115274041891098, "epoch": 4.446297531687792, "step": 13330 }, { "distill_loss": 0.22096025943756104, "epoch": 4.446297531687792, "step": 13330 }, { "epoch": 4.446297531687792, "ref_ce_loss": 0.10487420111894608, "step": 13330 }, { "epoch": 4.446297531687792, "loss": 0.35559317469596863, "step": 13330 }, { "ce_loss": 0.07464788109064102, "epoch": 4.446297531687792, "step": 13330 }, { "distill_loss": 0.15491950511932373, "epoch": 4.446297531687792, "step": 13330 }, { "epoch": 4.446297531687792, "ref_ce_loss": 0.09533894807100296, "step": 13330 }, { "epoch": 4.446297531687792, "loss": 0.6421335339546204, "step": 13330 }, { "ce_loss": 0.10751327127218246, "epoch": 4.446297531687792, "step": 13330 }, { "distill_loss": 0.21741746366024017, "epoch": 4.446297531687792, "step": 13330 }, { "epoch": 4.446297531687792, "ref_ce_loss": 0.09993693977594376, "step": 13330 }, { "epoch": 4.446297531687792, "loss": 0.5558599829673767, "step": 13330 }, { "ce_loss": 0.11573997139930725, "epoch": 4.446297531687792, "step": 13330 }, { "distill_loss": 0.1898089200258255, "epoch": 4.446297531687792, "step": 13330 }, { "epoch": 4.446297531687792, "ref_ce_loss": 0.1459214985370636, "step": 13330 }, { "epoch": 4.449633088725817, "loss": 0.487, "step": 13340 }, { "epoch": 4.449633088725817, "grad_norm": 3.9603376388549805, "step": 13340 }, { "epoch": 4.449633088725817, "learning_rate": 0.00013004625393290097, "step": 13340 }, { "epoch": 4.449633088725817, "loss": 0.3406130075454712, "step": 13340 }, { "ce_loss": 0.09222155809402466, "epoch": 4.449633088725817, "step": 13340 }, { "distill_loss": 0.15992435812950134, "epoch": 4.449633088725817, "step": 13340 }, { "epoch": 4.449633088725817, "ref_ce_loss": 0.055970415472984314, "step": 13340 }, { "epoch": 4.449633088725817, "loss": 0.27611514925956726, "step": 13340 }, { "ce_loss": 0.048204366117715836, "epoch": 4.449633088725817, "step": 13340 }, { "distill_loss": 0.10166335850954056, "epoch": 4.449633088725817, "step": 13340 }, { "epoch": 4.449633088725817, "ref_ce_loss": 0.0979592576622963, "step": 13340 }, { "epoch": 4.449633088725817, "loss": 0.38644808530807495, "step": 13340 }, { "ce_loss": 0.07723668962717056, "epoch": 4.449633088725817, "step": 13340 }, { "distill_loss": 0.24085360765457153, "epoch": 4.449633088725817, "step": 13340 }, { "epoch": 4.449633088725817, "ref_ce_loss": 0.06818073242902756, "step": 13340 }, { "epoch": 4.449633088725817, "loss": 0.484552800655365, "step": 13340 }, { "ce_loss": 0.14864076673984528, "epoch": 4.449633088725817, "step": 13340 }, { "distill_loss": 0.18420973420143127, "epoch": 4.449633088725817, "step": 13340 }, { "epoch": 4.449633088725817, "ref_ce_loss": 0.15143580734729767, "step": 13340 }, { "epoch": 4.452968645763843, "loss": 0.4362, "step": 13350 }, { "epoch": 4.452968645763843, "grad_norm": 3.3409905433654785, "step": 13350 }, { "epoch": 4.452968645763843, "learning_rate": 0.000129845510990249, "step": 13350 }, { "epoch": 4.452968645763843, "loss": 0.5213329792022705, "step": 13350 }, { "ce_loss": 0.17185841500759125, "epoch": 4.452968645763843, "step": 13350 }, { "distill_loss": 0.19801545143127441, "epoch": 4.452968645763843, "step": 13350 }, { "epoch": 4.452968645763843, "ref_ce_loss": 0.09971991181373596, "step": 13350 }, { "epoch": 4.452968645763843, "loss": 0.49981921911239624, "step": 13350 }, { "ce_loss": 0.06869068741798401, "epoch": 4.452968645763843, "step": 13350 }, { "distill_loss": 0.10246706008911133, "epoch": 4.452968645763843, "step": 13350 }, { "epoch": 4.452968645763843, "ref_ce_loss": 0.05476094037294388, "step": 13350 }, { "epoch": 4.452968645763843, "loss": 0.45160943269729614, "step": 13350 }, { "ce_loss": 0.12725037336349487, "epoch": 4.452968645763843, "step": 13350 }, { "distill_loss": 0.2022404819726944, "epoch": 4.452968645763843, "step": 13350 }, { "epoch": 4.452968645763843, "ref_ce_loss": 0.07778077572584152, "step": 13350 }, { "epoch": 4.452968645763843, "loss": 0.3159412741661072, "step": 13350 }, { "ce_loss": 0.11092124879360199, "epoch": 4.452968645763843, "step": 13350 }, { "distill_loss": 0.13240209221839905, "epoch": 4.452968645763843, "step": 13350 }, { "epoch": 4.452968645763843, "ref_ce_loss": 0.07244209945201874, "step": 13350 }, { "epoch": 4.456304202801868, "loss": 0.4877, "step": 13360 }, { "epoch": 4.456304202801868, "grad_norm": 2.414829730987549, "step": 13360 }, { "epoch": 4.456304202801868, "learning_rate": 0.0001296448048014347, "step": 13360 }, { "epoch": 4.456304202801868, "loss": 0.780661940574646, "step": 13360 }, { "ce_loss": 0.08376435935497284, "epoch": 4.456304202801868, "step": 13360 }, { "distill_loss": 0.21675555408000946, "epoch": 4.456304202801868, "step": 13360 }, { "epoch": 4.456304202801868, "ref_ce_loss": 0.1165047362446785, "step": 13360 }, { "epoch": 4.456304202801868, "loss": 0.41870665550231934, "step": 13360 }, { "ce_loss": 0.08456767350435257, "epoch": 4.456304202801868, "step": 13360 }, { "distill_loss": 0.10734406113624573, "epoch": 4.456304202801868, "step": 13360 }, { "epoch": 4.456304202801868, "ref_ce_loss": 0.07087530195713043, "step": 13360 }, { "epoch": 4.456304202801868, "loss": 0.2585833668708801, "step": 13360 }, { "ce_loss": 0.02313615381717682, "epoch": 4.456304202801868, "step": 13360 }, { "distill_loss": 0.07941682636737823, "epoch": 4.456304202801868, "step": 13360 }, { "epoch": 4.456304202801868, "ref_ce_loss": 0.04488234594464302, "step": 13360 }, { "epoch": 4.456304202801868, "loss": 0.2585996985435486, "step": 13360 }, { "ce_loss": 0.06306672841310501, "epoch": 4.456304202801868, "step": 13360 }, { "distill_loss": 0.11801082640886307, "epoch": 4.456304202801868, "step": 13360 }, { "epoch": 4.456304202801868, "ref_ce_loss": 0.07743985950946808, "step": 13360 }, { "epoch": 4.459639759839893, "loss": 0.4529, "step": 13370 }, { "epoch": 4.459639759839893, "grad_norm": 2.287713050842285, "step": 13370 }, { "epoch": 4.459639759839893, "learning_rate": 0.00012944413573246698, "step": 13370 }, { "epoch": 4.459639759839893, "loss": 0.3657965064048767, "step": 13370 }, { "ce_loss": 0.0648331418633461, "epoch": 4.459639759839893, "step": 13370 }, { "distill_loss": 0.16222751140594482, "epoch": 4.459639759839893, "step": 13370 }, { "epoch": 4.459639759839893, "ref_ce_loss": 0.06190655007958412, "step": 13370 }, { "epoch": 4.459639759839893, "loss": 0.3106033205986023, "step": 13370 }, { "ce_loss": 0.09094034135341644, "epoch": 4.459639759839893, "step": 13370 }, { "distill_loss": 0.1325746476650238, "epoch": 4.459639759839893, "step": 13370 }, { "epoch": 4.459639759839893, "ref_ce_loss": 0.08697564899921417, "step": 13370 }, { "epoch": 4.459639759839893, "loss": 0.20381583273410797, "step": 13370 }, { "ce_loss": 0.019039664417505264, "epoch": 4.459639759839893, "step": 13370 }, { "distill_loss": 0.08726243674755096, "epoch": 4.459639759839893, "step": 13370 }, { "epoch": 4.459639759839893, "ref_ce_loss": 0.0972718819975853, "step": 13370 }, { "epoch": 4.459639759839893, "loss": 0.6112961769104004, "step": 13370 }, { "ce_loss": 0.04354558512568474, "epoch": 4.459639759839893, "step": 13370 }, { "distill_loss": 0.24787333607673645, "epoch": 4.459639759839893, "step": 13370 }, { "epoch": 4.459639759839893, "ref_ce_loss": 0.10577059537172318, "step": 13370 }, { "epoch": 4.462975316877919, "loss": 0.4255, "step": 13380 }, { "epoch": 4.462975316877919, "grad_norm": 2.6293230056762695, "step": 13380 }, { "epoch": 4.462975316877919, "learning_rate": 0.000129243504149287, "step": 13380 }, { "epoch": 4.462975316877919, "loss": 0.19177551567554474, "step": 13380 }, { "ce_loss": 0.020500704646110535, "epoch": 4.462975316877919, "step": 13380 }, { "distill_loss": 0.09150394052267075, "epoch": 4.462975316877919, "step": 13380 }, { "epoch": 4.462975316877919, "ref_ce_loss": 0.051849473267793655, "step": 13380 }, { "epoch": 4.462975316877919, "loss": 0.29290539026260376, "step": 13380 }, { "ce_loss": 0.09419353306293488, "epoch": 4.462975316877919, "step": 13380 }, { "distill_loss": 0.11655114591121674, "epoch": 4.462975316877919, "step": 13380 }, { "epoch": 4.462975316877919, "ref_ce_loss": 0.08180296421051025, "step": 13380 }, { "epoch": 4.462975316877919, "loss": 0.8274011611938477, "step": 13380 }, { "ce_loss": 0.12826131284236908, "epoch": 4.462975316877919, "step": 13380 }, { "distill_loss": 0.15427960455417633, "epoch": 4.462975316877919, "step": 13380 }, { "epoch": 4.462975316877919, "ref_ce_loss": 0.1055775135755539, "step": 13380 }, { "epoch": 4.462975316877919, "loss": 0.33117276430130005, "step": 13380 }, { "ce_loss": 0.05304085463285446, "epoch": 4.462975316877919, "step": 13380 }, { "distill_loss": 0.12336459755897522, "epoch": 4.462975316877919, "step": 13380 }, { "epoch": 4.462975316877919, "ref_ce_loss": 0.07321083545684814, "step": 13380 }, { "epoch": 4.466310873915944, "loss": 0.4772, "step": 13390 }, { "epoch": 4.466310873915944, "grad_norm": 3.265866994857788, "step": 13390 }, { "epoch": 4.466310873915944, "learning_rate": 0.00012904291041776776, "step": 13390 }, { "epoch": 4.466310873915944, "loss": 0.33649423718452454, "step": 13390 }, { "ce_loss": 0.05174952372908592, "epoch": 4.466310873915944, "step": 13390 }, { "distill_loss": 0.18453219532966614, "epoch": 4.466310873915944, "step": 13390 }, { "epoch": 4.466310873915944, "ref_ce_loss": 0.10008276998996735, "step": 13390 }, { "epoch": 4.466310873915944, "loss": 0.37133169174194336, "step": 13390 }, { "ce_loss": 0.1038256585597992, "epoch": 4.466310873915944, "step": 13390 }, { "distill_loss": 0.1483224332332611, "epoch": 4.466310873915944, "step": 13390 }, { "epoch": 4.466310873915944, "ref_ce_loss": 0.09805310517549515, "step": 13390 }, { "epoch": 4.466310873915944, "loss": 0.7990421056747437, "step": 13390 }, { "ce_loss": 0.08086645603179932, "epoch": 4.466310873915944, "step": 13390 }, { "distill_loss": 0.2780746817588806, "epoch": 4.466310873915944, "step": 13390 }, { "epoch": 4.466310873915944, "ref_ce_loss": 0.13248445093631744, "step": 13390 }, { "epoch": 4.466310873915944, "loss": 0.6483367085456848, "step": 13390 }, { "ce_loss": 0.17032165825366974, "epoch": 4.466310873915944, "step": 13390 }, { "distill_loss": 0.3139309883117676, "epoch": 4.466310873915944, "step": 13390 }, { "epoch": 4.466310873915944, "ref_ce_loss": 0.11699211597442627, "step": 13390 }, { "epoch": 4.469646430953969, "loss": 0.4952, "step": 13400 }, { "epoch": 4.469646430953969, "grad_norm": 2.7196404933929443, "step": 13400 }, { "epoch": 4.469646430953969, "learning_rate": 0.000128842354903713, "step": 13400 }, { "epoch": 4.469646430953969, "loss": 0.26788756251335144, "step": 13400 }, { "ce_loss": 0.04564729705452919, "epoch": 4.469646430953969, "step": 13400 }, { "distill_loss": 0.10269252955913544, "epoch": 4.469646430953969, "step": 13400 }, { "epoch": 4.469646430953969, "ref_ce_loss": 0.07589396834373474, "step": 13400 }, { "epoch": 4.469646430953969, "loss": 0.2653442919254303, "step": 13400 }, { "ce_loss": 0.04288773238658905, "epoch": 4.469646430953969, "step": 13400 }, { "distill_loss": 0.098898746073246, "epoch": 4.469646430953969, "step": 13400 }, { "epoch": 4.469646430953969, "ref_ce_loss": 0.07052404433488846, "step": 13400 }, { "epoch": 4.469646430953969, "loss": 0.3910791575908661, "step": 13400 }, { "ce_loss": 0.08472683280706406, "epoch": 4.469646430953969, "step": 13400 }, { "distill_loss": 0.17124629020690918, "epoch": 4.469646430953969, "step": 13400 }, { "epoch": 4.469646430953969, "ref_ce_loss": 0.08802192658185959, "step": 13400 }, { "epoch": 4.469646430953969, "loss": 0.4675942659378052, "step": 13400 }, { "ce_loss": 0.06608907133340836, "epoch": 4.469646430953969, "step": 13400 }, { "distill_loss": 0.2451399713754654, "epoch": 4.469646430953969, "step": 13400 }, { "epoch": 4.469646430953969, "ref_ce_loss": 0.07252366840839386, "step": 13400 }, { "epoch": 4.472981987991995, "loss": 0.5049, "step": 13410 }, { "epoch": 4.472981987991995, "grad_norm": 3.4293053150177, "step": 13410 }, { "epoch": 4.472981987991995, "learning_rate": 0.00012864183797285683, "step": 13410 }, { "epoch": 4.472981987991995, "loss": 0.28266942501068115, "step": 13410 }, { "ce_loss": 0.08216257393360138, "epoch": 4.472981987991995, "step": 13410 }, { "distill_loss": 0.13507524132728577, "epoch": 4.472981987991995, "step": 13410 }, { "epoch": 4.472981987991995, "ref_ce_loss": 0.06497799605131149, "step": 13410 }, { "epoch": 4.472981987991995, "loss": 0.5329622030258179, "step": 13410 }, { "ce_loss": 0.18671882152557373, "epoch": 4.472981987991995, "step": 13410 }, { "distill_loss": 0.26308295130729675, "epoch": 4.472981987991995, "step": 13410 }, { "epoch": 4.472981987991995, "ref_ce_loss": 0.06053365767002106, "step": 13410 }, { "epoch": 4.472981987991995, "loss": 0.578106164932251, "step": 13410 }, { "ce_loss": 0.1150013655424118, "epoch": 4.472981987991995, "step": 13410 }, { "distill_loss": 0.20448864996433258, "epoch": 4.472981987991995, "step": 13410 }, { "epoch": 4.472981987991995, "ref_ce_loss": 0.09882330894470215, "step": 13410 }, { "epoch": 4.472981987991995, "loss": 0.46086007356643677, "step": 13410 }, { "ce_loss": 0.08865619450807571, "epoch": 4.472981987991995, "step": 13410 }, { "distill_loss": 0.2300991266965866, "epoch": 4.472981987991995, "step": 13410 }, { "epoch": 4.472981987991995, "ref_ce_loss": 0.07322938740253448, "step": 13410 }, { "epoch": 4.47631754503002, "loss": 0.4451, "step": 13420 }, { "epoch": 4.47631754503002, "grad_norm": 2.5083768367767334, "step": 13420 }, { "epoch": 4.47631754503002, "learning_rate": 0.00012844135999086315, "step": 13420 }, { "epoch": 4.47631754503002, "loss": 0.5072634220123291, "step": 13420 }, { "ce_loss": 0.13968051970005035, "epoch": 4.47631754503002, "step": 13420 }, { "distill_loss": 0.27628523111343384, "epoch": 4.47631754503002, "step": 13420 }, { "epoch": 4.47631754503002, "ref_ce_loss": 0.09119226783514023, "step": 13420 }, { "epoch": 4.47631754503002, "loss": 0.23070460557937622, "step": 13420 }, { "ce_loss": 0.0296455267816782, "epoch": 4.47631754503002, "step": 13420 }, { "distill_loss": 0.10874531418085098, "epoch": 4.47631754503002, "step": 13420 }, { "epoch": 4.47631754503002, "ref_ce_loss": 0.04925838112831116, "step": 13420 }, { "epoch": 4.47631754503002, "loss": 0.2778133749961853, "step": 13420 }, { "ce_loss": 0.04140105098485947, "epoch": 4.47631754503002, "step": 13420 }, { "distill_loss": 0.16100528836250305, "epoch": 4.47631754503002, "step": 13420 }, { "epoch": 4.47631754503002, "ref_ce_loss": 0.07529508322477341, "step": 13420 }, { "epoch": 4.47631754503002, "loss": 0.3053801655769348, "step": 13420 }, { "ce_loss": 0.0789996087551117, "epoch": 4.47631754503002, "step": 13420 }, { "distill_loss": 0.11323997378349304, "epoch": 4.47631754503002, "step": 13420 }, { "epoch": 4.47631754503002, "ref_ce_loss": 0.11304476112127304, "step": 13420 }, { "epoch": 4.4796531020680455, "loss": 0.4349, "step": 13430 }, { "epoch": 4.4796531020680455, "grad_norm": 3.8242762088775635, "step": 13430 }, { "epoch": 4.4796531020680455, "learning_rate": 0.00012824092132332466, "step": 13430 }, { "epoch": 4.4796531020680455, "loss": 0.5172803401947021, "step": 13430 }, { "ce_loss": 0.05954471230506897, "epoch": 4.4796531020680455, "step": 13430 }, { "distill_loss": 0.12819169461727142, "epoch": 4.4796531020680455, "step": 13430 }, { "epoch": 4.4796531020680455, "ref_ce_loss": 0.10265228897333145, "step": 13430 }, { "epoch": 4.4796531020680455, "loss": 0.28423821926116943, "step": 13430 }, { "ce_loss": 0.029237208887934685, "epoch": 4.4796531020680455, "step": 13430 }, { "distill_loss": 0.13489145040512085, "epoch": 4.4796531020680455, "step": 13430 }, { "epoch": 4.4796531020680455, "ref_ce_loss": 0.06749539822340012, "step": 13430 }, { "epoch": 4.4796531020680455, "loss": 0.37082672119140625, "step": 13430 }, { "ce_loss": 0.08649688214063644, "epoch": 4.4796531020680455, "step": 13430 }, { "distill_loss": 0.1623936891555786, "epoch": 4.4796531020680455, "step": 13430 }, { "epoch": 4.4796531020680455, "ref_ce_loss": 0.0803963840007782, "step": 13430 }, { "epoch": 4.4796531020680455, "loss": 0.48403871059417725, "step": 13430 }, { "ce_loss": 0.12726320326328278, "epoch": 4.4796531020680455, "step": 13430 }, { "distill_loss": 0.13687562942504883, "epoch": 4.4796531020680455, "step": 13430 }, { "epoch": 4.4796531020680455, "ref_ce_loss": 0.07841704040765762, "step": 13430 }, { "epoch": 4.482988659106071, "loss": 0.4811, "step": 13440 }, { "epoch": 4.482988659106071, "grad_norm": 2.50536847114563, "step": 13440 }, { "epoch": 4.482988659106071, "learning_rate": 0.0001280405223357624, "step": 13440 }, { "epoch": 4.482988659106071, "loss": 0.4957541227340698, "step": 13440 }, { "ce_loss": 0.10157718509435654, "epoch": 4.482988659106071, "step": 13440 }, { "distill_loss": 0.16703274846076965, "epoch": 4.482988659106071, "step": 13440 }, { "epoch": 4.482988659106071, "ref_ce_loss": 0.09913060814142227, "step": 13440 }, { "epoch": 4.482988659106071, "loss": 0.26849082112312317, "step": 13440 }, { "ce_loss": 0.06595831364393234, "epoch": 4.482988659106071, "step": 13440 }, { "distill_loss": 0.12347928434610367, "epoch": 4.482988659106071, "step": 13440 }, { "epoch": 4.482988659106071, "ref_ce_loss": 0.05139641463756561, "step": 13440 }, { "epoch": 4.482988659106071, "loss": 0.34890177845954895, "step": 13440 }, { "ce_loss": 0.01840231381356716, "epoch": 4.482988659106071, "step": 13440 }, { "distill_loss": 0.21284881234169006, "epoch": 4.482988659106071, "step": 13440 }, { "epoch": 4.482988659106071, "ref_ce_loss": 0.055508144199848175, "step": 13440 }, { "epoch": 4.482988659106071, "loss": 0.370951384305954, "step": 13440 }, { "ce_loss": 0.11548206210136414, "epoch": 4.482988659106071, "step": 13440 }, { "distill_loss": 0.17274267971515656, "epoch": 4.482988659106071, "step": 13440 }, { "epoch": 4.482988659106071, "ref_ce_loss": 0.08252322673797607, "step": 13440 }, { "epoch": 4.486324216144096, "loss": 0.4699, "step": 13450 }, { "epoch": 4.486324216144096, "grad_norm": 3.5319814682006836, "step": 13450 }, { "epoch": 4.486324216144096, "learning_rate": 0.0001278401633936251, "step": 13450 }, { "epoch": 4.486324216144096, "loss": 0.4721037745475769, "step": 13450 }, { "ce_loss": 0.06719991564750671, "epoch": 4.486324216144096, "step": 13450 }, { "distill_loss": 0.16518472135066986, "epoch": 4.486324216144096, "step": 13450 }, { "epoch": 4.486324216144096, "ref_ce_loss": 0.061171624809503555, "step": 13450 }, { "epoch": 4.486324216144096, "loss": 0.3494679927825928, "step": 13450 }, { "ce_loss": 0.06874405592679977, "epoch": 4.486324216144096, "step": 13450 }, { "distill_loss": 0.11198866367340088, "epoch": 4.486324216144096, "step": 13450 }, { "epoch": 4.486324216144096, "ref_ce_loss": 0.10032968968153, "step": 13450 }, { "epoch": 4.486324216144096, "loss": 0.500599205493927, "step": 13450 }, { "ce_loss": 0.09012137353420258, "epoch": 4.486324216144096, "step": 13450 }, { "distill_loss": 0.28814423084259033, "epoch": 4.486324216144096, "step": 13450 }, { "epoch": 4.486324216144096, "ref_ce_loss": 0.08558844774961472, "step": 13450 }, { "epoch": 4.486324216144096, "loss": 0.6183328628540039, "step": 13450 }, { "ce_loss": 0.12911294400691986, "epoch": 4.486324216144096, "step": 13450 }, { "distill_loss": 0.28603270649909973, "epoch": 4.486324216144096, "step": 13450 }, { "epoch": 4.486324216144096, "ref_ce_loss": 0.10391736030578613, "step": 13450 }, { "epoch": 4.4896597731821215, "loss": 0.4655, "step": 13460 }, { "epoch": 4.4896597731821215, "grad_norm": 3.1925013065338135, "step": 13460 }, { "epoch": 4.4896597731821215, "learning_rate": 0.0001276398448622884, "step": 13460 }, { "epoch": 4.4896597731821215, "loss": 0.7418397665023804, "step": 13460 }, { "ce_loss": 0.18634061515331268, "epoch": 4.4896597731821215, "step": 13460 }, { "distill_loss": 0.21090039610862732, "epoch": 4.4896597731821215, "step": 13460 }, { "epoch": 4.4896597731821215, "ref_ce_loss": 0.11397220939397812, "step": 13460 }, { "epoch": 4.4896597731821215, "loss": 0.41916927695274353, "step": 13460 }, { "ce_loss": 0.08889000862836838, "epoch": 4.4896597731821215, "step": 13460 }, { "distill_loss": 0.1984071582555771, "epoch": 4.4896597731821215, "step": 13460 }, { "epoch": 4.4896597731821215, "ref_ce_loss": 0.09343430399894714, "step": 13460 }, { "epoch": 4.4896597731821215, "loss": 0.656537652015686, "step": 13460 }, { "ce_loss": 0.19183559715747833, "epoch": 4.4896597731821215, "step": 13460 }, { "distill_loss": 0.20771309733390808, "epoch": 4.4896597731821215, "step": 13460 }, { "epoch": 4.4896597731821215, "ref_ce_loss": 0.09294997155666351, "step": 13460 }, { "epoch": 4.4896597731821215, "loss": 0.19820013642311096, "step": 13460 }, { "ce_loss": 0.028779538348317146, "epoch": 4.4896597731821215, "step": 13460 }, { "distill_loss": 0.11477810144424438, "epoch": 4.4896597731821215, "step": 13460 }, { "epoch": 4.4896597731821215, "ref_ce_loss": 0.05453171953558922, "step": 13460 }, { "epoch": 4.492995330220147, "loss": 0.4577, "step": 13470 }, { "epoch": 4.492995330220147, "grad_norm": 2.804248809814453, "step": 13470 }, { "epoch": 4.492995330220147, "learning_rate": 0.00012743956710705435, "step": 13470 }, { "epoch": 4.492995330220147, "loss": 0.45621517300605774, "step": 13470 }, { "ce_loss": 0.08536119759082794, "epoch": 4.492995330220147, "step": 13470 }, { "distill_loss": 0.1456269472837448, "epoch": 4.492995330220147, "step": 13470 }, { "epoch": 4.492995330220147, "ref_ce_loss": 0.0890783965587616, "step": 13470 }, { "epoch": 4.492995330220147, "loss": 0.292806476354599, "step": 13470 }, { "ce_loss": 0.0762527659535408, "epoch": 4.492995330220147, "step": 13470 }, { "distill_loss": 0.11498447507619858, "epoch": 4.492995330220147, "step": 13470 }, { "epoch": 4.492995330220147, "ref_ce_loss": 0.10128685086965561, "step": 13470 }, { "epoch": 4.492995330220147, "loss": 0.42637720704078674, "step": 13470 }, { "ce_loss": 0.07126825302839279, "epoch": 4.492995330220147, "step": 13470 }, { "distill_loss": 0.14693793654441833, "epoch": 4.492995330220147, "step": 13470 }, { "epoch": 4.492995330220147, "ref_ce_loss": 0.09059864282608032, "step": 13470 }, { "epoch": 4.492995330220147, "loss": 0.628233790397644, "step": 13470 }, { "ce_loss": 0.07562445104122162, "epoch": 4.492995330220147, "step": 13470 }, { "distill_loss": 0.15411150455474854, "epoch": 4.492995330220147, "step": 13470 }, { "epoch": 4.492995330220147, "ref_ce_loss": 0.08947544544935226, "step": 13470 }, { "epoch": 4.496330887258172, "loss": 0.436, "step": 13480 }, { "epoch": 4.496330887258172, "grad_norm": 3.4965808391571045, "step": 13480 }, { "epoch": 4.496330887258172, "learning_rate": 0.0001272393304931505, "step": 13480 }, { "epoch": 4.496330887258172, "loss": 0.5007912516593933, "step": 13480 }, { "ce_loss": 0.15746666491031647, "epoch": 4.496330887258172, "step": 13480 }, { "distill_loss": 0.1947905421257019, "epoch": 4.496330887258172, "step": 13480 }, { "epoch": 4.496330887258172, "ref_ce_loss": 0.11005079746246338, "step": 13480 }, { "epoch": 4.496330887258172, "loss": 0.3622717261314392, "step": 13480 }, { "ce_loss": 0.07799629122018814, "epoch": 4.496330887258172, "step": 13480 }, { "distill_loss": 0.1503952145576477, "epoch": 4.496330887258172, "step": 13480 }, { "epoch": 4.496330887258172, "ref_ce_loss": 0.06716569513082504, "step": 13480 }, { "epoch": 4.496330887258172, "loss": 0.5783126354217529, "step": 13480 }, { "ce_loss": 0.10873718559741974, "epoch": 4.496330887258172, "step": 13480 }, { "distill_loss": 0.10999981313943863, "epoch": 4.496330887258172, "step": 13480 }, { "epoch": 4.496330887258172, "ref_ce_loss": 0.11208885908126831, "step": 13480 }, { "epoch": 4.496330887258172, "loss": 0.3097704350948334, "step": 13480 }, { "ce_loss": 0.04218291491270065, "epoch": 4.496330887258172, "step": 13480 }, { "distill_loss": 0.14731380343437195, "epoch": 4.496330887258172, "step": 13480 }, { "epoch": 4.496330887258172, "ref_ce_loss": 0.07287292927503586, "step": 13480 }, { "epoch": 4.4996664442961976, "loss": 0.4298, "step": 13490 }, { "epoch": 4.4996664442961976, "grad_norm": 3.667353630065918, "step": 13490 }, { "epoch": 4.4996664442961976, "learning_rate": 0.0001270391353857295, "step": 13490 }, { "epoch": 4.4996664442961976, "loss": 0.19863076508045197, "step": 13490 }, { "ce_loss": 0.038019660860300064, "epoch": 4.4996664442961976, "step": 13490 }, { "distill_loss": 0.09817139059305191, "epoch": 4.4996664442961976, "step": 13490 }, { "epoch": 4.4996664442961976, "ref_ce_loss": 0.04153582826256752, "step": 13490 }, { "epoch": 4.4996664442961976, "loss": 0.42932868003845215, "step": 13490 }, { "ce_loss": 0.10347703844308853, "epoch": 4.4996664442961976, "step": 13490 }, { "distill_loss": 0.15128600597381592, "epoch": 4.4996664442961976, "step": 13490 }, { "epoch": 4.4996664442961976, "ref_ce_loss": 0.10648830235004425, "step": 13490 }, { "epoch": 4.4996664442961976, "loss": 0.805569052696228, "step": 13490 }, { "ce_loss": 0.06474862992763519, "epoch": 4.4996664442961976, "step": 13490 }, { "distill_loss": 0.15280646085739136, "epoch": 4.4996664442961976, "step": 13490 }, { "epoch": 4.4996664442961976, "ref_ce_loss": 0.13071759045124054, "step": 13490 }, { "epoch": 4.4996664442961976, "loss": 0.31424570083618164, "step": 13490 }, { "ce_loss": 0.06584881991147995, "epoch": 4.4996664442961976, "step": 13490 }, { "distill_loss": 0.09528250247240067, "epoch": 4.4996664442961976, "step": 13490 }, { "epoch": 4.4996664442961976, "ref_ce_loss": 0.10239201039075851, "step": 13490 }, { "epoch": 4.503002001334223, "loss": 0.4227, "step": 13500 }, { "epoch": 4.503002001334223, "grad_norm": 3.658917188644409, "step": 13500 }, { "epoch": 4.503002001334223, "learning_rate": 0.00012683898214986824, "step": 13500 }, { "epoch": 4.503002001334223, "loss": 0.3130754828453064, "step": 13500 }, { "ce_loss": 0.07128926366567612, "epoch": 4.503002001334223, "step": 13500 }, { "distill_loss": 0.08057375252246857, "epoch": 4.503002001334223, "step": 13500 }, { "epoch": 4.503002001334223, "ref_ce_loss": 0.06667958945035934, "step": 13500 }, { "epoch": 4.503002001334223, "loss": 0.42011693120002747, "step": 13500 }, { "ce_loss": 0.10284280776977539, "epoch": 4.503002001334223, "step": 13500 }, { "distill_loss": 0.215025395154953, "epoch": 4.503002001334223, "step": 13500 }, { "epoch": 4.503002001334223, "ref_ce_loss": 0.10161525756120682, "step": 13500 }, { "epoch": 4.503002001334223, "loss": 0.3984660506248474, "step": 13500 }, { "ce_loss": 0.06674403697252274, "epoch": 4.503002001334223, "step": 13500 }, { "distill_loss": 0.2244340181350708, "epoch": 4.503002001334223, "step": 13500 }, { "epoch": 4.503002001334223, "ref_ce_loss": 0.08058111369609833, "step": 13500 }, { "epoch": 4.503002001334223, "loss": 0.4775744080543518, "step": 13500 }, { "ce_loss": 0.15970994532108307, "epoch": 4.503002001334223, "step": 13500 }, { "distill_loss": 0.12742125988006592, "epoch": 4.503002001334223, "step": 13500 }, { "epoch": 4.503002001334223, "ref_ce_loss": 0.11462785303592682, "step": 13500 }, { "epoch": 4.506337558372248, "loss": 0.4352, "step": 13510 }, { "epoch": 4.506337558372248, "grad_norm": 2.5688297748565674, "step": 13510 }, { "epoch": 4.506337558372248, "learning_rate": 0.00012663887115056723, "step": 13510 }, { "epoch": 4.506337558372248, "loss": 0.34181031584739685, "step": 13510 }, { "ce_loss": 0.10207465291023254, "epoch": 4.506337558372248, "step": 13510 }, { "distill_loss": 0.1244707852602005, "epoch": 4.506337558372248, "step": 13510 }, { "epoch": 4.506337558372248, "ref_ce_loss": 0.06944452971220016, "step": 13510 }, { "epoch": 4.506337558372248, "loss": 0.5946003794670105, "step": 13510 }, { "ce_loss": 0.14638981223106384, "epoch": 4.506337558372248, "step": 13510 }, { "distill_loss": 0.2600780725479126, "epoch": 4.506337558372248, "step": 13510 }, { "epoch": 4.506337558372248, "ref_ce_loss": 0.10639924556016922, "step": 13510 }, { "epoch": 4.506337558372248, "loss": 0.4521411657333374, "step": 13510 }, { "ce_loss": 0.11947055160999298, "epoch": 4.506337558372248, "step": 13510 }, { "distill_loss": 0.14773902297019958, "epoch": 4.506337558372248, "step": 13510 }, { "epoch": 4.506337558372248, "ref_ce_loss": 0.12681178748607635, "step": 13510 }, { "epoch": 4.506337558372248, "loss": 0.34964805841445923, "step": 13510 }, { "ce_loss": 0.13701759278774261, "epoch": 4.506337558372248, "step": 13510 }, { "distill_loss": 0.14266565442085266, "epoch": 4.506337558372248, "step": 13510 }, { "epoch": 4.506337558372248, "ref_ce_loss": 0.06979183107614517, "step": 13510 }, { "epoch": 4.509673115410274, "loss": 0.4043, "step": 13520 }, { "epoch": 4.509673115410274, "grad_norm": 2.665954828262329, "step": 13520 }, { "epoch": 4.509673115410274, "learning_rate": 0.00012643880275275005, "step": 13520 }, { "epoch": 4.509673115410274, "loss": 0.6819088459014893, "step": 13520 }, { "ce_loss": 0.2538369596004486, "epoch": 4.509673115410274, "step": 13520 }, { "distill_loss": 0.19348205626010895, "epoch": 4.509673115410274, "step": 13520 }, { "epoch": 4.509673115410274, "ref_ce_loss": 0.14676028490066528, "step": 13520 }, { "epoch": 4.509673115410274, "loss": 0.3843729496002197, "step": 13520 }, { "ce_loss": 0.09575576335191727, "epoch": 4.509673115410274, "step": 13520 }, { "distill_loss": 0.16320499777793884, "epoch": 4.509673115410274, "step": 13520 }, { "epoch": 4.509673115410274, "ref_ce_loss": 0.12518060207366943, "step": 13520 }, { "epoch": 4.509673115410274, "loss": 0.3843967318534851, "step": 13520 }, { "ce_loss": 0.09873102605342865, "epoch": 4.509673115410274, "step": 13520 }, { "distill_loss": 0.10267771035432816, "epoch": 4.509673115410274, "step": 13520 }, { "epoch": 4.509673115410274, "ref_ce_loss": 0.08693916350603104, "step": 13520 }, { "epoch": 4.509673115410274, "loss": 0.3445023000240326, "step": 13520 }, { "ce_loss": 0.06388503313064575, "epoch": 4.509673115410274, "step": 13520 }, { "distill_loss": 0.1321995109319687, "epoch": 4.509673115410274, "step": 13520 }, { "epoch": 4.509673115410274, "ref_ce_loss": 0.10116295516490936, "step": 13520 }, { "epoch": 4.513008672448299, "loss": 0.4466, "step": 13530 }, { "epoch": 4.513008672448299, "grad_norm": 3.3427317142486572, "step": 13530 }, { "epoch": 4.513008672448299, "learning_rate": 0.0001262387773212625, "step": 13530 }, { "epoch": 4.513008672448299, "loss": 0.261531263589859, "step": 13530 }, { "ce_loss": 0.06258751451969147, "epoch": 4.513008672448299, "step": 13530 }, { "distill_loss": 0.14119817316532135, "epoch": 4.513008672448299, "step": 13530 }, { "epoch": 4.513008672448299, "ref_ce_loss": 0.057690706104040146, "step": 13530 }, { "epoch": 4.513008672448299, "loss": 0.4364040791988373, "step": 13530 }, { "ce_loss": 0.03287011384963989, "epoch": 4.513008672448299, "step": 13530 }, { "distill_loss": 0.11824694275856018, "epoch": 4.513008672448299, "step": 13530 }, { "epoch": 4.513008672448299, "ref_ce_loss": 0.05607159063220024, "step": 13530 }, { "epoch": 4.513008672448299, "loss": 0.48048606514930725, "step": 13530 }, { "ce_loss": 0.05228961631655693, "epoch": 4.513008672448299, "step": 13530 }, { "distill_loss": 0.20537841320037842, "epoch": 4.513008672448299, "step": 13530 }, { "epoch": 4.513008672448299, "ref_ce_loss": 0.08688047528266907, "step": 13530 }, { "epoch": 4.513008672448299, "loss": 0.37082439661026, "step": 13530 }, { "ce_loss": 0.07386504113674164, "epoch": 4.513008672448299, "step": 13530 }, { "distill_loss": 0.09237289428710938, "epoch": 4.513008672448299, "step": 13530 }, { "epoch": 4.513008672448299, "ref_ce_loss": 0.0847601592540741, "step": 13530 }, { "epoch": 4.516344229486324, "loss": 0.4137, "step": 13540 }, { "epoch": 4.516344229486324, "grad_norm": 3.44813871383667, "step": 13540 }, { "epoch": 4.516344229486324, "learning_rate": 0.00012603879522087215, "step": 13540 }, { "epoch": 4.516344229486324, "loss": 0.4075620174407959, "step": 13540 }, { "ce_loss": 0.1053367629647255, "epoch": 4.516344229486324, "step": 13540 }, { "distill_loss": 0.15739335119724274, "epoch": 4.516344229486324, "step": 13540 }, { "epoch": 4.516344229486324, "ref_ce_loss": 0.10203933715820312, "step": 13540 }, { "epoch": 4.516344229486324, "loss": 0.6476325392723083, "step": 13540 }, { "ce_loss": 0.09174636751413345, "epoch": 4.516344229486324, "step": 13540 }, { "distill_loss": 0.130633145570755, "epoch": 4.516344229486324, "step": 13540 }, { "epoch": 4.516344229486324, "ref_ce_loss": 0.13086505234241486, "step": 13540 }, { "epoch": 4.516344229486324, "loss": 0.44802364706993103, "step": 13540 }, { "ce_loss": 0.1039741113781929, "epoch": 4.516344229486324, "step": 13540 }, { "distill_loss": 0.13582943379878998, "epoch": 4.516344229486324, "step": 13540 }, { "epoch": 4.516344229486324, "ref_ce_loss": 0.06499588489532471, "step": 13540 }, { "epoch": 4.516344229486324, "loss": 0.3249742090702057, "step": 13540 }, { "ce_loss": 0.11507577449083328, "epoch": 4.516344229486324, "step": 13540 }, { "distill_loss": 0.11589515954256058, "epoch": 4.516344229486324, "step": 13540 }, { "epoch": 4.516344229486324, "ref_ce_loss": 0.09377795457839966, "step": 13540 }, { "epoch": 4.51967978652435, "loss": 0.4084, "step": 13550 }, { "epoch": 4.51967978652435, "grad_norm": 2.4113292694091797, "step": 13550 }, { "epoch": 4.51967978652435, "learning_rate": 0.0001258388568162673, "step": 13550 }, { "epoch": 4.51967978652435, "loss": 0.46115991473197937, "step": 13550 }, { "ce_loss": 0.08020874857902527, "epoch": 4.51967978652435, "step": 13550 }, { "distill_loss": 0.12565740942955017, "epoch": 4.51967978652435, "step": 13550 }, { "epoch": 4.51967978652435, "ref_ce_loss": 0.11783842742443085, "step": 13550 }, { "epoch": 4.51967978652435, "loss": 0.3117343783378601, "step": 13550 }, { "ce_loss": 0.10067509114742279, "epoch": 4.51967978652435, "step": 13550 }, { "distill_loss": 0.12151607871055603, "epoch": 4.51967978652435, "step": 13550 }, { "epoch": 4.51967978652435, "ref_ce_loss": 0.054096437990665436, "step": 13550 }, { "epoch": 4.51967978652435, "loss": 0.41671937704086304, "step": 13550 }, { "ce_loss": 0.12779854238033295, "epoch": 4.51967978652435, "step": 13550 }, { "distill_loss": 0.14122280478477478, "epoch": 4.51967978652435, "step": 13550 }, { "epoch": 4.51967978652435, "ref_ce_loss": 0.09169165790081024, "step": 13550 }, { "epoch": 4.51967978652435, "loss": 0.4090811312198639, "step": 13550 }, { "ce_loss": 0.1314501315355301, "epoch": 4.51967978652435, "step": 13550 }, { "distill_loss": 0.16391001641750336, "epoch": 4.51967978652435, "step": 13550 }, { "epoch": 4.51967978652435, "ref_ce_loss": 0.08914811909198761, "step": 13550 }, { "epoch": 4.523015343562375, "loss": 0.4736, "step": 13560 }, { "epoch": 4.523015343562375, "grad_norm": 3.5108258724212646, "step": 13560 }, { "epoch": 4.523015343562375, "learning_rate": 0.00012563896247205685, "step": 13560 }, { "epoch": 4.523015343562375, "loss": 0.41970306634902954, "step": 13560 }, { "ce_loss": 0.08534153550863266, "epoch": 4.523015343562375, "step": 13560 }, { "distill_loss": 0.10883626341819763, "epoch": 4.523015343562375, "step": 13560 }, { "epoch": 4.523015343562375, "ref_ce_loss": 0.05522970110177994, "step": 13560 }, { "epoch": 4.523015343562375, "loss": 0.6811654567718506, "step": 13560 }, { "ce_loss": 0.2026192545890808, "epoch": 4.523015343562375, "step": 13560 }, { "distill_loss": 0.2263200879096985, "epoch": 4.523015343562375, "step": 13560 }, { "epoch": 4.523015343562375, "ref_ce_loss": 0.10827615857124329, "step": 13560 }, { "epoch": 4.523015343562375, "loss": 0.43313559889793396, "step": 13560 }, { "ce_loss": 0.15236859023571014, "epoch": 4.523015343562375, "step": 13560 }, { "distill_loss": 0.15981322526931763, "epoch": 4.523015343562375, "step": 13560 }, { "epoch": 4.523015343562375, "ref_ce_loss": 0.07809644192457199, "step": 13560 }, { "epoch": 4.523015343562375, "loss": 0.40003854036331177, "step": 13560 }, { "ce_loss": 0.051413632929325104, "epoch": 4.523015343562375, "step": 13560 }, { "distill_loss": 0.12792935967445374, "epoch": 4.523015343562375, "step": 13560 }, { "epoch": 4.523015343562375, "ref_ce_loss": 0.06233259662985802, "step": 13560 }, { "epoch": 4.5263509006004, "loss": 0.4263, "step": 13570 }, { "epoch": 4.5263509006004, "grad_norm": 2.6565046310424805, "step": 13570 }, { "epoch": 4.5263509006004, "learning_rate": 0.00012543911255276927, "step": 13570 }, { "epoch": 4.5263509006004, "loss": 0.6125295162200928, "step": 13570 }, { "ce_loss": 0.174873948097229, "epoch": 4.5263509006004, "step": 13570 }, { "distill_loss": 0.18027006089687347, "epoch": 4.5263509006004, "step": 13570 }, { "epoch": 4.5263509006004, "ref_ce_loss": 0.10872701555490494, "step": 13570 }, { "epoch": 4.5263509006004, "loss": 0.37513914704322815, "step": 13570 }, { "ce_loss": 0.05586402490735054, "epoch": 4.5263509006004, "step": 13570 }, { "distill_loss": 0.11506953835487366, "epoch": 4.5263509006004, "step": 13570 }, { "epoch": 4.5263509006004, "ref_ce_loss": 0.09433775395154953, "step": 13570 }, { "epoch": 4.5263509006004, "loss": 0.43863236904144287, "step": 13570 }, { "ce_loss": 0.06320168077945709, "epoch": 4.5263509006004, "step": 13570 }, { "distill_loss": 0.18999160826206207, "epoch": 4.5263509006004, "step": 13570 }, { "epoch": 4.5263509006004, "ref_ce_loss": 0.09951197355985641, "step": 13570 }, { "epoch": 4.5263509006004, "loss": 0.3416541814804077, "step": 13570 }, { "ce_loss": 0.07509807497262955, "epoch": 4.5263509006004, "step": 13570 }, { "distill_loss": 0.1782594621181488, "epoch": 4.5263509006004, "step": 13570 }, { "epoch": 4.5263509006004, "ref_ce_loss": 0.0882348045706749, "step": 13570 }, { "epoch": 4.529686457638426, "loss": 0.4296, "step": 13580 }, { "epoch": 4.529686457638426, "grad_norm": 1.9833866357803345, "step": 13580 }, { "epoch": 4.529686457638426, "learning_rate": 0.0001252393074228518, "step": 13580 }, { "epoch": 4.529686457638426, "loss": 0.3686028718948364, "step": 13580 }, { "ce_loss": 0.0907389372587204, "epoch": 4.529686457638426, "step": 13580 }, { "distill_loss": 0.14625293016433716, "epoch": 4.529686457638426, "step": 13580 }, { "epoch": 4.529686457638426, "ref_ce_loss": 0.07061794400215149, "step": 13580 }, { "epoch": 4.529686457638426, "loss": 0.8088022470474243, "step": 13580 }, { "ce_loss": 0.08733832836151123, "epoch": 4.529686457638426, "step": 13580 }, { "distill_loss": 0.2592051029205322, "epoch": 4.529686457638426, "step": 13580 }, { "epoch": 4.529686457638426, "ref_ce_loss": 0.08921769261360168, "step": 13580 }, { "epoch": 4.529686457638426, "loss": 0.4520493745803833, "step": 13580 }, { "ce_loss": 0.09981293976306915, "epoch": 4.529686457638426, "step": 13580 }, { "distill_loss": 0.22075609862804413, "epoch": 4.529686457638426, "step": 13580 }, { "epoch": 4.529686457638426, "ref_ce_loss": 0.06416413187980652, "step": 13580 }, { "epoch": 4.529686457638426, "loss": 0.33570596575737, "step": 13580 }, { "ce_loss": 0.0714380294084549, "epoch": 4.529686457638426, "step": 13580 }, { "distill_loss": 0.15616613626480103, "epoch": 4.529686457638426, "step": 13580 }, { "epoch": 4.529686457638426, "ref_ce_loss": 0.07128936797380447, "step": 13580 }, { "epoch": 4.533022014676451, "loss": 0.4352, "step": 13590 }, { "epoch": 4.533022014676451, "grad_norm": 2.4279701709747314, "step": 13590 }, { "epoch": 4.533022014676451, "learning_rate": 0.00012503954744667035, "step": 13590 }, { "epoch": 4.533022014676451, "loss": 0.31245681643486023, "step": 13590 }, { "ce_loss": 0.07602875679731369, "epoch": 4.533022014676451, "step": 13590 }, { "distill_loss": 0.1463480144739151, "epoch": 4.533022014676451, "step": 13590 }, { "epoch": 4.533022014676451, "ref_ce_loss": 0.06806331127882004, "step": 13590 }, { "epoch": 4.533022014676451, "loss": 0.3308563828468323, "step": 13590 }, { "ce_loss": 0.1174980103969574, "epoch": 4.533022014676451, "step": 13590 }, { "distill_loss": 0.10723531991243362, "epoch": 4.533022014676451, "step": 13590 }, { "epoch": 4.533022014676451, "ref_ce_loss": 0.08204618096351624, "step": 13590 }, { "epoch": 4.533022014676451, "loss": 0.3874821960926056, "step": 13590 }, { "ce_loss": 0.05842968076467514, "epoch": 4.533022014676451, "step": 13590 }, { "distill_loss": 0.21752431988716125, "epoch": 4.533022014676451, "step": 13590 }, { "epoch": 4.533022014676451, "ref_ce_loss": 0.11144692450761795, "step": 13590 }, { "epoch": 4.533022014676451, "loss": 0.5839598774909973, "step": 13590 }, { "ce_loss": 0.12129247188568115, "epoch": 4.533022014676451, "step": 13590 }, { "distill_loss": 0.3265639841556549, "epoch": 4.533022014676451, "step": 13590 }, { "epoch": 4.533022014676451, "ref_ce_loss": 0.09380409121513367, "step": 13590 }, { "epoch": 4.536357571714476, "loss": 0.4077, "step": 13600 }, { "epoch": 4.536357571714476, "grad_norm": 2.1981921195983887, "step": 13600 }, { "epoch": 4.536357571714476, "learning_rate": 0.00012483983298850832, "step": 13600 }, { "epoch": 4.536357571714476, "loss": 0.4827296733856201, "step": 13600 }, { "ce_loss": 0.11267001181840897, "epoch": 4.536357571714476, "step": 13600 }, { "distill_loss": 0.1192665547132492, "epoch": 4.536357571714476, "step": 13600 }, { "epoch": 4.536357571714476, "ref_ce_loss": 0.09567270427942276, "step": 13600 }, { "epoch": 4.536357571714476, "loss": 0.3647862374782562, "step": 13600 }, { "ce_loss": 0.14574190974235535, "epoch": 4.536357571714476, "step": 13600 }, { "distill_loss": 0.12533891201019287, "epoch": 4.536357571714476, "step": 13600 }, { "epoch": 4.536357571714476, "ref_ce_loss": 0.0631135031580925, "step": 13600 }, { "epoch": 4.536357571714476, "loss": 0.5031948089599609, "step": 13600 }, { "ce_loss": 0.07701995223760605, "epoch": 4.536357571714476, "step": 13600 }, { "distill_loss": 0.11162997782230377, "epoch": 4.536357571714476, "step": 13600 }, { "epoch": 4.536357571714476, "ref_ce_loss": 0.0998823344707489, "step": 13600 }, { "epoch": 4.536357571714476, "loss": 0.46570688486099243, "step": 13600 }, { "ce_loss": 0.16124644875526428, "epoch": 4.536357571714476, "step": 13600 }, { "distill_loss": 0.2057582139968872, "epoch": 4.536357571714476, "step": 13600 }, { "epoch": 4.536357571714476, "ref_ce_loss": 0.09859221428632736, "step": 13600 }, { "epoch": 4.539693128752502, "loss": 0.4824, "step": 13610 }, { "epoch": 4.539693128752502, "grad_norm": 3.6256258487701416, "step": 13610 }, { "epoch": 4.539693128752502, "learning_rate": 0.00012464016441256592, "step": 13610 }, { "epoch": 4.539693128752502, "loss": 0.330820232629776, "step": 13610 }, { "ce_loss": 0.10562053322792053, "epoch": 4.539693128752502, "step": 13610 }, { "distill_loss": 0.12647363543510437, "epoch": 4.539693128752502, "step": 13610 }, { "epoch": 4.539693128752502, "ref_ce_loss": 0.07756177335977554, "step": 13610 }, { "epoch": 4.539693128752502, "loss": 0.3412962555885315, "step": 13610 }, { "ce_loss": 0.09845242649316788, "epoch": 4.539693128752502, "step": 13610 }, { "distill_loss": 0.1366013139486313, "epoch": 4.539693128752502, "step": 13610 }, { "epoch": 4.539693128752502, "ref_ce_loss": 0.07171228528022766, "step": 13610 }, { "epoch": 4.539693128752502, "loss": 0.6043596863746643, "step": 13610 }, { "ce_loss": 0.1571023017168045, "epoch": 4.539693128752502, "step": 13610 }, { "distill_loss": 0.157630056142807, "epoch": 4.539693128752502, "step": 13610 }, { "epoch": 4.539693128752502, "ref_ce_loss": 0.11741932481527328, "step": 13610 }, { "epoch": 4.539693128752502, "loss": 0.3237883150577545, "step": 13610 }, { "ce_loss": 0.07001443207263947, "epoch": 4.539693128752502, "step": 13610 }, { "distill_loss": 0.13885082304477692, "epoch": 4.539693128752502, "step": 13610 }, { "epoch": 4.539693128752502, "ref_ce_loss": 0.08538435399532318, "step": 13610 }, { "epoch": 4.543028685790527, "loss": 0.4485, "step": 13620 }, { "epoch": 4.543028685790527, "grad_norm": 3.25905442237854, "step": 13620 }, { "epoch": 4.543028685790527, "learning_rate": 0.00012444054208296014, "step": 13620 }, { "epoch": 4.543028685790527, "loss": 0.3815009295940399, "step": 13620 }, { "ce_loss": 0.11263687163591385, "epoch": 4.543028685790527, "step": 13620 }, { "distill_loss": 0.13090786337852478, "epoch": 4.543028685790527, "step": 13620 }, { "epoch": 4.543028685790527, "ref_ce_loss": 0.07116122543811798, "step": 13620 }, { "epoch": 4.543028685790527, "loss": 0.39881637692451477, "step": 13620 }, { "ce_loss": 0.055635806173086166, "epoch": 4.543028685790527, "step": 13620 }, { "distill_loss": 0.20073339343070984, "epoch": 4.543028685790527, "step": 13620 }, { "epoch": 4.543028685790527, "ref_ce_loss": 0.0927748754620552, "step": 13620 }, { "epoch": 4.543028685790527, "loss": 0.30770015716552734, "step": 13620 }, { "ce_loss": 0.07468118518590927, "epoch": 4.543028685790527, "step": 13620 }, { "distill_loss": 0.11699899286031723, "epoch": 4.543028685790527, "step": 13620 }, { "epoch": 4.543028685790527, "ref_ce_loss": 0.06440125405788422, "step": 13620 }, { "epoch": 4.543028685790527, "loss": 0.3126216530799866, "step": 13620 }, { "ce_loss": 0.07507819682359695, "epoch": 4.543028685790527, "step": 13620 }, { "distill_loss": 0.09823939204216003, "epoch": 4.543028685790527, "step": 13620 }, { "epoch": 4.543028685790527, "ref_ce_loss": 0.05781750753521919, "step": 13620 }, { "epoch": 4.5463642428285524, "loss": 0.3936, "step": 13630 }, { "epoch": 4.5463642428285524, "grad_norm": 1.8271279335021973, "step": 13630 }, { "epoch": 4.5463642428285524, "learning_rate": 0.0001242409663637231, "step": 13630 }, { "epoch": 4.5463642428285524, "loss": 0.3869246542453766, "step": 13630 }, { "ce_loss": 0.11788243800401688, "epoch": 4.5463642428285524, "step": 13630 }, { "distill_loss": 0.13876157999038696, "epoch": 4.5463642428285524, "step": 13630 }, { "epoch": 4.5463642428285524, "ref_ce_loss": 0.09515149891376495, "step": 13630 }, { "epoch": 4.5463642428285524, "loss": 0.2891252040863037, "step": 13630 }, { "ce_loss": 0.053451113402843475, "epoch": 4.5463642428285524, "step": 13630 }, { "distill_loss": 0.1296810507774353, "epoch": 4.5463642428285524, "step": 13630 }, { "epoch": 4.5463642428285524, "ref_ce_loss": 0.05890681594610214, "step": 13630 }, { "epoch": 4.5463642428285524, "loss": 0.359307199716568, "step": 13630 }, { "ce_loss": 0.11626996845006943, "epoch": 4.5463642428285524, "step": 13630 }, { "distill_loss": 0.11499570310115814, "epoch": 4.5463642428285524, "step": 13630 }, { "epoch": 4.5463642428285524, "ref_ce_loss": 0.1020270511507988, "step": 13630 }, { "epoch": 4.5463642428285524, "loss": 0.2649209499359131, "step": 13630 }, { "ce_loss": 0.05239448696374893, "epoch": 4.5463642428285524, "step": 13630 }, { "distill_loss": 0.12976151704788208, "epoch": 4.5463642428285524, "step": 13630 }, { "epoch": 4.5463642428285524, "ref_ce_loss": 0.05968295782804489, "step": 13630 }, { "epoch": 4.549699799866578, "loss": 0.4434, "step": 13640 }, { "epoch": 4.549699799866578, "grad_norm": 3.0977251529693604, "step": 13640 }, { "epoch": 4.549699799866578, "learning_rate": 0.0001240414376188023, "step": 13640 }, { "epoch": 4.549699799866578, "loss": 0.3558688759803772, "step": 13640 }, { "ce_loss": 0.09094174206256866, "epoch": 4.549699799866578, "step": 13640 }, { "distill_loss": 0.11046818643808365, "epoch": 4.549699799866578, "step": 13640 }, { "epoch": 4.549699799866578, "ref_ce_loss": 0.08839097619056702, "step": 13640 }, { "epoch": 4.549699799866578, "loss": 0.7989137172698975, "step": 13640 }, { "ce_loss": 0.1515854001045227, "epoch": 4.549699799866578, "step": 13640 }, { "distill_loss": 0.16707676649093628, "epoch": 4.549699799866578, "step": 13640 }, { "epoch": 4.549699799866578, "ref_ce_loss": 0.08309073746204376, "step": 13640 }, { "epoch": 4.549699799866578, "loss": 0.37850069999694824, "step": 13640 }, { "ce_loss": 0.13884668052196503, "epoch": 4.549699799866578, "step": 13640 }, { "distill_loss": 0.16242651641368866, "epoch": 4.549699799866578, "step": 13640 }, { "epoch": 4.549699799866578, "ref_ce_loss": 0.07711191475391388, "step": 13640 }, { "epoch": 4.549699799866578, "loss": 0.37308332324028015, "step": 13640 }, { "ce_loss": 0.028073173016309738, "epoch": 4.549699799866578, "step": 13640 }, { "distill_loss": 0.10945470631122589, "epoch": 4.549699799866578, "step": 13640 }, { "epoch": 4.549699799866578, "ref_ce_loss": 0.08089083433151245, "step": 13640 }, { "epoch": 4.553035356904603, "loss": 0.4421, "step": 13650 }, { "epoch": 4.553035356904603, "grad_norm": 2.5074403285980225, "step": 13650 }, { "epoch": 4.553035356904603, "learning_rate": 0.0001238419562120596, "step": 13650 }, { "epoch": 4.553035356904603, "loss": 0.25730282068252563, "step": 13650 }, { "ce_loss": 0.03727759048342705, "epoch": 4.553035356904603, "step": 13650 }, { "distill_loss": 0.12253627926111221, "epoch": 4.553035356904603, "step": 13650 }, { "epoch": 4.553035356904603, "ref_ce_loss": 0.05770736187696457, "step": 13650 }, { "epoch": 4.553035356904603, "loss": 0.513620138168335, "step": 13650 }, { "ce_loss": 0.10222535580396652, "epoch": 4.553035356904603, "step": 13650 }, { "distill_loss": 0.25187408924102783, "epoch": 4.553035356904603, "step": 13650 }, { "epoch": 4.553035356904603, "ref_ce_loss": 0.1170971617102623, "step": 13650 }, { "epoch": 4.553035356904603, "loss": 0.5014258623123169, "step": 13650 }, { "ce_loss": 0.2105833888053894, "epoch": 4.553035356904603, "step": 13650 }, { "distill_loss": 0.16550350189208984, "epoch": 4.553035356904603, "step": 13650 }, { "epoch": 4.553035356904603, "ref_ce_loss": 0.09165210276842117, "step": 13650 }, { "epoch": 4.553035356904603, "loss": 0.567755937576294, "step": 13650 }, { "ce_loss": 0.1038329005241394, "epoch": 4.553035356904603, "step": 13650 }, { "distill_loss": 0.31333285570144653, "epoch": 4.553035356904603, "step": 13650 }, { "epoch": 4.553035356904603, "ref_ce_loss": 0.12102946639060974, "step": 13650 }, { "epoch": 4.5563709139426285, "loss": 0.4451, "step": 13660 }, { "epoch": 4.5563709139426285, "grad_norm": 3.6462759971618652, "step": 13660 }, { "epoch": 4.5563709139426285, "learning_rate": 0.00012364252250727012, "step": 13660 }, { "epoch": 4.5563709139426285, "loss": 0.6694972515106201, "step": 13660 }, { "ce_loss": 0.13008111715316772, "epoch": 4.5563709139426285, "step": 13660 }, { "distill_loss": 0.14958806335926056, "epoch": 4.5563709139426285, "step": 13660 }, { "epoch": 4.5563709139426285, "ref_ce_loss": 0.13347876071929932, "step": 13660 }, { "epoch": 4.5563709139426285, "loss": 0.5252541899681091, "step": 13660 }, { "ce_loss": 0.13631699979305267, "epoch": 4.5563709139426285, "step": 13660 }, { "distill_loss": 0.13244640827178955, "epoch": 4.5563709139426285, "step": 13660 }, { "epoch": 4.5563709139426285, "ref_ce_loss": 0.07643939554691315, "step": 13660 }, { "epoch": 4.5563709139426285, "loss": 0.7412728071212769, "step": 13660 }, { "ce_loss": 0.0762416198849678, "epoch": 4.5563709139426285, "step": 13660 }, { "distill_loss": 0.18756809830665588, "epoch": 4.5563709139426285, "step": 13660 }, { "epoch": 4.5563709139426285, "ref_ce_loss": 0.0861452966928482, "step": 13660 }, { "epoch": 4.5563709139426285, "loss": 0.30000126361846924, "step": 13660 }, { "ce_loss": 0.056661494076251984, "epoch": 4.5563709139426285, "step": 13660 }, { "distill_loss": 0.11866292357444763, "epoch": 4.5563709139426285, "step": 13660 }, { "epoch": 4.5563709139426285, "ref_ce_loss": 0.06830247491598129, "step": 13660 }, { "epoch": 4.559706470980654, "loss": 0.4842, "step": 13670 }, { "epoch": 4.559706470980654, "grad_norm": 3.67041277885437, "step": 13670 }, { "epoch": 4.559706470980654, "learning_rate": 0.00012344313686812248, "step": 13670 }, { "epoch": 4.559706470980654, "loss": 0.41302490234375, "step": 13670 }, { "ce_loss": 0.1367223709821701, "epoch": 4.559706470980654, "step": 13670 }, { "distill_loss": 0.15959084033966064, "epoch": 4.559706470980654, "step": 13670 }, { "epoch": 4.559706470980654, "ref_ce_loss": 0.0791315883398056, "step": 13670 }, { "epoch": 4.559706470980654, "loss": 0.3104380667209625, "step": 13670 }, { "ce_loss": 0.05096364766359329, "epoch": 4.559706470980654, "step": 13670 }, { "distill_loss": 0.1786295473575592, "epoch": 4.559706470980654, "step": 13670 }, { "epoch": 4.559706470980654, "ref_ce_loss": 0.06254564225673676, "step": 13670 }, { "epoch": 4.559706470980654, "loss": 0.5430464148521423, "step": 13670 }, { "ce_loss": 0.06723407655954361, "epoch": 4.559706470980654, "step": 13670 }, { "distill_loss": 0.13717561960220337, "epoch": 4.559706470980654, "step": 13670 }, { "epoch": 4.559706470980654, "ref_ce_loss": 0.079865463078022, "step": 13670 }, { "epoch": 4.559706470980654, "loss": 0.4139541685581207, "step": 13670 }, { "ce_loss": 0.10406375676393509, "epoch": 4.559706470980654, "step": 13670 }, { "distill_loss": 0.1822187751531601, "epoch": 4.559706470980654, "step": 13670 }, { "epoch": 4.559706470980654, "ref_ce_loss": 0.09676679968833923, "step": 13670 }, { "epoch": 4.563042028018679, "loss": 0.4609, "step": 13680 }, { "epoch": 4.563042028018679, "grad_norm": 2.727271795272827, "step": 13680 }, { "epoch": 4.563042028018679, "learning_rate": 0.00012324379965821734, "step": 13680 }, { "epoch": 4.563042028018679, "loss": 0.5735003352165222, "step": 13680 }, { "ce_loss": 0.07929814606904984, "epoch": 4.563042028018679, "step": 13680 }, { "distill_loss": 0.1739797592163086, "epoch": 4.563042028018679, "step": 13680 }, { "epoch": 4.563042028018679, "ref_ce_loss": 0.10339117795228958, "step": 13680 }, { "epoch": 4.563042028018679, "loss": 0.33152449131011963, "step": 13680 }, { "ce_loss": 0.1058422103524208, "epoch": 4.563042028018679, "step": 13680 }, { "distill_loss": 0.11422500759363174, "epoch": 4.563042028018679, "step": 13680 }, { "epoch": 4.563042028018679, "ref_ce_loss": 0.08426758646965027, "step": 13680 }, { "epoch": 4.563042028018679, "loss": 0.5023914575576782, "step": 13680 }, { "ce_loss": 0.12481412291526794, "epoch": 4.563042028018679, "step": 13680 }, { "distill_loss": 0.16830413043498993, "epoch": 4.563042028018679, "step": 13680 }, { "epoch": 4.563042028018679, "ref_ce_loss": 0.09582826495170593, "step": 13680 }, { "epoch": 4.563042028018679, "loss": 0.3103086054325104, "step": 13680 }, { "ce_loss": 0.08641095459461212, "epoch": 4.563042028018679, "step": 13680 }, { "distill_loss": 0.14794382452964783, "epoch": 4.563042028018679, "step": 13680 }, { "epoch": 4.563042028018679, "ref_ce_loss": 0.07582417875528336, "step": 13680 }, { "epoch": 4.5663775850567045, "loss": 0.4606, "step": 13690 }, { "epoch": 4.5663775850567045, "grad_norm": 10.296592712402344, "step": 13690 }, { "epoch": 4.5663775850567045, "learning_rate": 0.00012304451124106716, "step": 13690 }, { "epoch": 4.5663775850567045, "loss": 0.5351616144180298, "step": 13690 }, { "ce_loss": 0.22443951666355133, "epoch": 4.5663775850567045, "step": 13690 }, { "distill_loss": 0.1858910769224167, "epoch": 4.5663775850567045, "step": 13690 }, { "epoch": 4.5663775850567045, "ref_ce_loss": 0.0964193195104599, "step": 13690 }, { "epoch": 4.5663775850567045, "loss": 0.42722010612487793, "step": 13690 }, { "ce_loss": 0.15494583547115326, "epoch": 4.5663775850567045, "step": 13690 }, { "distill_loss": 0.14783549308776855, "epoch": 4.5663775850567045, "step": 13690 }, { "epoch": 4.5663775850567045, "ref_ce_loss": 0.08811517059803009, "step": 13690 }, { "epoch": 4.5663775850567045, "loss": 0.4499038755893707, "step": 13690 }, { "ce_loss": 0.08879373222589493, "epoch": 4.5663775850567045, "step": 13690 }, { "distill_loss": 0.17907992005348206, "epoch": 4.5663775850567045, "step": 13690 }, { "epoch": 4.5663775850567045, "ref_ce_loss": 0.0889173224568367, "step": 13690 }, { "epoch": 4.5663775850567045, "loss": 0.6208651065826416, "step": 13690 }, { "ce_loss": 0.12071339040994644, "epoch": 4.5663775850567045, "step": 13690 }, { "distill_loss": 0.14878807961940765, "epoch": 4.5663775850567045, "step": 13690 }, { "epoch": 4.5663775850567045, "ref_ce_loss": 0.07070641219615936, "step": 13690 }, { "epoch": 4.56971314209473, "loss": 0.4358, "step": 13700 }, { "epoch": 4.56971314209473, "grad_norm": 3.6545889377593994, "step": 13700 }, { "epoch": 4.56971314209473, "learning_rate": 0.00012284527198009543, "step": 13700 }, { "epoch": 4.56971314209473, "loss": 0.43364912271499634, "step": 13700 }, { "ce_loss": 0.09774000942707062, "epoch": 4.56971314209473, "step": 13700 }, { "distill_loss": 0.13901299238204956, "epoch": 4.56971314209473, "step": 13700 }, { "epoch": 4.56971314209473, "ref_ce_loss": 0.08951815217733383, "step": 13700 }, { "epoch": 4.56971314209473, "loss": 0.5613880753517151, "step": 13700 }, { "ce_loss": 0.09675523638725281, "epoch": 4.56971314209473, "step": 13700 }, { "distill_loss": 0.23557351529598236, "epoch": 4.56971314209473, "step": 13700 }, { "epoch": 4.56971314209473, "ref_ce_loss": 0.08532778918743134, "step": 13700 }, { "epoch": 4.56971314209473, "loss": 0.245731383562088, "step": 13700 }, { "ce_loss": 0.017872940748929977, "epoch": 4.56971314209473, "step": 13700 }, { "distill_loss": 0.08782623708248138, "epoch": 4.56971314209473, "step": 13700 }, { "epoch": 4.56971314209473, "ref_ce_loss": 0.056050386279821396, "step": 13700 }, { "epoch": 4.56971314209473, "loss": 0.30002671480178833, "step": 13700 }, { "ce_loss": 0.06583509594202042, "epoch": 4.56971314209473, "step": 13700 }, { "distill_loss": 0.1712363362312317, "epoch": 4.56971314209473, "step": 13700 }, { "epoch": 4.56971314209473, "ref_ce_loss": 0.06252934783697128, "step": 13700 }, { "epoch": 4.573048699132755, "loss": 0.4209, "step": 13710 }, { "epoch": 4.573048699132755, "grad_norm": 2.177154064178467, "step": 13710 }, { "epoch": 4.573048699132755, "learning_rate": 0.00012264608223863592, "step": 13710 }, { "epoch": 4.573048699132755, "loss": 0.4023416042327881, "step": 13710 }, { "ce_loss": 0.140888050198555, "epoch": 4.573048699132755, "step": 13710 }, { "distill_loss": 0.12739530205726624, "epoch": 4.573048699132755, "step": 13710 }, { "epoch": 4.573048699132755, "ref_ce_loss": 0.1103469580411911, "step": 13710 }, { "epoch": 4.573048699132755, "loss": 0.5931873917579651, "step": 13710 }, { "ce_loss": 0.10426094383001328, "epoch": 4.573048699132755, "step": 13710 }, { "distill_loss": 0.2250000536441803, "epoch": 4.573048699132755, "step": 13710 }, { "epoch": 4.573048699132755, "ref_ce_loss": 0.12573935091495514, "step": 13710 }, { "epoch": 4.573048699132755, "loss": 0.35033118724823, "step": 13710 }, { "ce_loss": 0.07681325823068619, "epoch": 4.573048699132755, "step": 13710 }, { "distill_loss": 0.19096848368644714, "epoch": 4.573048699132755, "step": 13710 }, { "epoch": 4.573048699132755, "ref_ce_loss": 0.0824156329035759, "step": 13710 }, { "epoch": 4.573048699132755, "loss": 0.47160404920578003, "step": 13710 }, { "ce_loss": 0.13833743333816528, "epoch": 4.573048699132755, "step": 13710 }, { "distill_loss": 0.219387024641037, "epoch": 4.573048699132755, "step": 13710 }, { "epoch": 4.573048699132755, "ref_ce_loss": 0.08718808740377426, "step": 13710 }, { "epoch": 4.576384256170781, "loss": 0.4349, "step": 13720 }, { "epoch": 4.576384256170781, "grad_norm": 2.8919193744659424, "step": 13720 }, { "epoch": 4.576384256170781, "learning_rate": 0.00012244694237993216, "step": 13720 }, { "epoch": 4.576384256170781, "loss": 0.39218464493751526, "step": 13720 }, { "ce_loss": 0.14482086896896362, "epoch": 4.576384256170781, "step": 13720 }, { "distill_loss": 0.15278075635433197, "epoch": 4.576384256170781, "step": 13720 }, { "epoch": 4.576384256170781, "ref_ce_loss": 0.05385451018810272, "step": 13720 }, { "epoch": 4.576384256170781, "loss": 0.8182839155197144, "step": 13720 }, { "ce_loss": 0.10439430177211761, "epoch": 4.576384256170781, "step": 13720 }, { "distill_loss": 0.16038022935390472, "epoch": 4.576384256170781, "step": 13720 }, { "epoch": 4.576384256170781, "ref_ce_loss": 0.09227900952100754, "step": 13720 }, { "epoch": 4.576384256170781, "loss": 0.6853447556495667, "step": 13720 }, { "ce_loss": 0.07551288604736328, "epoch": 4.576384256170781, "step": 13720 }, { "distill_loss": 0.15994273126125336, "epoch": 4.576384256170781, "step": 13720 }, { "epoch": 4.576384256170781, "ref_ce_loss": 0.1423223465681076, "step": 13720 }, { "epoch": 4.576384256170781, "loss": 0.303835928440094, "step": 13720 }, { "ce_loss": 0.062204133719205856, "epoch": 4.576384256170781, "step": 13720 }, { "distill_loss": 0.10961860418319702, "epoch": 4.576384256170781, "step": 13720 }, { "epoch": 4.576384256170781, "ref_ce_loss": 0.07964453846216202, "step": 13720 }, { "epoch": 4.579719813208806, "loss": 0.3986, "step": 13730 }, { "epoch": 4.579719813208806, "grad_norm": 1.9763175249099731, "step": 13730 }, { "epoch": 4.579719813208806, "learning_rate": 0.00012224785276713674, "step": 13730 }, { "epoch": 4.579719813208806, "loss": 0.35807281732559204, "step": 13730 }, { "ce_loss": 0.08664276450872421, "epoch": 4.579719813208806, "step": 13730 }, { "distill_loss": 0.14837504923343658, "epoch": 4.579719813208806, "step": 13730 }, { "epoch": 4.579719813208806, "ref_ce_loss": 0.07494465261697769, "step": 13730 }, { "epoch": 4.579719813208806, "loss": 0.2507583200931549, "step": 13730 }, { "ce_loss": 0.05459444224834442, "epoch": 4.579719813208806, "step": 13730 }, { "distill_loss": 0.10137300193309784, "epoch": 4.579719813208806, "step": 13730 }, { "epoch": 4.579719813208806, "ref_ce_loss": 0.09378074109554291, "step": 13730 }, { "epoch": 4.579719813208806, "loss": 0.4497610330581665, "step": 13730 }, { "ce_loss": 0.12088881433010101, "epoch": 4.579719813208806, "step": 13730 }, { "distill_loss": 0.18141990900039673, "epoch": 4.579719813208806, "step": 13730 }, { "epoch": 4.579719813208806, "ref_ce_loss": 0.11048609763383865, "step": 13730 }, { "epoch": 4.579719813208806, "loss": 0.46002933382987976, "step": 13730 }, { "ce_loss": 0.15615512430667877, "epoch": 4.579719813208806, "step": 13730 }, { "distill_loss": 0.14017333090305328, "epoch": 4.579719813208806, "step": 13730 }, { "epoch": 4.579719813208806, "ref_ce_loss": 0.08091697841882706, "step": 13730 }, { "epoch": 4.583055370246831, "loss": 0.4443, "step": 13740 }, { "epoch": 4.583055370246831, "grad_norm": 1.916420340538025, "step": 13740 }, { "epoch": 4.583055370246831, "learning_rate": 0.00012204881376331049, "step": 13740 }, { "epoch": 4.583055370246831, "loss": 0.36558467149734497, "step": 13740 }, { "ce_loss": 0.07916125655174255, "epoch": 4.583055370246831, "step": 13740 }, { "distill_loss": 0.16772039234638214, "epoch": 4.583055370246831, "step": 13740 }, { "epoch": 4.583055370246831, "ref_ce_loss": 0.06706126779317856, "step": 13740 }, { "epoch": 4.583055370246831, "loss": 0.4626239538192749, "step": 13740 }, { "ce_loss": 0.09908965229988098, "epoch": 4.583055370246831, "step": 13740 }, { "distill_loss": 0.14869137108325958, "epoch": 4.583055370246831, "step": 13740 }, { "epoch": 4.583055370246831, "ref_ce_loss": 0.14961369335651398, "step": 13740 }, { "epoch": 4.583055370246831, "loss": 0.6350833773612976, "step": 13740 }, { "ce_loss": 0.09164462983608246, "epoch": 4.583055370246831, "step": 13740 }, { "distill_loss": 0.13121801614761353, "epoch": 4.583055370246831, "step": 13740 }, { "epoch": 4.583055370246831, "ref_ce_loss": 0.12443569302558899, "step": 13740 }, { "epoch": 4.583055370246831, "loss": 0.25773707032203674, "step": 13740 }, { "ce_loss": 0.0695008710026741, "epoch": 4.583055370246831, "step": 13740 }, { "distill_loss": 0.11608363687992096, "epoch": 4.583055370246831, "step": 13740 }, { "epoch": 4.583055370246831, "ref_ce_loss": 0.04488598555326462, "step": 13740 }, { "epoch": 4.586390927284857, "loss": 0.475, "step": 13750 }, { "epoch": 4.586390927284857, "grad_norm": 3.7855210304260254, "step": 13750 }, { "epoch": 4.586390927284857, "learning_rate": 0.00012184982573142215, "step": 13750 }, { "epoch": 4.586390927284857, "loss": 0.4133288264274597, "step": 13750 }, { "ce_loss": 0.09249895066022873, "epoch": 4.586390927284857, "step": 13750 }, { "distill_loss": 0.1350885033607483, "epoch": 4.586390927284857, "step": 13750 }, { "epoch": 4.586390927284857, "ref_ce_loss": 0.09329091012477875, "step": 13750 }, { "epoch": 4.586390927284857, "loss": 0.2725394070148468, "step": 13750 }, { "ce_loss": 0.057737085968256, "epoch": 4.586390927284857, "step": 13750 }, { "distill_loss": 0.12318053841590881, "epoch": 4.586390927284857, "step": 13750 }, { "epoch": 4.586390927284857, "ref_ce_loss": 0.06361659616231918, "step": 13750 }, { "epoch": 4.586390927284857, "loss": 0.2811312675476074, "step": 13750 }, { "ce_loss": 0.04427865892648697, "epoch": 4.586390927284857, "step": 13750 }, { "distill_loss": 0.13684555888175964, "epoch": 4.586390927284857, "step": 13750 }, { "epoch": 4.586390927284857, "ref_ce_loss": 0.05077182129025459, "step": 13750 }, { "epoch": 4.586390927284857, "loss": 0.6474969387054443, "step": 13750 }, { "ce_loss": 0.08332732319831848, "epoch": 4.586390927284857, "step": 13750 }, { "distill_loss": 0.13889549672603607, "epoch": 4.586390927284857, "step": 13750 }, { "epoch": 4.586390927284857, "ref_ce_loss": 0.06022493168711662, "step": 13750 }, { "epoch": 4.589726484322882, "loss": 0.4507, "step": 13760 }, { "epoch": 4.589726484322882, "grad_norm": 2.70395565032959, "step": 13760 }, { "epoch": 4.589726484322882, "learning_rate": 0.00012165088903434731, "step": 13760 }, { "epoch": 4.589726484322882, "loss": 0.19448912143707275, "step": 13760 }, { "ce_loss": 0.011985452845692635, "epoch": 4.589726484322882, "step": 13760 }, { "distill_loss": 0.0980040431022644, "epoch": 4.589726484322882, "step": 13760 }, { "epoch": 4.589726484322882, "ref_ce_loss": 0.08422990143299103, "step": 13760 }, { "epoch": 4.589726484322882, "loss": 0.511165201663971, "step": 13760 }, { "ce_loss": 0.17377831041812897, "epoch": 4.589726484322882, "step": 13760 }, { "distill_loss": 0.1677647829055786, "epoch": 4.589726484322882, "step": 13760 }, { "epoch": 4.589726484322882, "ref_ce_loss": 0.10331190377473831, "step": 13760 }, { "epoch": 4.589726484322882, "loss": 0.5306928157806396, "step": 13760 }, { "ce_loss": 0.17135898768901825, "epoch": 4.589726484322882, "step": 13760 }, { "distill_loss": 0.20471972227096558, "epoch": 4.589726484322882, "step": 13760 }, { "epoch": 4.589726484322882, "ref_ce_loss": 0.15429776906967163, "step": 13760 }, { "epoch": 4.589726484322882, "loss": 0.5309099555015564, "step": 13760 }, { "ce_loss": 0.17451880872249603, "epoch": 4.589726484322882, "step": 13760 }, { "distill_loss": 0.2045009434223175, "epoch": 4.589726484322882, "step": 13760 }, { "epoch": 4.589726484322882, "ref_ce_loss": 0.11921660602092743, "step": 13760 }, { "epoch": 4.593062041360907, "loss": 0.4912, "step": 13770 }, { "epoch": 4.593062041360907, "grad_norm": 3.7819948196411133, "step": 13770 }, { "epoch": 4.593062041360907, "learning_rate": 0.00012145200403486805, "step": 13770 }, { "epoch": 4.593062041360907, "loss": 0.6235246062278748, "step": 13770 }, { "ce_loss": 0.12689945101737976, "epoch": 4.593062041360907, "step": 13770 }, { "distill_loss": 0.17795231938362122, "epoch": 4.593062041360907, "step": 13770 }, { "epoch": 4.593062041360907, "ref_ce_loss": 0.10836771130561829, "step": 13770 }, { "epoch": 4.593062041360907, "loss": 0.2900228798389435, "step": 13770 }, { "ce_loss": 0.053863294422626495, "epoch": 4.593062041360907, "step": 13770 }, { "distill_loss": 0.13948102295398712, "epoch": 4.593062041360907, "step": 13770 }, { "epoch": 4.593062041360907, "ref_ce_loss": 0.09636963158845901, "step": 13770 }, { "epoch": 4.593062041360907, "loss": 0.5091549158096313, "step": 13770 }, { "ce_loss": 0.08481498062610626, "epoch": 4.593062041360907, "step": 13770 }, { "distill_loss": 0.30344074964523315, "epoch": 4.593062041360907, "step": 13770 }, { "epoch": 4.593062041360907, "ref_ce_loss": 0.12071350961923599, "step": 13770 }, { "epoch": 4.593062041360907, "loss": 0.5841506719589233, "step": 13770 }, { "ce_loss": 0.1180991530418396, "epoch": 4.593062041360907, "step": 13770 }, { "distill_loss": 0.21680428087711334, "epoch": 4.593062041360907, "step": 13770 }, { "epoch": 4.593062041360907, "ref_ce_loss": 0.11102721095085144, "step": 13770 }, { "epoch": 4.596397598398933, "loss": 0.48, "step": 13780 }, { "epoch": 4.596397598398933, "grad_norm": 2.3747878074645996, "step": 13780 }, { "epoch": 4.596397598398933, "learning_rate": 0.00012125317109567219, "step": 13780 }, { "epoch": 4.596397598398933, "loss": 0.7505519390106201, "step": 13780 }, { "ce_loss": 0.1606123447418213, "epoch": 4.596397598398933, "step": 13780 }, { "distill_loss": 0.16520407795906067, "epoch": 4.596397598398933, "step": 13780 }, { "epoch": 4.596397598398933, "ref_ce_loss": 0.08397366106510162, "step": 13780 }, { "epoch": 4.596397598398933, "loss": 0.3478164076805115, "step": 13780 }, { "ce_loss": 0.07152732461690903, "epoch": 4.596397598398933, "step": 13780 }, { "distill_loss": 0.1993105709552765, "epoch": 4.596397598398933, "step": 13780 }, { "epoch": 4.596397598398933, "ref_ce_loss": 0.07670750468969345, "step": 13780 }, { "epoch": 4.596397598398933, "loss": 0.5030295252799988, "step": 13780 }, { "ce_loss": 0.06702382862567902, "epoch": 4.596397598398933, "step": 13780 }, { "distill_loss": 0.10921978205442429, "epoch": 4.596397598398933, "step": 13780 }, { "epoch": 4.596397598398933, "ref_ce_loss": 0.10053457319736481, "step": 13780 }, { "epoch": 4.596397598398933, "loss": 0.446050763130188, "step": 13780 }, { "ce_loss": 0.1185617446899414, "epoch": 4.596397598398933, "step": 13780 }, { "distill_loss": 0.1715582013130188, "epoch": 4.596397598398933, "step": 13780 }, { "epoch": 4.596397598398933, "ref_ce_loss": 0.08671645820140839, "step": 13780 }, { "epoch": 4.599733155436958, "loss": 0.4124, "step": 13790 }, { "epoch": 4.599733155436958, "grad_norm": 3.6794545650482178, "step": 13790 }, { "epoch": 4.599733155436958, "learning_rate": 0.00012105439057935254, "step": 13790 }, { "epoch": 4.599733155436958, "loss": 0.3869558274745941, "step": 13790 }, { "ce_loss": 0.09533415734767914, "epoch": 4.599733155436958, "step": 13790 }, { "distill_loss": 0.15595683455467224, "epoch": 4.599733155436958, "step": 13790 }, { "epoch": 4.599733155436958, "ref_ce_loss": 0.10946252197027206, "step": 13790 }, { "epoch": 4.599733155436958, "loss": 0.4473382830619812, "step": 13790 }, { "ce_loss": 0.11866164207458496, "epoch": 4.599733155436958, "step": 13790 }, { "distill_loss": 0.21954916417598724, "epoch": 4.599733155436958, "step": 13790 }, { "epoch": 4.599733155436958, "ref_ce_loss": 0.10898900777101517, "step": 13790 }, { "epoch": 4.599733155436958, "loss": 0.23668749630451202, "step": 13790 }, { "ce_loss": 0.0502098873257637, "epoch": 4.599733155436958, "step": 13790 }, { "distill_loss": 0.10353440046310425, "epoch": 4.599733155436958, "step": 13790 }, { "epoch": 4.599733155436958, "ref_ce_loss": 0.047333549708127975, "step": 13790 }, { "epoch": 4.599733155436958, "loss": 0.5408918857574463, "step": 13790 }, { "ce_loss": 0.12211053818464279, "epoch": 4.599733155436958, "step": 13790 }, { "distill_loss": 0.15445077419281006, "epoch": 4.599733155436958, "step": 13790 }, { "epoch": 4.599733155436958, "ref_ce_loss": 0.08605307340621948, "step": 13790 }, { "epoch": 4.603068712474983, "loss": 0.4387, "step": 13800 }, { "epoch": 4.603068712474983, "grad_norm": 3.229055404663086, "step": 13800 }, { "epoch": 4.603068712474983, "learning_rate": 0.00012085566284840637, "step": 13800 }, { "epoch": 4.603068712474983, "loss": 0.2670734226703644, "step": 13800 }, { "ce_loss": 0.010470490902662277, "epoch": 4.603068712474983, "step": 13800 }, { "distill_loss": 0.0766136422753334, "epoch": 4.603068712474983, "step": 13800 }, { "epoch": 4.603068712474983, "ref_ce_loss": 0.05777127668261528, "step": 13800 }, { "epoch": 4.603068712474983, "loss": 0.40359926223754883, "step": 13800 }, { "ce_loss": 0.19126923382282257, "epoch": 4.603068712474983, "step": 13800 }, { "distill_loss": 0.1276741325855255, "epoch": 4.603068712474983, "step": 13800 }, { "epoch": 4.603068712474983, "ref_ce_loss": 0.08455155044794083, "step": 13800 }, { "epoch": 4.603068712474983, "loss": 0.7321145534515381, "step": 13800 }, { "ce_loss": 0.13351599872112274, "epoch": 4.603068712474983, "step": 13800 }, { "distill_loss": 0.18421530723571777, "epoch": 4.603068712474983, "step": 13800 }, { "epoch": 4.603068712474983, "ref_ce_loss": 0.09967406839132309, "step": 13800 }, { "epoch": 4.603068712474983, "loss": 0.3720538318157196, "step": 13800 }, { "ce_loss": 0.11610039323568344, "epoch": 4.603068712474983, "step": 13800 }, { "distill_loss": 0.14120493829250336, "epoch": 4.603068712474983, "step": 13800 }, { "epoch": 4.603068712474983, "ref_ce_loss": 0.11461121588945389, "step": 13800 }, { "epoch": 4.606404269513009, "loss": 0.4275, "step": 13810 }, { "epoch": 4.606404269513009, "grad_norm": 3.0860586166381836, "step": 13810 }, { "epoch": 4.606404269513009, "learning_rate": 0.00012065698826523464, "step": 13810 }, { "epoch": 4.606404269513009, "loss": 0.5441614985466003, "step": 13810 }, { "ce_loss": 0.14497758448123932, "epoch": 4.606404269513009, "step": 13810 }, { "distill_loss": 0.14539915323257446, "epoch": 4.606404269513009, "step": 13810 }, { "epoch": 4.606404269513009, "ref_ce_loss": 0.0962267741560936, "step": 13810 }, { "epoch": 4.606404269513009, "loss": 0.2371405065059662, "step": 13810 }, { "ce_loss": 0.05888356268405914, "epoch": 4.606404269513009, "step": 13810 }, { "distill_loss": 0.10163218528032303, "epoch": 4.606404269513009, "step": 13810 }, { "epoch": 4.606404269513009, "ref_ce_loss": 0.0764048844575882, "step": 13810 }, { "epoch": 4.606404269513009, "loss": 0.35190001130104065, "step": 13810 }, { "ce_loss": 0.1194930449128151, "epoch": 4.606404269513009, "step": 13810 }, { "distill_loss": 0.11340652406215668, "epoch": 4.606404269513009, "step": 13810 }, { "epoch": 4.606404269513009, "ref_ce_loss": 0.08773592859506607, "step": 13810 }, { "epoch": 4.606404269513009, "loss": 0.3663141429424286, "step": 13810 }, { "ce_loss": 0.11494455486536026, "epoch": 4.606404269513009, "step": 13810 }, { "distill_loss": 0.1567523181438446, "epoch": 4.606404269513009, "step": 13810 }, { "epoch": 4.606404269513009, "ref_ce_loss": 0.07022814452648163, "step": 13810 }, { "epoch": 4.609739826551034, "loss": 0.4339, "step": 13820 }, { "epoch": 4.609739826551034, "grad_norm": 4.00881290435791, "step": 13820 }, { "epoch": 4.609739826551034, "learning_rate": 0.00012045836719214144, "step": 13820 }, { "epoch": 4.609739826551034, "loss": 0.4647110402584076, "step": 13820 }, { "ce_loss": 0.12648433446884155, "epoch": 4.609739826551034, "step": 13820 }, { "distill_loss": 0.19175873696804047, "epoch": 4.609739826551034, "step": 13820 }, { "epoch": 4.609739826551034, "ref_ce_loss": 0.10510455071926117, "step": 13820 }, { "epoch": 4.609739826551034, "loss": 0.31502780318260193, "step": 13820 }, { "ce_loss": 0.06370996683835983, "epoch": 4.609739826551034, "step": 13820 }, { "distill_loss": 0.11913012713193893, "epoch": 4.609739826551034, "step": 13820 }, { "epoch": 4.609739826551034, "ref_ce_loss": 0.09854836761951447, "step": 13820 }, { "epoch": 4.609739826551034, "loss": 0.2350054383277893, "step": 13820 }, { "ce_loss": 0.0437438003718853, "epoch": 4.609739826551034, "step": 13820 }, { "distill_loss": 0.13966906070709229, "epoch": 4.609739826551034, "step": 13820 }, { "epoch": 4.609739826551034, "ref_ce_loss": 0.05152058228850365, "step": 13820 }, { "epoch": 4.609739826551034, "loss": 0.5847965478897095, "step": 13820 }, { "ce_loss": 0.059979137033224106, "epoch": 4.609739826551034, "step": 13820 }, { "distill_loss": 0.1415148377418518, "epoch": 4.609739826551034, "step": 13820 }, { "epoch": 4.609739826551034, "ref_ce_loss": 0.07900790870189667, "step": 13820 }, { "epoch": 4.613075383589059, "loss": 0.4223, "step": 13830 }, { "epoch": 4.613075383589059, "grad_norm": 2.820600986480713, "step": 13830 }, { "epoch": 4.613075383589059, "learning_rate": 0.00012025979999133331, "step": 13830 }, { "epoch": 4.613075383589059, "loss": 0.4595937728881836, "step": 13830 }, { "ce_loss": 0.1749647557735443, "epoch": 4.613075383589059, "step": 13830 }, { "distill_loss": 0.13943815231323242, "epoch": 4.613075383589059, "step": 13830 }, { "epoch": 4.613075383589059, "ref_ce_loss": 0.12155468761920929, "step": 13830 }, { "epoch": 4.613075383589059, "loss": 0.6600459218025208, "step": 13830 }, { "ce_loss": 0.06968953460454941, "epoch": 4.613075383589059, "step": 13830 }, { "distill_loss": 0.14609262347221375, "epoch": 4.613075383589059, "step": 13830 }, { "epoch": 4.613075383589059, "ref_ce_loss": 0.08199404180049896, "step": 13830 }, { "epoch": 4.613075383589059, "loss": 0.33543717861175537, "step": 13830 }, { "ce_loss": 0.05783466994762421, "epoch": 4.613075383589059, "step": 13830 }, { "distill_loss": 0.11310224235057831, "epoch": 4.613075383589059, "step": 13830 }, { "epoch": 4.613075383589059, "ref_ce_loss": 0.0892929807305336, "step": 13830 }, { "epoch": 4.613075383589059, "loss": 0.3647262454032898, "step": 13830 }, { "ce_loss": 0.04082637280225754, "epoch": 4.613075383589059, "step": 13830 }, { "distill_loss": 0.15222899615764618, "epoch": 4.613075383589059, "step": 13830 }, { "epoch": 4.613075383589059, "ref_ce_loss": 0.08344709873199463, "step": 13830 }, { "epoch": 4.616410940627085, "loss": 0.4776, "step": 13840 }, { "epoch": 4.616410940627085, "grad_norm": 5.2468366622924805, "step": 13840 }, { "epoch": 4.616410940627085, "learning_rate": 0.00012006128702491837, "step": 13840 }, { "epoch": 4.616410940627085, "loss": 0.18527640402317047, "step": 13840 }, { "ce_loss": 0.016801243647933006, "epoch": 4.616410940627085, "step": 13840 }, { "distill_loss": 0.09196959435939789, "epoch": 4.616410940627085, "step": 13840 }, { "epoch": 4.616410940627085, "ref_ce_loss": 0.07647021859884262, "step": 13840 }, { "epoch": 4.616410940627085, "loss": 0.28422296047210693, "step": 13840 }, { "ce_loss": 0.0651751160621643, "epoch": 4.616410940627085, "step": 13840 }, { "distill_loss": 0.09933413565158844, "epoch": 4.616410940627085, "step": 13840 }, { "epoch": 4.616410940627085, "ref_ce_loss": 0.08113130927085876, "step": 13840 }, { "epoch": 4.616410940627085, "loss": 0.5058305263519287, "step": 13840 }, { "ce_loss": 0.08947150409221649, "epoch": 4.616410940627085, "step": 13840 }, { "distill_loss": 0.20171362161636353, "epoch": 4.616410940627085, "step": 13840 }, { "epoch": 4.616410940627085, "ref_ce_loss": 0.10488651692867279, "step": 13840 }, { "epoch": 4.616410940627085, "loss": 0.33015942573547363, "step": 13840 }, { "ce_loss": 0.04778118059039116, "epoch": 4.616410940627085, "step": 13840 }, { "distill_loss": 0.09841583669185638, "epoch": 4.616410940627085, "step": 13840 }, { "epoch": 4.616410940627085, "ref_ce_loss": 0.08348551392555237, "step": 13840 }, { "epoch": 4.61974649766511, "loss": 0.4282, "step": 13850 }, { "epoch": 4.61974649766511, "grad_norm": 3.1201624870300293, "step": 13850 }, { "epoch": 4.61974649766511, "learning_rate": 0.00011986282865490614, "step": 13850 }, { "epoch": 4.61974649766511, "loss": 0.42631927132606506, "step": 13850 }, { "ce_loss": 0.12742312252521515, "epoch": 4.61974649766511, "step": 13850 }, { "distill_loss": 0.159180149435997, "epoch": 4.61974649766511, "step": 13850 }, { "epoch": 4.61974649766511, "ref_ce_loss": 0.10846816748380661, "step": 13850 }, { "epoch": 4.61974649766511, "loss": 0.24269895255565643, "step": 13850 }, { "ce_loss": 0.022357333451509476, "epoch": 4.61974649766511, "step": 13850 }, { "distill_loss": 0.10337289422750473, "epoch": 4.61974649766511, "step": 13850 }, { "epoch": 4.61974649766511, "ref_ce_loss": 0.09756804257631302, "step": 13850 }, { "epoch": 4.61974649766511, "loss": 0.27713820338249207, "step": 13850 }, { "ce_loss": 0.05909983813762665, "epoch": 4.61974649766511, "step": 13850 }, { "distill_loss": 0.1180325597524643, "epoch": 4.61974649766511, "step": 13850 }, { "epoch": 4.61974649766511, "ref_ce_loss": 0.0755968987941742, "step": 13850 }, { "epoch": 4.61974649766511, "loss": 0.4052237868309021, "step": 13850 }, { "ce_loss": 0.05517116189002991, "epoch": 4.61974649766511, "step": 13850 }, { "distill_loss": 0.13782203197479248, "epoch": 4.61974649766511, "step": 13850 }, { "epoch": 4.61974649766511, "ref_ce_loss": 0.08097584545612335, "step": 13850 }, { "epoch": 4.6230820547031355, "loss": 0.421, "step": 13860 }, { "epoch": 4.6230820547031355, "grad_norm": 2.583071708679199, "step": 13860 }, { "epoch": 4.6230820547031355, "learning_rate": 0.00011966442524320619, "step": 13860 }, { "epoch": 4.6230820547031355, "loss": 0.36535245180130005, "step": 13860 }, { "ce_loss": 0.08213835209608078, "epoch": 4.6230820547031355, "step": 13860 }, { "distill_loss": 0.12975214421749115, "epoch": 4.6230820547031355, "step": 13860 }, { "epoch": 4.6230820547031355, "ref_ce_loss": 0.039594732224941254, "step": 13860 }, { "epoch": 4.6230820547031355, "loss": 0.530299723148346, "step": 13860 }, { "ce_loss": 0.12899358570575714, "epoch": 4.6230820547031355, "step": 13860 }, { "distill_loss": 0.2598971128463745, "epoch": 4.6230820547031355, "step": 13860 }, { "epoch": 4.6230820547031355, "ref_ce_loss": 0.09959632903337479, "step": 13860 }, { "epoch": 4.6230820547031355, "loss": 0.3098726272583008, "step": 13860 }, { "ce_loss": 0.07682958990335464, "epoch": 4.6230820547031355, "step": 13860 }, { "distill_loss": 0.16426794230937958, "epoch": 4.6230820547031355, "step": 13860 }, { "epoch": 4.6230820547031355, "ref_ce_loss": 0.06860676407814026, "step": 13860 }, { "epoch": 4.6230820547031355, "loss": 0.2955167293548584, "step": 13860 }, { "ce_loss": 0.06630170345306396, "epoch": 4.6230820547031355, "step": 13860 }, { "distill_loss": 0.13955476880073547, "epoch": 4.6230820547031355, "step": 13860 }, { "epoch": 4.6230820547031355, "ref_ce_loss": 0.08948300033807755, "step": 13860 }, { "epoch": 4.626417611741161, "loss": 0.3961, "step": 13870 }, { "epoch": 4.626417611741161, "grad_norm": 2.267632007598877, "step": 13870 }, { "epoch": 4.626417611741161, "learning_rate": 0.00011946607715162821, "step": 13870 }, { "epoch": 4.626417611741161, "loss": 0.5063121318817139, "step": 13870 }, { "ce_loss": 0.07224904745817184, "epoch": 4.626417611741161, "step": 13870 }, { "distill_loss": 0.21819202601909637, "epoch": 4.626417611741161, "step": 13870 }, { "epoch": 4.626417611741161, "ref_ce_loss": 0.046448878943920135, "step": 13870 }, { "epoch": 4.626417611741161, "loss": 0.33341118693351746, "step": 13870 }, { "ce_loss": 0.0745762288570404, "epoch": 4.626417611741161, "step": 13870 }, { "distill_loss": 0.11459389328956604, "epoch": 4.626417611741161, "step": 13870 }, { "epoch": 4.626417611741161, "ref_ce_loss": 0.09432224929332733, "step": 13870 }, { "epoch": 4.626417611741161, "loss": 0.2235090583562851, "step": 13870 }, { "ce_loss": 0.04365307465195656, "epoch": 4.626417611741161, "step": 13870 }, { "distill_loss": 0.11507236212491989, "epoch": 4.626417611741161, "step": 13870 }, { "epoch": 4.626417611741161, "ref_ce_loss": 0.0628926232457161, "step": 13870 }, { "epoch": 4.626417611741161, "loss": 0.17913806438446045, "step": 13870 }, { "ce_loss": 0.022245345637202263, "epoch": 4.626417611741161, "step": 13870 }, { "distill_loss": 0.0938110202550888, "epoch": 4.626417611741161, "step": 13870 }, { "epoch": 4.626417611741161, "ref_ce_loss": 0.062074512243270874, "step": 13870 }, { "epoch": 4.629753168779186, "loss": 0.4243, "step": 13880 }, { "epoch": 4.629753168779186, "grad_norm": 1.9865474700927734, "step": 13880 }, { "epoch": 4.629753168779186, "learning_rate": 0.00011926778474188093, "step": 13880 }, { "epoch": 4.629753168779186, "loss": 0.5977710485458374, "step": 13880 }, { "ce_loss": 0.1861013025045395, "epoch": 4.629753168779186, "step": 13880 }, { "distill_loss": 0.19886083900928497, "epoch": 4.629753168779186, "step": 13880 }, { "epoch": 4.629753168779186, "ref_ce_loss": 0.12160538136959076, "step": 13880 }, { "epoch": 4.629753168779186, "loss": 0.31760358810424805, "step": 13880 }, { "ce_loss": 0.09331942349672318, "epoch": 4.629753168779186, "step": 13880 }, { "distill_loss": 0.10661468654870987, "epoch": 4.629753168779186, "step": 13880 }, { "epoch": 4.629753168779186, "ref_ce_loss": 0.09620601683855057, "step": 13880 }, { "epoch": 4.629753168779186, "loss": 0.29638510942459106, "step": 13880 }, { "ce_loss": 0.01876196824014187, "epoch": 4.629753168779186, "step": 13880 }, { "distill_loss": 0.13782073557376862, "epoch": 4.629753168779186, "step": 13880 }, { "epoch": 4.629753168779186, "ref_ce_loss": 0.08540710061788559, "step": 13880 }, { "epoch": 4.629753168779186, "loss": 0.3348587453365326, "step": 13880 }, { "ce_loss": 0.12150032818317413, "epoch": 4.629753168779186, "step": 13880 }, { "distill_loss": 0.1310514360666275, "epoch": 4.629753168779186, "step": 13880 }, { "epoch": 4.629753168779186, "ref_ce_loss": 0.05937032774090767, "step": 13880 }, { "epoch": 4.6330887258172115, "loss": 0.4186, "step": 13890 }, { "epoch": 4.6330887258172115, "grad_norm": 2.33272385597229, "step": 13890 }, { "epoch": 4.6330887258172115, "learning_rate": 0.00011906954837557133, "step": 13890 }, { "epoch": 4.6330887258172115, "loss": 0.4339160919189453, "step": 13890 }, { "ce_loss": 0.10280374437570572, "epoch": 4.6330887258172115, "step": 13890 }, { "distill_loss": 0.15689747035503387, "epoch": 4.6330887258172115, "step": 13890 }, { "epoch": 4.6330887258172115, "ref_ce_loss": 0.08752254396677017, "step": 13890 }, { "epoch": 4.6330887258172115, "loss": 0.2364230751991272, "step": 13890 }, { "ce_loss": 0.05747167766094208, "epoch": 4.6330887258172115, "step": 13890 }, { "distill_loss": 0.10423195362091064, "epoch": 4.6330887258172115, "step": 13890 }, { "epoch": 4.6330887258172115, "ref_ce_loss": 0.0538666807115078, "step": 13890 }, { "epoch": 4.6330887258172115, "loss": 0.2561870515346527, "step": 13890 }, { "ce_loss": 0.06521957367658615, "epoch": 4.6330887258172115, "step": 13890 }, { "distill_loss": 0.10261288285255432, "epoch": 4.6330887258172115, "step": 13890 }, { "epoch": 4.6330887258172115, "ref_ce_loss": 0.06124391034245491, "step": 13890 }, { "epoch": 4.6330887258172115, "loss": 0.266074538230896, "step": 13890 }, { "ce_loss": 0.06419584900140762, "epoch": 4.6330887258172115, "step": 13890 }, { "distill_loss": 0.09646572917699814, "epoch": 4.6330887258172115, "step": 13890 }, { "epoch": 4.6330887258172115, "ref_ce_loss": 0.0818057730793953, "step": 13890 }, { "epoch": 4.636424282855237, "loss": 0.4187, "step": 13900 }, { "epoch": 4.636424282855237, "grad_norm": 3.4036307334899902, "step": 13900 }, { "epoch": 4.636424282855237, "learning_rate": 0.00011887136841420444, "step": 13900 }, { "epoch": 4.636424282855237, "loss": 0.7854809761047363, "step": 13900 }, { "ce_loss": 0.1579110324382782, "epoch": 4.636424282855237, "step": 13900 }, { "distill_loss": 0.17435112595558167, "epoch": 4.636424282855237, "step": 13900 }, { "epoch": 4.636424282855237, "ref_ce_loss": 0.07334227114915848, "step": 13900 }, { "epoch": 4.636424282855237, "loss": 0.7152835130691528, "step": 13900 }, { "ce_loss": 0.1721900850534439, "epoch": 4.636424282855237, "step": 13900 }, { "distill_loss": 0.20881494879722595, "epoch": 4.636424282855237, "step": 13900 }, { "epoch": 4.636424282855237, "ref_ce_loss": 0.11643655598163605, "step": 13900 }, { "epoch": 4.636424282855237, "loss": 0.3511710464954376, "step": 13900 }, { "ce_loss": 0.09932536631822586, "epoch": 4.636424282855237, "step": 13900 }, { "distill_loss": 0.13884766399860382, "epoch": 4.636424282855237, "step": 13900 }, { "epoch": 4.636424282855237, "ref_ce_loss": 0.08952175080776215, "step": 13900 }, { "epoch": 4.636424282855237, "loss": 0.6365158557891846, "step": 13900 }, { "ce_loss": 0.19059507548809052, "epoch": 4.636424282855237, "step": 13900 }, { "distill_loss": 0.17020054161548615, "epoch": 4.636424282855237, "step": 13900 }, { "epoch": 4.636424282855237, "ref_ce_loss": 0.09225431084632874, "step": 13900 }, { "epoch": 4.639759839893262, "loss": 0.4603, "step": 13910 }, { "epoch": 4.639759839893262, "grad_norm": 2.318143367767334, "step": 13910 }, { "epoch": 4.639759839893262, "learning_rate": 0.00011867324521918238, "step": 13910 }, { "epoch": 4.639759839893262, "loss": 0.3799995183944702, "step": 13910 }, { "ce_loss": 0.13443692028522491, "epoch": 4.639759839893262, "step": 13910 }, { "distill_loss": 0.17886750400066376, "epoch": 4.639759839893262, "step": 13910 }, { "epoch": 4.639759839893262, "ref_ce_loss": 0.06636130809783936, "step": 13910 }, { "epoch": 4.639759839893262, "loss": 0.4490954875946045, "step": 13910 }, { "ce_loss": 0.09709697216749191, "epoch": 4.639759839893262, "step": 13910 }, { "distill_loss": 0.175789013504982, "epoch": 4.639759839893262, "step": 13910 }, { "epoch": 4.639759839893262, "ref_ce_loss": 0.11594989150762558, "step": 13910 }, { "epoch": 4.639759839893262, "loss": 0.7186691761016846, "step": 13910 }, { "ce_loss": 0.11297119408845901, "epoch": 4.639759839893262, "step": 13910 }, { "distill_loss": 0.15328797698020935, "epoch": 4.639759839893262, "step": 13910 }, { "epoch": 4.639759839893262, "ref_ce_loss": 0.07922860234975815, "step": 13910 }, { "epoch": 4.639759839893262, "loss": 0.4605812728404999, "step": 13910 }, { "ce_loss": 0.12701410055160522, "epoch": 4.639759839893262, "step": 13910 }, { "distill_loss": 0.16070149838924408, "epoch": 4.639759839893262, "step": 13910 }, { "epoch": 4.639759839893262, "ref_ce_loss": 0.0495823509991169, "step": 13910 }, { "epoch": 4.643095396931288, "loss": 0.4562, "step": 13920 }, { "epoch": 4.643095396931288, "grad_norm": 2.51172137260437, "step": 13920 }, { "epoch": 4.643095396931288, "learning_rate": 0.00011847517915180356, "step": 13920 }, { "epoch": 4.643095396931288, "loss": 0.41623979806900024, "step": 13920 }, { "ce_loss": 0.05802667886018753, "epoch": 4.643095396931288, "step": 13920 }, { "distill_loss": 0.14537282288074493, "epoch": 4.643095396931288, "step": 13920 }, { "epoch": 4.643095396931288, "ref_ce_loss": 0.063286691904068, "step": 13920 }, { "epoch": 4.643095396931288, "loss": 0.8246922492980957, "step": 13920 }, { "ce_loss": 0.06929119676351547, "epoch": 4.643095396931288, "step": 13920 }, { "distill_loss": 0.16589587926864624, "epoch": 4.643095396931288, "step": 13920 }, { "epoch": 4.643095396931288, "ref_ce_loss": 0.10522410273551941, "step": 13920 }, { "epoch": 4.643095396931288, "loss": 0.5103774666786194, "step": 13920 }, { "ce_loss": 0.1705876737833023, "epoch": 4.643095396931288, "step": 13920 }, { "distill_loss": 0.15484222769737244, "epoch": 4.643095396931288, "step": 13920 }, { "epoch": 4.643095396931288, "ref_ce_loss": 0.14860162138938904, "step": 13920 }, { "epoch": 4.643095396931288, "loss": 0.46628522872924805, "step": 13920 }, { "ce_loss": 0.0842401385307312, "epoch": 4.643095396931288, "step": 13920 }, { "distill_loss": 0.149397075176239, "epoch": 4.643095396931288, "step": 13920 }, { "epoch": 4.643095396931288, "ref_ce_loss": 0.0805220827460289, "step": 13920 }, { "epoch": 4.646430953969313, "loss": 0.4727, "step": 13930 }, { "epoch": 4.646430953969313, "grad_norm": 2.2505548000335693, "step": 13930 }, { "epoch": 4.646430953969313, "learning_rate": 0.00011827717057326252, "step": 13930 }, { "epoch": 4.646430953969313, "loss": 0.36407792568206787, "step": 13930 }, { "ce_loss": 0.11833661794662476, "epoch": 4.646430953969313, "step": 13930 }, { "distill_loss": 0.15202414989471436, "epoch": 4.646430953969313, "step": 13930 }, { "epoch": 4.646430953969313, "ref_ce_loss": 0.0678495541214943, "step": 13930 }, { "epoch": 4.646430953969313, "loss": 0.3908737003803253, "step": 13930 }, { "ce_loss": 0.10677006840705872, "epoch": 4.646430953969313, "step": 13930 }, { "distill_loss": 0.16376477479934692, "epoch": 4.646430953969313, "step": 13930 }, { "epoch": 4.646430953969313, "ref_ce_loss": 0.11999862641096115, "step": 13930 }, { "epoch": 4.646430953969313, "loss": 0.41768503189086914, "step": 13930 }, { "ce_loss": 0.09405351430177689, "epoch": 4.646430953969313, "step": 13930 }, { "distill_loss": 0.16130514442920685, "epoch": 4.646430953969313, "step": 13930 }, { "epoch": 4.646430953969313, "ref_ce_loss": 0.10390011221170425, "step": 13930 }, { "epoch": 4.646430953969313, "loss": 0.39294177293777466, "step": 13930 }, { "ce_loss": 0.11008800566196442, "epoch": 4.646430953969313, "step": 13930 }, { "distill_loss": 0.15608064830303192, "epoch": 4.646430953969313, "step": 13930 }, { "epoch": 4.646430953969313, "ref_ce_loss": 0.059509534388780594, "step": 13930 }, { "epoch": 4.649766511007338, "loss": 0.3981, "step": 13940 }, { "epoch": 4.649766511007338, "grad_norm": 3.809084177017212, "step": 13940 }, { "epoch": 4.649766511007338, "learning_rate": 0.00011807921984464869, "step": 13940 }, { "epoch": 4.649766511007338, "loss": 0.322648823261261, "step": 13940 }, { "ce_loss": 0.048746369779109955, "epoch": 4.649766511007338, "step": 13940 }, { "distill_loss": 0.12906503677368164, "epoch": 4.649766511007338, "step": 13940 }, { "epoch": 4.649766511007338, "ref_ce_loss": 0.055475905537605286, "step": 13940 }, { "epoch": 4.649766511007338, "loss": 0.3334718942642212, "step": 13940 }, { "ce_loss": 0.07918932288885117, "epoch": 4.649766511007338, "step": 13940 }, { "distill_loss": 0.14949554204940796, "epoch": 4.649766511007338, "step": 13940 }, { "epoch": 4.649766511007338, "ref_ce_loss": 0.03752530738711357, "step": 13940 }, { "epoch": 4.649766511007338, "loss": 0.37703388929367065, "step": 13940 }, { "ce_loss": 0.08964626491069794, "epoch": 4.649766511007338, "step": 13940 }, { "distill_loss": 0.15478254854679108, "epoch": 4.649766511007338, "step": 13940 }, { "epoch": 4.649766511007338, "ref_ce_loss": 0.09836571663618088, "step": 13940 }, { "epoch": 4.649766511007338, "loss": 0.6053097248077393, "step": 13940 }, { "ce_loss": 0.09944993257522583, "epoch": 4.649766511007338, "step": 13940 }, { "distill_loss": 0.18654851615428925, "epoch": 4.649766511007338, "step": 13940 }, { "epoch": 4.649766511007338, "ref_ce_loss": 0.11349543184041977, "step": 13940 }, { "epoch": 4.653102068045364, "loss": 0.4453, "step": 13950 }, { "epoch": 4.653102068045364, "grad_norm": 1.9943000078201294, "step": 13950 }, { "epoch": 4.653102068045364, "learning_rate": 0.00011788132732694608, "step": 13950 }, { "epoch": 4.653102068045364, "loss": 0.6359728574752808, "step": 13950 }, { "ce_loss": 0.07161174714565277, "epoch": 4.653102068045364, "step": 13950 }, { "distill_loss": 0.13512277603149414, "epoch": 4.653102068045364, "step": 13950 }, { "epoch": 4.653102068045364, "ref_ce_loss": 0.08939622342586517, "step": 13950 }, { "epoch": 4.653102068045364, "loss": 0.3288179039955139, "step": 13950 }, { "ce_loss": 0.08766458183526993, "epoch": 4.653102068045364, "step": 13950 }, { "distill_loss": 0.13691724836826324, "epoch": 4.653102068045364, "step": 13950 }, { "epoch": 4.653102068045364, "ref_ce_loss": 0.0851324051618576, "step": 13950 }, { "epoch": 4.653102068045364, "loss": 0.6143372058868408, "step": 13950 }, { "ce_loss": 0.15045179426670074, "epoch": 4.653102068045364, "step": 13950 }, { "distill_loss": 0.1924908459186554, "epoch": 4.653102068045364, "step": 13950 }, { "epoch": 4.653102068045364, "ref_ce_loss": 0.08594503998756409, "step": 13950 }, { "epoch": 4.653102068045364, "loss": 0.533692479133606, "step": 13950 }, { "ce_loss": 0.11740151047706604, "epoch": 4.653102068045364, "step": 13950 }, { "distill_loss": 0.14584587514400482, "epoch": 4.653102068045364, "step": 13950 }, { "epoch": 4.653102068045364, "ref_ce_loss": 0.06598098576068878, "step": 13950 }, { "epoch": 4.656437625083389, "loss": 0.4665, "step": 13960 }, { "epoch": 4.656437625083389, "grad_norm": 4.326551914215088, "step": 13960 }, { "epoch": 4.656437625083389, "learning_rate": 0.00011768349338103273, "step": 13960 }, { "epoch": 4.656437625083389, "loss": 0.38727375864982605, "step": 13960 }, { "ce_loss": 0.05691125616431236, "epoch": 4.656437625083389, "step": 13960 }, { "distill_loss": 0.12060718238353729, "epoch": 4.656437625083389, "step": 13960 }, { "epoch": 4.656437625083389, "ref_ce_loss": 0.0724659189581871, "step": 13960 }, { "epoch": 4.656437625083389, "loss": 0.4936906397342682, "step": 13960 }, { "ce_loss": 0.12200823426246643, "epoch": 4.656437625083389, "step": 13960 }, { "distill_loss": 0.1445818841457367, "epoch": 4.656437625083389, "step": 13960 }, { "epoch": 4.656437625083389, "ref_ce_loss": 0.09555574506521225, "step": 13960 }, { "epoch": 4.656437625083389, "loss": 0.5360593795776367, "step": 13960 }, { "ce_loss": 0.12914180755615234, "epoch": 4.656437625083389, "step": 13960 }, { "distill_loss": 0.19759999215602875, "epoch": 4.656437625083389, "step": 13960 }, { "epoch": 4.656437625083389, "ref_ce_loss": 0.1032886877655983, "step": 13960 }, { "epoch": 4.656437625083389, "loss": 0.4051770865917206, "step": 13960 }, { "ce_loss": 0.1243601068854332, "epoch": 4.656437625083389, "step": 13960 }, { "distill_loss": 0.13619369268417358, "epoch": 4.656437625083389, "step": 13960 }, { "epoch": 4.656437625083389, "ref_ce_loss": 0.10029909014701843, "step": 13960 }, { "epoch": 4.659773182121414, "loss": 0.4264, "step": 13970 }, { "epoch": 4.659773182121414, "grad_norm": 2.531710624694824, "step": 13970 }, { "epoch": 4.659773182121414, "learning_rate": 0.0001174857183676796, "step": 13970 }, { "epoch": 4.659773182121414, "loss": 0.3981442451477051, "step": 13970 }, { "ce_loss": 0.12004105746746063, "epoch": 4.659773182121414, "step": 13970 }, { "distill_loss": 0.1404426544904709, "epoch": 4.659773182121414, "step": 13970 }, { "epoch": 4.659773182121414, "ref_ce_loss": 0.07255041599273682, "step": 13970 }, { "epoch": 4.659773182121414, "loss": 0.3417322635650635, "step": 13970 }, { "ce_loss": 0.12429323047399521, "epoch": 4.659773182121414, "step": 13970 }, { "distill_loss": 0.13273431360721588, "epoch": 4.659773182121414, "step": 13970 }, { "epoch": 4.659773182121414, "ref_ce_loss": 0.08454957604408264, "step": 13970 }, { "epoch": 4.659773182121414, "loss": 0.3531622886657715, "step": 13970 }, { "ce_loss": 0.0692068412899971, "epoch": 4.659773182121414, "step": 13970 }, { "distill_loss": 0.1651550829410553, "epoch": 4.659773182121414, "step": 13970 }, { "epoch": 4.659773182121414, "ref_ce_loss": 0.11834469437599182, "step": 13970 }, { "epoch": 4.659773182121414, "loss": 0.382656991481781, "step": 13970 }, { "ce_loss": 0.028870120644569397, "epoch": 4.659773182121414, "step": 13970 }, { "distill_loss": 0.10546714067459106, "epoch": 4.659773182121414, "step": 13970 }, { "epoch": 4.659773182121414, "ref_ce_loss": 0.06734929233789444, "step": 13970 }, { "epoch": 4.66310873915944, "loss": 0.4685, "step": 13980 }, { "epoch": 4.66310873915944, "grad_norm": 4.019238471984863, "step": 13980 }, { "epoch": 4.66310873915944, "learning_rate": 0.00011728800264755034, "step": 13980 }, { "epoch": 4.66310873915944, "loss": 0.2976240813732147, "step": 13980 }, { "ce_loss": 0.06153253838419914, "epoch": 4.66310873915944, "step": 13980 }, { "distill_loss": 0.131220281124115, "epoch": 4.66310873915944, "step": 13980 }, { "epoch": 4.66310873915944, "ref_ce_loss": 0.06489714235067368, "step": 13980 }, { "epoch": 4.66310873915944, "loss": 0.38370513916015625, "step": 13980 }, { "ce_loss": 0.10629656910896301, "epoch": 4.66310873915944, "step": 13980 }, { "distill_loss": 0.14813432097434998, "epoch": 4.66310873915944, "step": 13980 }, { "epoch": 4.66310873915944, "ref_ce_loss": 0.10886724293231964, "step": 13980 }, { "epoch": 4.66310873915944, "loss": 0.29492947459220886, "step": 13980 }, { "ce_loss": 0.05098975449800491, "epoch": 4.66310873915944, "step": 13980 }, { "distill_loss": 0.1699628233909607, "epoch": 4.66310873915944, "step": 13980 }, { "epoch": 4.66310873915944, "ref_ce_loss": 0.07380944490432739, "step": 13980 }, { "epoch": 4.66310873915944, "loss": 0.3338431417942047, "step": 13980 }, { "ce_loss": 0.08042466640472412, "epoch": 4.66310873915944, "step": 13980 }, { "distill_loss": 0.11967265605926514, "epoch": 4.66310873915944, "step": 13980 }, { "epoch": 4.66310873915944, "ref_ce_loss": 0.06992650032043457, "step": 13980 }, { "epoch": 4.666444296197465, "loss": 0.4189, "step": 13990 }, { "epoch": 4.666444296197465, "grad_norm": 2.2785110473632812, "step": 13990 }, { "epoch": 4.666444296197465, "learning_rate": 0.00011709034658120039, "step": 13990 }, { "epoch": 4.666444296197465, "loss": 0.27386167645454407, "step": 13990 }, { "ce_loss": 0.0423194095492363, "epoch": 4.666444296197465, "step": 13990 }, { "distill_loss": 0.12726286053657532, "epoch": 4.666444296197465, "step": 13990 }, { "epoch": 4.666444296197465, "ref_ce_loss": 0.07899350672960281, "step": 13990 }, { "epoch": 4.666444296197465, "loss": 0.506066083908081, "step": 13990 }, { "ce_loss": 0.09570929408073425, "epoch": 4.666444296197465, "step": 13990 }, { "distill_loss": 0.14508602023124695, "epoch": 4.666444296197465, "step": 13990 }, { "epoch": 4.666444296197465, "ref_ce_loss": 0.10210248827934265, "step": 13990 }, { "epoch": 4.666444296197465, "loss": 0.3897585868835449, "step": 13990 }, { "ce_loss": 0.09135571867227554, "epoch": 4.666444296197465, "step": 13990 }, { "distill_loss": 0.24521411955356598, "epoch": 4.666444296197465, "step": 13990 }, { "epoch": 4.666444296197465, "ref_ce_loss": 0.053022027015686035, "step": 13990 }, { "epoch": 4.666444296197465, "loss": 0.4636989235877991, "step": 13990 }, { "ce_loss": 0.10903631150722504, "epoch": 4.666444296197465, "step": 13990 }, { "distill_loss": 0.183272123336792, "epoch": 4.666444296197465, "step": 13990 }, { "epoch": 4.666444296197465, "ref_ce_loss": 0.11822542548179626, "step": 13990 }, { "epoch": 4.66977985323549, "loss": 0.4258, "step": 14000 }, { "epoch": 4.66977985323549, "grad_norm": 2.021631956100464, "step": 14000 }, { "epoch": 4.66977985323549, "learning_rate": 0.00011689275052907649, "step": 14000 }, { "epoch": 4.66977985323549, "loss": 0.4199639856815338, "step": 14000 }, { "ce_loss": 0.09831275790929794, "epoch": 4.66977985323549, "step": 14000 }, { "distill_loss": 0.17195957899093628, "epoch": 4.66977985323549, "step": 14000 }, { "epoch": 4.66977985323549, "ref_ce_loss": 0.06019444018602371, "step": 14000 }, { "epoch": 4.66977985323549, "loss": 0.34083402156829834, "step": 14000 }, { "ce_loss": 0.13644544780254364, "epoch": 4.66977985323549, "step": 14000 }, { "distill_loss": 0.1311686784029007, "epoch": 4.66977985323549, "step": 14000 }, { "epoch": 4.66977985323549, "ref_ce_loss": 0.0720614343881607, "step": 14000 }, { "epoch": 4.66977985323549, "loss": 0.48071956634521484, "step": 14000 }, { "ce_loss": 0.1457836627960205, "epoch": 4.66977985323549, "step": 14000 }, { "distill_loss": 0.20094430446624756, "epoch": 4.66977985323549, "step": 14000 }, { "epoch": 4.66977985323549, "ref_ce_loss": 0.07603190094232559, "step": 14000 }, { "epoch": 4.66977985323549, "loss": 0.34649357199668884, "step": 14000 }, { "ce_loss": 0.04942803084850311, "epoch": 4.66977985323549, "step": 14000 }, { "distill_loss": 0.10252837836742401, "epoch": 4.66977985323549, "step": 14000 }, { "epoch": 4.66977985323549, "ref_ce_loss": 0.07332263886928558, "step": 14000 }, { "epoch": 4.673115410273516, "loss": 0.4056, "step": 14010 }, { "epoch": 4.673115410273516, "grad_norm": 2.8276865482330322, "step": 14010 }, { "epoch": 4.673115410273516, "learning_rate": 0.00011669521485151591, "step": 14010 }, { "epoch": 4.673115410273516, "loss": 0.6654629707336426, "step": 14010 }, { "ce_loss": 0.10808245092630386, "epoch": 4.673115410273516, "step": 14010 }, { "distill_loss": 0.11504686623811722, "epoch": 4.673115410273516, "step": 14010 }, { "epoch": 4.673115410273516, "ref_ce_loss": 0.07503039389848709, "step": 14010 }, { "epoch": 4.673115410273516, "loss": 0.6763304471969604, "step": 14010 }, { "ce_loss": 0.1163989007472992, "epoch": 4.673115410273516, "step": 14010 }, { "distill_loss": 0.19686204195022583, "epoch": 4.673115410273516, "step": 14010 }, { "epoch": 4.673115410273516, "ref_ce_loss": 0.09545636177062988, "step": 14010 }, { "epoch": 4.673115410273516, "loss": 0.2588968276977539, "step": 14010 }, { "ce_loss": 0.04675479605793953, "epoch": 4.673115410273516, "step": 14010 }, { "distill_loss": 0.13797177374362946, "epoch": 4.673115410273516, "step": 14010 }, { "epoch": 4.673115410273516, "ref_ce_loss": 0.0738506093621254, "step": 14010 }, { "epoch": 4.673115410273516, "loss": 0.3704875707626343, "step": 14010 }, { "ce_loss": 0.12819942831993103, "epoch": 4.673115410273516, "step": 14010 }, { "distill_loss": 0.11613549292087555, "epoch": 4.673115410273516, "step": 14010 }, { "epoch": 4.673115410273516, "ref_ce_loss": 0.09399432688951492, "step": 14010 }, { "epoch": 4.676450967311541, "loss": 0.4743, "step": 14020 }, { "epoch": 4.676450967311541, "grad_norm": 4.23807954788208, "step": 14020 }, { "epoch": 4.676450967311541, "learning_rate": 0.00011649773990874573, "step": 14020 }, { "epoch": 4.676450967311541, "loss": 0.3631729781627655, "step": 14020 }, { "ce_loss": 0.09874864667654037, "epoch": 4.676450967311541, "step": 14020 }, { "distill_loss": 0.1832832247018814, "epoch": 4.676450967311541, "step": 14020 }, { "epoch": 4.676450967311541, "ref_ce_loss": 0.08077353239059448, "step": 14020 }, { "epoch": 4.676450967311541, "loss": 0.3315301835536957, "step": 14020 }, { "ce_loss": 0.030873116105794907, "epoch": 4.676450967311541, "step": 14020 }, { "distill_loss": 0.16136936843395233, "epoch": 4.676450967311541, "step": 14020 }, { "epoch": 4.676450967311541, "ref_ce_loss": 0.06957585364580154, "step": 14020 }, { "epoch": 4.676450967311541, "loss": 0.5829925537109375, "step": 14020 }, { "ce_loss": 0.1614890992641449, "epoch": 4.676450967311541, "step": 14020 }, { "distill_loss": 0.19390279054641724, "epoch": 4.676450967311541, "step": 14020 }, { "epoch": 4.676450967311541, "ref_ce_loss": 0.09260150045156479, "step": 14020 }, { "epoch": 4.676450967311541, "loss": 0.47367650270462036, "step": 14020 }, { "ce_loss": 0.06276272982358932, "epoch": 4.676450967311541, "step": 14020 }, { "distill_loss": 0.13485144078731537, "epoch": 4.676450967311541, "step": 14020 }, { "epoch": 4.676450967311541, "ref_ce_loss": 0.09369198977947235, "step": 14020 }, { "epoch": 4.679786524349566, "loss": 0.456, "step": 14030 }, { "epoch": 4.679786524349566, "grad_norm": 3.837237596511841, "step": 14030 }, { "epoch": 4.679786524349566, "learning_rate": 0.0001163003260608824, "step": 14030 }, { "epoch": 4.679786524349566, "loss": 0.31302571296691895, "step": 14030 }, { "ce_loss": 0.09062153846025467, "epoch": 4.679786524349566, "step": 14030 }, { "distill_loss": 0.10657159984111786, "epoch": 4.679786524349566, "step": 14030 }, { "epoch": 4.679786524349566, "ref_ce_loss": 0.08325086534023285, "step": 14030 }, { "epoch": 4.679786524349566, "loss": 0.7584108114242554, "step": 14030 }, { "ce_loss": 0.09911534935235977, "epoch": 4.679786524349566, "step": 14030 }, { "distill_loss": 0.12623099982738495, "epoch": 4.679786524349566, "step": 14030 }, { "epoch": 4.679786524349566, "ref_ce_loss": 0.14725255966186523, "step": 14030 }, { "epoch": 4.679786524349566, "loss": 0.5008531808853149, "step": 14030 }, { "ce_loss": 0.10759527236223221, "epoch": 4.679786524349566, "step": 14030 }, { "distill_loss": 0.14299361407756805, "epoch": 4.679786524349566, "step": 14030 }, { "epoch": 4.679786524349566, "ref_ce_loss": 0.11691004782915115, "step": 14030 }, { "epoch": 4.679786524349566, "loss": 0.2524974048137665, "step": 14030 }, { "ce_loss": 0.03561868518590927, "epoch": 4.679786524349566, "step": 14030 }, { "distill_loss": 0.13155224919319153, "epoch": 4.679786524349566, "step": 14030 }, { "epoch": 4.679786524349566, "ref_ce_loss": 0.04909590631723404, "step": 14030 }, { "epoch": 4.683122081387592, "loss": 0.4802, "step": 14040 }, { "epoch": 4.683122081387592, "grad_norm": 2.0721302032470703, "step": 14040 }, { "epoch": 4.683122081387592, "learning_rate": 0.00011610297366793094, "step": 14040 }, { "epoch": 4.683122081387592, "loss": 0.22465814650058746, "step": 14040 }, { "ce_loss": 0.06050679087638855, "epoch": 4.683122081387592, "step": 14040 }, { "distill_loss": 0.08898370712995529, "epoch": 4.683122081387592, "step": 14040 }, { "epoch": 4.683122081387592, "ref_ce_loss": 0.07474206387996674, "step": 14040 }, { "epoch": 4.683122081387592, "loss": 0.8398646116256714, "step": 14040 }, { "ce_loss": 0.08319870382547379, "epoch": 4.683122081387592, "step": 14040 }, { "distill_loss": 0.1809157282114029, "epoch": 4.683122081387592, "step": 14040 }, { "epoch": 4.683122081387592, "ref_ce_loss": 0.08680960536003113, "step": 14040 }, { "epoch": 4.683122081387592, "loss": 0.5027204155921936, "step": 14040 }, { "ce_loss": 0.1770513951778412, "epoch": 4.683122081387592, "step": 14040 }, { "distill_loss": 0.20788423717021942, "epoch": 4.683122081387592, "step": 14040 }, { "epoch": 4.683122081387592, "ref_ce_loss": 0.09297391772270203, "step": 14040 }, { "epoch": 4.683122081387592, "loss": 0.30045297741889954, "step": 14040 }, { "ce_loss": 0.10085663199424744, "epoch": 4.683122081387592, "step": 14040 }, { "distill_loss": 0.09326339513063431, "epoch": 4.683122081387592, "step": 14040 }, { "epoch": 4.683122081387592, "ref_ce_loss": 0.10611985623836517, "step": 14040 }, { "epoch": 4.686457638425617, "loss": 0.4225, "step": 14050 }, { "epoch": 4.686457638425617, "grad_norm": 2.194854736328125, "step": 14050 }, { "epoch": 4.686457638425617, "learning_rate": 0.00011590568308978418, "step": 14050 }, { "epoch": 4.686457638425617, "loss": 0.2557274401187897, "step": 14050 }, { "ce_loss": 0.0705738216638565, "epoch": 4.686457638425617, "step": 14050 }, { "distill_loss": 0.09358223527669907, "epoch": 4.686457638425617, "step": 14050 }, { "epoch": 4.686457638425617, "ref_ce_loss": 0.046359218657016754, "step": 14050 }, { "epoch": 4.686457638425617, "loss": 0.3328011631965637, "step": 14050 }, { "ce_loss": 0.03167080879211426, "epoch": 4.686457638425617, "step": 14050 }, { "distill_loss": 0.17716750502586365, "epoch": 4.686457638425617, "step": 14050 }, { "epoch": 4.686457638425617, "ref_ce_loss": 0.08014225959777832, "step": 14050 }, { "epoch": 4.686457638425617, "loss": 0.5323995351791382, "step": 14050 }, { "ce_loss": 0.14414207637310028, "epoch": 4.686457638425617, "step": 14050 }, { "distill_loss": 0.15471740067005157, "epoch": 4.686457638425617, "step": 14050 }, { "epoch": 4.686457638425617, "ref_ce_loss": 0.08761541545391083, "step": 14050 }, { "epoch": 4.686457638425617, "loss": 0.3618018627166748, "step": 14050 }, { "ce_loss": 0.06889332085847855, "epoch": 4.686457638425617, "step": 14050 }, { "distill_loss": 0.14258632063865662, "epoch": 4.686457638425617, "step": 14050 }, { "epoch": 4.686457638425617, "ref_ce_loss": 0.05323160067200661, "step": 14050 }, { "epoch": 4.6897931954636425, "loss": 0.4491, "step": 14060 }, { "epoch": 4.6897931954636425, "grad_norm": 2.5217649936676025, "step": 14060 }, { "epoch": 4.6897931954636425, "learning_rate": 0.0001157084546862224, "step": 14060 }, { "epoch": 4.6897931954636425, "loss": 0.4820738434791565, "step": 14060 }, { "ce_loss": 0.09973058849573135, "epoch": 4.6897931954636425, "step": 14060 }, { "distill_loss": 0.13066692650318146, "epoch": 4.6897931954636425, "step": 14060 }, { "epoch": 4.6897931954636425, "ref_ce_loss": 0.10295473039150238, "step": 14060 }, { "epoch": 4.6897931954636425, "loss": 0.9242709875106812, "step": 14060 }, { "ce_loss": 0.12287642806768417, "epoch": 4.6897931954636425, "step": 14060 }, { "distill_loss": 0.14985498785972595, "epoch": 4.6897931954636425, "step": 14060 }, { "epoch": 4.6897931954636425, "ref_ce_loss": 0.09447828680276871, "step": 14060 }, { "epoch": 4.6897931954636425, "loss": 0.6546158790588379, "step": 14060 }, { "ce_loss": 0.15501223504543304, "epoch": 4.6897931954636425, "step": 14060 }, { "distill_loss": 0.12218710035085678, "epoch": 4.6897931954636425, "step": 14060 }, { "epoch": 4.6897931954636425, "ref_ce_loss": 0.10068108886480331, "step": 14060 }, { "epoch": 4.6897931954636425, "loss": 0.3825370669364929, "step": 14060 }, { "ce_loss": 0.1286890059709549, "epoch": 4.6897931954636425, "step": 14060 }, { "distill_loss": 0.15739960968494415, "epoch": 4.6897931954636425, "step": 14060 }, { "epoch": 4.6897931954636425, "ref_ce_loss": 0.09618832170963287, "step": 14060 }, { "epoch": 4.693128752501668, "loss": 0.424, "step": 14070 }, { "epoch": 4.693128752501668, "grad_norm": 2.1217305660247803, "step": 14070 }, { "epoch": 4.693128752501668, "learning_rate": 0.00011551128881691231, "step": 14070 }, { "epoch": 4.693128752501668, "loss": 0.41536468267440796, "step": 14070 }, { "ce_loss": 0.14819218218326569, "epoch": 4.693128752501668, "step": 14070 }, { "distill_loss": 0.14639030396938324, "epoch": 4.693128752501668, "step": 14070 }, { "epoch": 4.693128752501668, "ref_ce_loss": 0.12024623900651932, "step": 14070 }, { "epoch": 4.693128752501668, "loss": 0.23171193897724152, "step": 14070 }, { "ce_loss": 0.05471116676926613, "epoch": 4.693128752501668, "step": 14070 }, { "distill_loss": 0.11721976846456528, "epoch": 4.693128752501668, "step": 14070 }, { "epoch": 4.693128752501668, "ref_ce_loss": 0.0596269890666008, "step": 14070 }, { "epoch": 4.693128752501668, "loss": 0.8457027673721313, "step": 14070 }, { "ce_loss": 0.13278742134571075, "epoch": 4.693128752501668, "step": 14070 }, { "distill_loss": 0.216340571641922, "epoch": 4.693128752501668, "step": 14070 }, { "epoch": 4.693128752501668, "ref_ce_loss": 0.11654487252235413, "step": 14070 }, { "epoch": 4.693128752501668, "loss": 0.5509470701217651, "step": 14070 }, { "ce_loss": 0.08814974129199982, "epoch": 4.693128752501668, "step": 14070 }, { "distill_loss": 0.17692141234874725, "epoch": 4.693128752501668, "step": 14070 }, { "epoch": 4.693128752501668, "ref_ce_loss": 0.1137252077460289, "step": 14070 }, { "epoch": 4.696464309539693, "loss": 0.4754, "step": 14080 }, { "epoch": 4.696464309539693, "grad_norm": 3.004528045654297, "step": 14080 }, { "epoch": 4.696464309539693, "learning_rate": 0.00011531418584140673, "step": 14080 }, { "epoch": 4.696464309539693, "loss": 0.479631245136261, "step": 14080 }, { "ce_loss": 0.10905808955430984, "epoch": 4.696464309539693, "step": 14080 }, { "distill_loss": 0.14824768900871277, "epoch": 4.696464309539693, "step": 14080 }, { "epoch": 4.696464309539693, "ref_ce_loss": 0.10647785663604736, "step": 14080 }, { "epoch": 4.696464309539693, "loss": 0.35922229290008545, "step": 14080 }, { "ce_loss": 0.05895942449569702, "epoch": 4.696464309539693, "step": 14080 }, { "distill_loss": 0.14783093333244324, "epoch": 4.696464309539693, "step": 14080 }, { "epoch": 4.696464309539693, "ref_ce_loss": 0.07992715388536453, "step": 14080 }, { "epoch": 4.696464309539693, "loss": 0.33700183033943176, "step": 14080 }, { "ce_loss": 0.06836721301078796, "epoch": 4.696464309539693, "step": 14080 }, { "distill_loss": 0.17993780970573425, "epoch": 4.696464309539693, "step": 14080 }, { "epoch": 4.696464309539693, "ref_ce_loss": 0.06357478350400925, "step": 14080 }, { "epoch": 4.696464309539693, "loss": 0.23770642280578613, "step": 14080 }, { "ce_loss": 0.061131224036216736, "epoch": 4.696464309539693, "step": 14080 }, { "distill_loss": 0.11508830636739731, "epoch": 4.696464309539693, "step": 14080 }, { "epoch": 4.696464309539693, "ref_ce_loss": 0.051302142441272736, "step": 14080 }, { "epoch": 4.6997998665777185, "loss": 0.4443, "step": 14090 }, { "epoch": 4.6997998665777185, "grad_norm": 2.966629981994629, "step": 14090 }, { "epoch": 4.6997998665777185, "learning_rate": 0.00011511714611914378, "step": 14090 }, { "epoch": 4.6997998665777185, "loss": 0.27354520559310913, "step": 14090 }, { "ce_loss": 0.017260253429412842, "epoch": 4.6997998665777185, "step": 14090 }, { "distill_loss": 0.0955350324511528, "epoch": 4.6997998665777185, "step": 14090 }, { "epoch": 4.6997998665777185, "ref_ce_loss": 0.05125044286251068, "step": 14090 }, { "epoch": 4.6997998665777185, "loss": 0.453208863735199, "step": 14090 }, { "ce_loss": 0.058098290115594864, "epoch": 4.6997998665777185, "step": 14090 }, { "distill_loss": 0.13044509291648865, "epoch": 4.6997998665777185, "step": 14090 }, { "epoch": 4.6997998665777185, "ref_ce_loss": 0.07986553758382797, "step": 14090 }, { "epoch": 4.6997998665777185, "loss": 0.38690540194511414, "step": 14090 }, { "ce_loss": 0.13304416835308075, "epoch": 4.6997998665777185, "step": 14090 }, { "distill_loss": 0.1314799189567566, "epoch": 4.6997998665777185, "step": 14090 }, { "epoch": 4.6997998665777185, "ref_ce_loss": 0.08153283596038818, "step": 14090 }, { "epoch": 4.6997998665777185, "loss": 0.35225972533226013, "step": 14090 }, { "ce_loss": 0.08277490735054016, "epoch": 4.6997998665777185, "step": 14090 }, { "distill_loss": 0.1228342354297638, "epoch": 4.6997998665777185, "step": 14090 }, { "epoch": 4.6997998665777185, "ref_ce_loss": 0.09828595072031021, "step": 14090 }, { "epoch": 4.703135423615744, "loss": 0.4201, "step": 14100 }, { "epoch": 4.703135423615744, "grad_norm": 2.899693250656128, "step": 14100 }, { "epoch": 4.703135423615744, "learning_rate": 0.00011492017000944613, "step": 14100 }, { "epoch": 4.703135423615744, "loss": 0.38401275873184204, "step": 14100 }, { "ce_loss": 0.10745421797037125, "epoch": 4.703135423615744, "step": 14100 }, { "distill_loss": 0.17753872275352478, "epoch": 4.703135423615744, "step": 14100 }, { "epoch": 4.703135423615744, "ref_ce_loss": 0.09878189116716385, "step": 14100 }, { "epoch": 4.703135423615744, "loss": 0.41860488057136536, "step": 14100 }, { "ce_loss": 0.1281246542930603, "epoch": 4.703135423615744, "step": 14100 }, { "distill_loss": 0.10887651890516281, "epoch": 4.703135423615744, "step": 14100 }, { "epoch": 4.703135423615744, "ref_ce_loss": 0.07744525372982025, "step": 14100 }, { "epoch": 4.703135423615744, "loss": 0.5172537565231323, "step": 14100 }, { "ce_loss": 0.12804587185382843, "epoch": 4.703135423615744, "step": 14100 }, { "distill_loss": 0.12572410702705383, "epoch": 4.703135423615744, "step": 14100 }, { "epoch": 4.703135423615744, "ref_ce_loss": 0.0798870399594307, "step": 14100 }, { "epoch": 4.703135423615744, "loss": 0.3483532965183258, "step": 14100 }, { "ce_loss": 0.06657369434833527, "epoch": 4.703135423615744, "step": 14100 }, { "distill_loss": 0.11907947063446045, "epoch": 4.703135423615744, "step": 14100 }, { "epoch": 4.703135423615744, "ref_ce_loss": 0.064053475856781, "step": 14100 }, { "epoch": 4.706470980653769, "loss": 0.4565, "step": 14110 }, { "epoch": 4.706470980653769, "grad_norm": 2.73425555229187, "step": 14110 }, { "epoch": 4.706470980653769, "learning_rate": 0.00011472325787152053, "step": 14110 }, { "epoch": 4.706470980653769, "loss": 0.37849685549736023, "step": 14110 }, { "ce_loss": 0.09926678240299225, "epoch": 4.706470980653769, "step": 14110 }, { "distill_loss": 0.1470641791820526, "epoch": 4.706470980653769, "step": 14110 }, { "epoch": 4.706470980653769, "ref_ce_loss": 0.05578787997364998, "step": 14110 }, { "epoch": 4.706470980653769, "loss": 0.33840328454971313, "step": 14110 }, { "ce_loss": 0.06550583243370056, "epoch": 4.706470980653769, "step": 14110 }, { "distill_loss": 0.14875485002994537, "epoch": 4.706470980653769, "step": 14110 }, { "epoch": 4.706470980653769, "ref_ce_loss": 0.05572007596492767, "step": 14110 }, { "epoch": 4.706470980653769, "loss": 0.275194376707077, "step": 14110 }, { "ce_loss": 0.018452441319823265, "epoch": 4.706470980653769, "step": 14110 }, { "distill_loss": 0.08852989971637726, "epoch": 4.706470980653769, "step": 14110 }, { "epoch": 4.706470980653769, "ref_ce_loss": 0.032092366367578506, "step": 14110 }, { "epoch": 4.706470980653769, "loss": 0.39704009890556335, "step": 14110 }, { "ce_loss": 0.07161843031644821, "epoch": 4.706470980653769, "step": 14110 }, { "distill_loss": 0.17728634178638458, "epoch": 4.706470980653769, "step": 14110 }, { "epoch": 4.706470980653769, "ref_ce_loss": 0.10999871045351028, "step": 14110 }, { "epoch": 4.709806537691795, "loss": 0.4662, "step": 14120 }, { "epoch": 4.709806537691795, "grad_norm": 10.664507865905762, "step": 14120 }, { "epoch": 4.709806537691795, "learning_rate": 0.000114526410064457, "step": 14120 }, { "epoch": 4.709806537691795, "loss": 0.25680869817733765, "step": 14120 }, { "ce_loss": 0.07057490199804306, "epoch": 4.709806537691795, "step": 14120 }, { "distill_loss": 0.12687957286834717, "epoch": 4.709806537691795, "step": 14120 }, { "epoch": 4.709806537691795, "ref_ce_loss": 0.05930490419268608, "step": 14120 }, { "epoch": 4.709806537691795, "loss": 0.3404933512210846, "step": 14120 }, { "ce_loss": 0.07909941673278809, "epoch": 4.709806537691795, "step": 14120 }, { "distill_loss": 0.11669367551803589, "epoch": 4.709806537691795, "step": 14120 }, { "epoch": 4.709806537691795, "ref_ce_loss": 0.08591524511575699, "step": 14120 }, { "epoch": 4.709806537691795, "loss": 0.39324355125427246, "step": 14120 }, { "ce_loss": 0.0727115124464035, "epoch": 4.709806537691795, "step": 14120 }, { "distill_loss": 0.18177631497383118, "epoch": 4.709806537691795, "step": 14120 }, { "epoch": 4.709806537691795, "ref_ce_loss": 0.06662489473819733, "step": 14120 }, { "epoch": 4.709806537691795, "loss": 0.2780805230140686, "step": 14120 }, { "ce_loss": 0.07770053297281265, "epoch": 4.709806537691795, "step": 14120 }, { "distill_loss": 0.11451657861471176, "epoch": 4.709806537691795, "step": 14120 }, { "epoch": 4.709806537691795, "ref_ce_loss": 0.08560647070407867, "step": 14120 }, { "epoch": 4.71314209472982, "loss": 1.3647, "step": 14130 }, { "epoch": 4.71314209472982, "grad_norm": 11.033317565917969, "step": 14130 }, { "epoch": 4.71314209472982, "learning_rate": 0.00011432962694722833, "step": 14130 }, { "epoch": 4.71314209472982, "loss": 2.30210542678833, "step": 14130 }, { "ce_loss": 1.3777077198028564, "epoch": 4.71314209472982, "step": 14130 }, { "distill_loss": 0.17551563680171967, "epoch": 4.71314209472982, "step": 14130 }, { "epoch": 4.71314209472982, "ref_ce_loss": 0.6524390578269958, "step": 14130 }, { "epoch": 4.71314209472982, "loss": 2.1025567054748535, "step": 14130 }, { "ce_loss": 1.2161785364151, "epoch": 4.71314209472982, "step": 14130 }, { "distill_loss": 0.14761461317539215, "epoch": 4.71314209472982, "step": 14130 }, { "epoch": 4.71314209472982, "ref_ce_loss": 0.6293615102767944, "step": 14130 }, { "epoch": 4.71314209472982, "loss": 1.9151501655578613, "step": 14130 }, { "ce_loss": 1.1945534944534302, "epoch": 4.71314209472982, "step": 14130 }, { "distill_loss": 0.1268174946308136, "epoch": 4.71314209472982, "step": 14130 }, { "epoch": 4.71314209472982, "ref_ce_loss": 0.5792096257209778, "step": 14130 }, { "epoch": 4.71314209472982, "loss": 2.032562494277954, "step": 14130 }, { "ce_loss": 1.1563758850097656, "epoch": 4.71314209472982, "step": 14130 }, { "distill_loss": 0.15413472056388855, "epoch": 4.71314209472982, "step": 14130 }, { "epoch": 4.71314209472982, "ref_ce_loss": 0.6570587158203125, "step": 14130 }, { "epoch": 4.716477651767845, "loss": 1.1888, "step": 14140 }, { "epoch": 4.716477651767845, "grad_norm": 279.4869689941406, "step": 14140 }, { "epoch": 4.716477651767845, "learning_rate": 0.00011413290887868933, "step": 14140 }, { "epoch": 4.716477651767845, "loss": 0.9591177105903625, "step": 14140 }, { "ce_loss": 0.3586890697479248, "epoch": 4.716477651767845, "step": 14140 }, { "distill_loss": 0.13500162959098816, "epoch": 4.716477651767845, "step": 14140 }, { "epoch": 4.716477651767845, "ref_ce_loss": 0.27306056022644043, "step": 14140 }, { "epoch": 4.716477651767845, "loss": 0.5405594110488892, "step": 14140 }, { "ce_loss": 0.20202241837978363, "epoch": 4.716477651767845, "step": 14140 }, { "distill_loss": 0.141360804438591, "epoch": 4.716477651767845, "step": 14140 }, { "epoch": 4.716477651767845, "ref_ce_loss": 0.1429208666086197, "step": 14140 }, { "epoch": 4.716477651767845, "loss": 0.5801871418952942, "step": 14140 }, { "ce_loss": 0.21308133006095886, "epoch": 4.716477651767845, "step": 14140 }, { "distill_loss": 0.15593229234218597, "epoch": 4.716477651767845, "step": 14140 }, { "epoch": 4.716477651767845, "ref_ce_loss": 0.16659733653068542, "step": 14140 }, { "epoch": 4.716477651767845, "loss": 0.8410604000091553, "step": 14140 }, { "ce_loss": 0.41682738065719604, "epoch": 4.716477651767845, "step": 14140 }, { "distill_loss": 0.14686113595962524, "epoch": 4.716477651767845, "step": 14140 }, { "epoch": 4.716477651767845, "ref_ce_loss": 0.2372933030128479, "step": 14140 }, { "epoch": 4.719813208805871, "loss": 0.6165, "step": 14150 }, { "epoch": 4.719813208805871, "grad_norm": 2.762190818786621, "step": 14150 }, { "epoch": 4.719813208805871, "learning_rate": 0.00011393625621757609, "step": 14150 }, { "epoch": 4.719813208805871, "loss": 0.39677220582962036, "step": 14150 }, { "ce_loss": 0.14377433061599731, "epoch": 4.719813208805871, "step": 14150 }, { "distill_loss": 0.12296594679355621, "epoch": 4.719813208805871, "step": 14150 }, { "epoch": 4.719813208805871, "ref_ce_loss": 0.09659332036972046, "step": 14150 }, { "epoch": 4.719813208805871, "loss": 0.42406103014945984, "step": 14150 }, { "ce_loss": 0.1596388965845108, "epoch": 4.719813208805871, "step": 14150 }, { "distill_loss": 0.12342248857021332, "epoch": 4.719813208805871, "step": 14150 }, { "epoch": 4.719813208805871, "ref_ce_loss": 0.11014783382415771, "step": 14150 }, { "epoch": 4.719813208805871, "loss": 0.5634651184082031, "step": 14150 }, { "ce_loss": 0.22476035356521606, "epoch": 4.719813208805871, "step": 14150 }, { "distill_loss": 0.15548166632652283, "epoch": 4.719813208805871, "step": 14150 }, { "epoch": 4.719813208805871, "ref_ce_loss": 0.15671610832214355, "step": 14150 }, { "epoch": 4.719813208805871, "loss": 0.4034387767314911, "step": 14150 }, { "ce_loss": 0.19524210691452026, "epoch": 4.719813208805871, "step": 14150 }, { "distill_loss": 0.09127781540155411, "epoch": 4.719813208805871, "step": 14150 }, { "epoch": 4.719813208805871, "ref_ce_loss": 0.1166234090924263, "step": 14150 }, { "epoch": 4.723148765843896, "loss": 0.5712, "step": 14160 }, { "epoch": 4.723148765843896, "grad_norm": 2.8528082370758057, "step": 14160 }, { "epoch": 4.723148765843896, "learning_rate": 0.00011373966932250552, "step": 14160 }, { "epoch": 4.723148765843896, "loss": 0.9441983699798584, "step": 14160 }, { "ce_loss": 0.21303069591522217, "epoch": 4.723148765843896, "step": 14160 }, { "distill_loss": 0.19046366214752197, "epoch": 4.723148765843896, "step": 14160 }, { "epoch": 4.723148765843896, "ref_ce_loss": 0.15359333157539368, "step": 14160 }, { "epoch": 4.723148765843896, "loss": 0.5612851977348328, "step": 14160 }, { "ce_loss": 0.29807958006858826, "epoch": 4.723148765843896, "step": 14160 }, { "distill_loss": 0.14131425321102142, "epoch": 4.723148765843896, "step": 14160 }, { "epoch": 4.723148765843896, "ref_ce_loss": 0.1210092157125473, "step": 14160 }, { "epoch": 4.723148765843896, "loss": 0.5455232262611389, "step": 14160 }, { "ce_loss": 0.19224156439304352, "epoch": 4.723148765843896, "step": 14160 }, { "distill_loss": 0.11192300915718079, "epoch": 4.723148765843896, "step": 14160 }, { "epoch": 4.723148765843896, "ref_ce_loss": 0.1534367799758911, "step": 14160 }, { "epoch": 4.723148765843896, "loss": 0.46232539415359497, "step": 14160 }, { "ce_loss": 0.13547080755233765, "epoch": 4.723148765843896, "step": 14160 }, { "distill_loss": 0.18183527886867523, "epoch": 4.723148765843896, "step": 14160 }, { "epoch": 4.723148765843896, "ref_ce_loss": 0.14482565224170685, "step": 14160 }, { "epoch": 4.726484322881921, "loss": 0.5941, "step": 14170 }, { "epoch": 4.726484322881921, "grad_norm": 2.678342580795288, "step": 14170 }, { "epoch": 4.726484322881921, "learning_rate": 0.0001135431485519746, "step": 14170 }, { "epoch": 4.726484322881921, "loss": 0.44022274017333984, "step": 14170 }, { "ce_loss": 0.10185471922159195, "epoch": 4.726484322881921, "step": 14170 }, { "distill_loss": 0.10535676032304764, "epoch": 4.726484322881921, "step": 14170 }, { "epoch": 4.726484322881921, "ref_ce_loss": 0.10007350146770477, "step": 14170 }, { "epoch": 4.726484322881921, "loss": 0.6223923563957214, "step": 14170 }, { "ce_loss": 0.2227141410112381, "epoch": 4.726484322881921, "step": 14170 }, { "distill_loss": 0.18482142686843872, "epoch": 4.726484322881921, "step": 14170 }, { "epoch": 4.726484322881921, "ref_ce_loss": 0.18490292131900787, "step": 14170 }, { "epoch": 4.726484322881921, "loss": 0.6841763854026794, "step": 14170 }, { "ce_loss": 0.2221713364124298, "epoch": 4.726484322881921, "step": 14170 }, { "distill_loss": 0.19176220893859863, "epoch": 4.726484322881921, "step": 14170 }, { "epoch": 4.726484322881921, "ref_ce_loss": 0.14797654747962952, "step": 14170 }, { "epoch": 4.726484322881921, "loss": 0.5130280256271362, "step": 14170 }, { "ce_loss": 0.18819832801818848, "epoch": 4.726484322881921, "step": 14170 }, { "distill_loss": 0.16560444235801697, "epoch": 4.726484322881921, "step": 14170 }, { "epoch": 4.726484322881921, "ref_ce_loss": 0.1364285945892334, "step": 14170 }, { "epoch": 4.729819879919947, "loss": 0.4885, "step": 14180 }, { "epoch": 4.729819879919947, "grad_norm": 2.7698051929473877, "step": 14180 }, { "epoch": 4.729819879919947, "learning_rate": 0.00011334669426435963, "step": 14180 }, { "epoch": 4.729819879919947, "loss": 0.8724526166915894, "step": 14180 }, { "ce_loss": 0.2848694622516632, "epoch": 4.729819879919947, "step": 14180 }, { "distill_loss": 0.13748759031295776, "epoch": 4.729819879919947, "step": 14180 }, { "epoch": 4.729819879919947, "ref_ce_loss": 0.16300438344478607, "step": 14180 }, { "epoch": 4.729819879919947, "loss": 0.40946438908576965, "step": 14180 }, { "ce_loss": 0.1276213824748993, "epoch": 4.729819879919947, "step": 14180 }, { "distill_loss": 0.15523076057434082, "epoch": 4.729819879919947, "step": 14180 }, { "epoch": 4.729819879919947, "ref_ce_loss": 0.082614965736866, "step": 14180 }, { "epoch": 4.729819879919947, "loss": 0.46860525012016296, "step": 14180 }, { "ce_loss": 0.2063319981098175, "epoch": 4.729819879919947, "step": 14180 }, { "distill_loss": 0.12221656739711761, "epoch": 4.729819879919947, "step": 14180 }, { "epoch": 4.729819879919947, "ref_ce_loss": 0.1397939771413803, "step": 14180 }, { "epoch": 4.729819879919947, "loss": 0.3337191641330719, "step": 14180 }, { "ce_loss": 0.0916631668806076, "epoch": 4.729819879919947, "step": 14180 }, { "distill_loss": 0.122122623026371, "epoch": 4.729819879919947, "step": 14180 }, { "epoch": 4.729819879919947, "ref_ce_loss": 0.11975608766078949, "step": 14180 }, { "epoch": 4.733155436957972, "loss": 0.5128, "step": 14190 }, { "epoch": 4.733155436957972, "grad_norm": 2.4588358402252197, "step": 14190 }, { "epoch": 4.733155436957972, "learning_rate": 0.00011315030681791585, "step": 14190 }, { "epoch": 4.733155436957972, "loss": 0.5669198036193848, "step": 14190 }, { "ce_loss": 0.2199530005455017, "epoch": 4.733155436957972, "step": 14190 }, { "distill_loss": 0.13584327697753906, "epoch": 4.733155436957972, "step": 14190 }, { "epoch": 4.733155436957972, "ref_ce_loss": 0.13840018212795258, "step": 14190 }, { "epoch": 4.733155436957972, "loss": 0.5684414505958557, "step": 14190 }, { "ce_loss": 0.19079327583312988, "epoch": 4.733155436957972, "step": 14190 }, { "distill_loss": 0.1567957103252411, "epoch": 4.733155436957972, "step": 14190 }, { "epoch": 4.733155436957972, "ref_ce_loss": 0.18704859912395477, "step": 14190 }, { "epoch": 4.733155436957972, "loss": 0.36393260955810547, "step": 14190 }, { "ce_loss": 0.12109008431434631, "epoch": 4.733155436957972, "step": 14190 }, { "distill_loss": 0.15616634488105774, "epoch": 4.733155436957972, "step": 14190 }, { "epoch": 4.733155436957972, "ref_ce_loss": 0.061299532651901245, "step": 14190 }, { "epoch": 4.733155436957972, "loss": 0.4499196410179138, "step": 14190 }, { "ce_loss": 0.11336159706115723, "epoch": 4.733155436957972, "step": 14190 }, { "distill_loss": 0.11849113553762436, "epoch": 4.733155436957972, "step": 14190 }, { "epoch": 4.733155436957972, "ref_ce_loss": 0.08597545325756073, "step": 14190 }, { "epoch": 4.736490993995997, "loss": 0.4978, "step": 14200 }, { "epoch": 4.736490993995997, "grad_norm": 3.0749502182006836, "step": 14200 }, { "epoch": 4.736490993995997, "learning_rate": 0.00011295398657077633, "step": 14200 }, { "epoch": 4.736490993995997, "loss": 0.383294939994812, "step": 14200 }, { "ce_loss": 0.14222989976406097, "epoch": 4.736490993995997, "step": 14200 }, { "distill_loss": 0.10870449244976044, "epoch": 4.736490993995997, "step": 14200 }, { "epoch": 4.736490993995997, "ref_ce_loss": 0.09834384173154831, "step": 14200 }, { "epoch": 4.736490993995997, "loss": 0.228814497590065, "step": 14200 }, { "ce_loss": 0.05184551328420639, "epoch": 4.736490993995997, "step": 14200 }, { "distill_loss": 0.10386732220649719, "epoch": 4.736490993995997, "step": 14200 }, { "epoch": 4.736490993995997, "ref_ce_loss": 0.0729263499379158, "step": 14200 }, { "epoch": 4.736490993995997, "loss": 0.37187135219573975, "step": 14200 }, { "ce_loss": 0.09685519337654114, "epoch": 4.736490993995997, "step": 14200 }, { "distill_loss": 0.09510515630245209, "epoch": 4.736490993995997, "step": 14200 }, { "epoch": 4.736490993995997, "ref_ce_loss": 0.09703058749437332, "step": 14200 }, { "epoch": 4.736490993995997, "loss": 0.27483007311820984, "step": 14200 }, { "ce_loss": 0.08314543217420578, "epoch": 4.736490993995997, "step": 14200 }, { "distill_loss": 0.09368523210287094, "epoch": 4.736490993995997, "step": 14200 }, { "epoch": 4.736490993995997, "ref_ce_loss": 0.07162053138017654, "step": 14200 }, { "epoch": 4.739826551034023, "loss": 0.4949, "step": 14210 }, { "epoch": 4.739826551034023, "grad_norm": 1.9191662073135376, "step": 14210 }, { "epoch": 4.739826551034023, "learning_rate": 0.00011275773388095185, "step": 14210 }, { "epoch": 4.739826551034023, "loss": 0.43043410778045654, "step": 14210 }, { "ce_loss": 0.10714246332645416, "epoch": 4.739826551034023, "step": 14210 }, { "distill_loss": 0.10732196271419525, "epoch": 4.739826551034023, "step": 14210 }, { "epoch": 4.739826551034023, "ref_ce_loss": 0.06809937208890915, "step": 14210 }, { "epoch": 4.739826551034023, "loss": 0.3872106671333313, "step": 14210 }, { "ce_loss": 0.09992988407611847, "epoch": 4.739826551034023, "step": 14210 }, { "distill_loss": 0.10379815101623535, "epoch": 4.739826551034023, "step": 14210 }, { "epoch": 4.739826551034023, "ref_ce_loss": 0.10170990973711014, "step": 14210 }, { "epoch": 4.739826551034023, "loss": 0.40114355087280273, "step": 14210 }, { "ce_loss": 0.10342927277088165, "epoch": 4.739826551034023, "step": 14210 }, { "distill_loss": 0.12139730900526047, "epoch": 4.739826551034023, "step": 14210 }, { "epoch": 4.739826551034023, "ref_ce_loss": 0.08242413401603699, "step": 14210 }, { "epoch": 4.739826551034023, "loss": 0.5521736741065979, "step": 14210 }, { "ce_loss": 0.11475184559822083, "epoch": 4.739826551034023, "step": 14210 }, { "distill_loss": 0.14817777276039124, "epoch": 4.739826551034023, "step": 14210 }, { "epoch": 4.739826551034023, "ref_ce_loss": 0.1247396469116211, "step": 14210 }, { "epoch": 4.743162108072048, "loss": 0.5387, "step": 14220 }, { "epoch": 4.743162108072048, "grad_norm": 2.4396839141845703, "step": 14220 }, { "epoch": 4.743162108072048, "learning_rate": 0.00011256154910632998, "step": 14220 }, { "epoch": 4.743162108072048, "loss": 0.9258735179901123, "step": 14220 }, { "ce_loss": 0.23057223856449127, "epoch": 4.743162108072048, "step": 14220 }, { "distill_loss": 0.162357360124588, "epoch": 4.743162108072048, "step": 14220 }, { "epoch": 4.743162108072048, "ref_ce_loss": 0.1291550099849701, "step": 14220 }, { "epoch": 4.743162108072048, "loss": 0.5556474924087524, "step": 14220 }, { "ce_loss": 0.1705268770456314, "epoch": 4.743162108072048, "step": 14220 }, { "distill_loss": 0.09931408613920212, "epoch": 4.743162108072048, "step": 14220 }, { "epoch": 4.743162108072048, "ref_ce_loss": 0.10927734524011612, "step": 14220 }, { "epoch": 4.743162108072048, "loss": 0.6825743913650513, "step": 14220 }, { "ce_loss": 0.09274369478225708, "epoch": 4.743162108072048, "step": 14220 }, { "distill_loss": 0.09510725736618042, "epoch": 4.743162108072048, "step": 14220 }, { "epoch": 4.743162108072048, "ref_ce_loss": 0.07657185196876526, "step": 14220 }, { "epoch": 4.743162108072048, "loss": 0.45636534690856934, "step": 14220 }, { "ce_loss": 0.1472557932138443, "epoch": 4.743162108072048, "step": 14220 }, { "distill_loss": 0.13751322031021118, "epoch": 4.743162108072048, "step": 14220 }, { "epoch": 4.743162108072048, "ref_ce_loss": 0.08142144232988358, "step": 14220 }, { "epoch": 4.746497665110073, "loss": 0.4579, "step": 14230 }, { "epoch": 4.746497665110073, "grad_norm": 2.141514539718628, "step": 14230 }, { "epoch": 4.746497665110073, "learning_rate": 0.00011236543260467418, "step": 14230 }, { "epoch": 4.746497665110073, "loss": 0.521102249622345, "step": 14230 }, { "ce_loss": 0.203525111079216, "epoch": 4.746497665110073, "step": 14230 }, { "distill_loss": 0.11754366755485535, "epoch": 4.746497665110073, "step": 14230 }, { "epoch": 4.746497665110073, "ref_ce_loss": 0.19978143274784088, "step": 14230 }, { "epoch": 4.746497665110073, "loss": 0.4507944583892822, "step": 14230 }, { "ce_loss": 0.0714416429400444, "epoch": 4.746497665110073, "step": 14230 }, { "distill_loss": 0.11438420414924622, "epoch": 4.746497665110073, "step": 14230 }, { "epoch": 4.746497665110073, "ref_ce_loss": 0.08590278029441833, "step": 14230 }, { "epoch": 4.746497665110073, "loss": 0.3971126973628998, "step": 14230 }, { "ce_loss": 0.1454489678144455, "epoch": 4.746497665110073, "step": 14230 }, { "distill_loss": 0.11768228560686111, "epoch": 4.746497665110073, "step": 14230 }, { "epoch": 4.746497665110073, "ref_ce_loss": 0.13380113244056702, "step": 14230 }, { "epoch": 4.746497665110073, "loss": 0.43891745805740356, "step": 14230 }, { "ce_loss": 0.1446937918663025, "epoch": 4.746497665110073, "step": 14230 }, { "distill_loss": 0.12114118039608002, "epoch": 4.746497665110073, "step": 14230 }, { "epoch": 4.746497665110073, "ref_ce_loss": 0.07623746246099472, "step": 14230 }, { "epoch": 4.749833222148099, "loss": 0.4922, "step": 14240 }, { "epoch": 4.749833222148099, "grad_norm": 2.9477293491363525, "step": 14240 }, { "epoch": 4.749833222148099, "learning_rate": 0.00011216938473362377, "step": 14240 }, { "epoch": 4.749833222148099, "loss": 0.4097670912742615, "step": 14240 }, { "ce_loss": 0.11067419499158859, "epoch": 4.749833222148099, "step": 14240 }, { "distill_loss": 0.15952913463115692, "epoch": 4.749833222148099, "step": 14240 }, { "epoch": 4.749833222148099, "ref_ce_loss": 0.09814775735139847, "step": 14240 }, { "epoch": 4.749833222148099, "loss": 0.5153831839561462, "step": 14240 }, { "ce_loss": 0.22324255108833313, "epoch": 4.749833222148099, "step": 14240 }, { "distill_loss": 0.13816522061824799, "epoch": 4.749833222148099, "step": 14240 }, { "epoch": 4.749833222148099, "ref_ce_loss": 0.1536806970834732, "step": 14240 }, { "epoch": 4.749833222148099, "loss": 0.5292138457298279, "step": 14240 }, { "ce_loss": 0.12397529184818268, "epoch": 4.749833222148099, "step": 14240 }, { "distill_loss": 0.1275257170200348, "epoch": 4.749833222148099, "step": 14240 }, { "epoch": 4.749833222148099, "ref_ce_loss": 0.0806649923324585, "step": 14240 }, { "epoch": 4.749833222148099, "loss": 0.43219509720802307, "step": 14240 }, { "ce_loss": 0.13407444953918457, "epoch": 4.749833222148099, "step": 14240 }, { "distill_loss": 0.14914165437221527, "epoch": 4.749833222148099, "step": 14240 }, { "epoch": 4.749833222148099, "ref_ce_loss": 0.10060304403305054, "step": 14240 }, { "epoch": 4.753168779186124, "loss": 0.4562, "step": 14250 }, { "epoch": 4.753168779186124, "grad_norm": 2.0971434116363525, "step": 14250 }, { "epoch": 4.753168779186124, "learning_rate": 0.00011197340585069259, "step": 14250 }, { "epoch": 4.753168779186124, "loss": 0.4454609751701355, "step": 14250 }, { "ce_loss": 0.0977015420794487, "epoch": 4.753168779186124, "step": 14250 }, { "distill_loss": 0.10412100702524185, "epoch": 4.753168779186124, "step": 14250 }, { "epoch": 4.753168779186124, "ref_ce_loss": 0.10450634360313416, "step": 14250 }, { "epoch": 4.753168779186124, "loss": 0.4546106457710266, "step": 14250 }, { "ce_loss": 0.18327759206295013, "epoch": 4.753168779186124, "step": 14250 }, { "distill_loss": 0.11298847198486328, "epoch": 4.753168779186124, "step": 14250 }, { "epoch": 4.753168779186124, "ref_ce_loss": 0.11655457317829132, "step": 14250 }, { "epoch": 4.753168779186124, "loss": 0.6993808746337891, "step": 14250 }, { "ce_loss": 0.132548525929451, "epoch": 4.753168779186124, "step": 14250 }, { "distill_loss": 0.18319010734558105, "epoch": 4.753168779186124, "step": 14250 }, { "epoch": 4.753168779186124, "ref_ce_loss": 0.08631079643964767, "step": 14250 }, { "epoch": 4.753168779186124, "loss": 0.4419463276863098, "step": 14250 }, { "ce_loss": 0.13755536079406738, "epoch": 4.753168779186124, "step": 14250 }, { "distill_loss": 0.1329721212387085, "epoch": 4.753168779186124, "step": 14250 }, { "epoch": 4.753168779186124, "ref_ce_loss": 0.0995076596736908, "step": 14250 }, { "epoch": 4.7565043362241495, "loss": 0.4859, "step": 14260 }, { "epoch": 4.7565043362241495, "grad_norm": 2.6352319717407227, "step": 14260 }, { "epoch": 4.7565043362241495, "learning_rate": 0.00011177749631326887, "step": 14260 }, { "epoch": 4.7565043362241495, "loss": 0.49517810344696045, "step": 14260 }, { "ce_loss": 0.1591547429561615, "epoch": 4.7565043362241495, "step": 14260 }, { "distill_loss": 0.15403160452842712, "epoch": 4.7565043362241495, "step": 14260 }, { "epoch": 4.7565043362241495, "ref_ce_loss": 0.1262081265449524, "step": 14260 }, { "epoch": 4.7565043362241495, "loss": 0.6272794604301453, "step": 14260 }, { "ce_loss": 0.31204718351364136, "epoch": 4.7565043362241495, "step": 14260 }, { "distill_loss": 0.16110165417194366, "epoch": 4.7565043362241495, "step": 14260 }, { "epoch": 4.7565043362241495, "ref_ce_loss": 0.13668222725391388, "step": 14260 }, { "epoch": 4.7565043362241495, "loss": 0.3994016945362091, "step": 14260 }, { "ce_loss": 0.09157441556453705, "epoch": 4.7565043362241495, "step": 14260 }, { "distill_loss": 0.13846012949943542, "epoch": 4.7565043362241495, "step": 14260 }, { "epoch": 4.7565043362241495, "ref_ce_loss": 0.10564356297254562, "step": 14260 }, { "epoch": 4.7565043362241495, "loss": 1.257520318031311, "step": 14260 }, { "ce_loss": 0.2869233787059784, "epoch": 4.7565043362241495, "step": 14260 }, { "distill_loss": 0.22305631637573242, "epoch": 4.7565043362241495, "step": 14260 }, { "epoch": 4.7565043362241495, "ref_ce_loss": 0.13798722624778748, "step": 14260 }, { "epoch": 4.759839893262175, "loss": 0.5491, "step": 14270 }, { "epoch": 4.759839893262175, "grad_norm": 2.1373462677001953, "step": 14270 }, { "epoch": 4.759839893262175, "learning_rate": 0.00011158165647861435, "step": 14270 }, { "epoch": 4.759839893262175, "loss": 0.45616188645362854, "step": 14270 }, { "ce_loss": 0.20117731392383575, "epoch": 4.759839893262175, "step": 14270 }, { "distill_loss": 0.11334202438592911, "epoch": 4.759839893262175, "step": 14270 }, { "epoch": 4.759839893262175, "ref_ce_loss": 0.11822082102298737, "step": 14270 }, { "epoch": 4.759839893262175, "loss": 0.5591301321983337, "step": 14270 }, { "ce_loss": 0.15160302817821503, "epoch": 4.759839893262175, "step": 14270 }, { "distill_loss": 0.12180090695619583, "epoch": 4.759839893262175, "step": 14270 }, { "epoch": 4.759839893262175, "ref_ce_loss": 0.13151346147060394, "step": 14270 }, { "epoch": 4.759839893262175, "loss": 0.41168203949928284, "step": 14270 }, { "ce_loss": 0.12057756632566452, "epoch": 4.759839893262175, "step": 14270 }, { "distill_loss": 0.12694385647773743, "epoch": 4.759839893262175, "step": 14270 }, { "epoch": 4.759839893262175, "ref_ce_loss": 0.11435339599847794, "step": 14270 }, { "epoch": 4.759839893262175, "loss": 0.39429712295532227, "step": 14270 }, { "ce_loss": 0.024822566658258438, "epoch": 4.759839893262175, "step": 14270 }, { "distill_loss": 0.08758328855037689, "epoch": 4.759839893262175, "step": 14270 }, { "epoch": 4.759839893262175, "ref_ce_loss": 0.05988505855202675, "step": 14270 }, { "epoch": 4.7631754503002, "loss": 0.502, "step": 14280 }, { "epoch": 4.7631754503002, "grad_norm": 2.955759286880493, "step": 14280 }, { "epoch": 4.7631754503002, "learning_rate": 0.00011138588670386358, "step": 14280 }, { "epoch": 4.7631754503002, "loss": 0.3777836263179779, "step": 14280 }, { "ce_loss": 0.15252237021923065, "epoch": 4.7631754503002, "step": 14280 }, { "distill_loss": 0.12933968007564545, "epoch": 4.7631754503002, "step": 14280 }, { "epoch": 4.7631754503002, "ref_ce_loss": 0.09582728147506714, "step": 14280 }, { "epoch": 4.7631754503002, "loss": 0.26405563950538635, "step": 14280 }, { "ce_loss": 0.09645365178585052, "epoch": 4.7631754503002, "step": 14280 }, { "distill_loss": 0.09345416724681854, "epoch": 4.7631754503002, "step": 14280 }, { "epoch": 4.7631754503002, "ref_ce_loss": 0.07391425222158432, "step": 14280 }, { "epoch": 4.7631754503002, "loss": 0.20412151515483856, "step": 14280 }, { "ce_loss": 0.025568762794137, "epoch": 4.7631754503002, "step": 14280 }, { "distill_loss": 0.07479839771986008, "epoch": 4.7631754503002, "step": 14280 }, { "epoch": 4.7631754503002, "ref_ce_loss": 0.06605122238397598, "step": 14280 }, { "epoch": 4.7631754503002, "loss": 0.3875410258769989, "step": 14280 }, { "ce_loss": 0.15600574016571045, "epoch": 4.7631754503002, "step": 14280 }, { "distill_loss": 0.1275922656059265, "epoch": 4.7631754503002, "step": 14280 }, { "epoch": 4.7631754503002, "ref_ce_loss": 0.07461174577474594, "step": 14280 }, { "epoch": 4.7665110073382255, "loss": 0.444, "step": 14290 }, { "epoch": 4.7665110073382255, "grad_norm": 2.552724838256836, "step": 14290 }, { "epoch": 4.7665110073382255, "learning_rate": 0.0001111901873460235, "step": 14290 }, { "epoch": 4.7665110073382255, "loss": 0.43322882056236267, "step": 14290 }, { "ce_loss": 0.17510177195072174, "epoch": 4.7665110073382255, "step": 14290 }, { "distill_loss": 0.13811074197292328, "epoch": 4.7665110073382255, "step": 14290 }, { "epoch": 4.7665110073382255, "ref_ce_loss": 0.11976328492164612, "step": 14290 }, { "epoch": 4.7665110073382255, "loss": 0.5688863396644592, "step": 14290 }, { "ce_loss": 0.2690722942352295, "epoch": 4.7665110073382255, "step": 14290 }, { "distill_loss": 0.11816976964473724, "epoch": 4.7665110073382255, "step": 14290 }, { "epoch": 4.7665110073382255, "ref_ce_loss": 0.14800193905830383, "step": 14290 }, { "epoch": 4.7665110073382255, "loss": 0.3511410355567932, "step": 14290 }, { "ce_loss": 0.13480216264724731, "epoch": 4.7665110073382255, "step": 14290 }, { "distill_loss": 0.10867208987474442, "epoch": 4.7665110073382255, "step": 14290 }, { "epoch": 4.7665110073382255, "ref_ce_loss": 0.056814275681972504, "step": 14290 }, { "epoch": 4.7665110073382255, "loss": 0.7902196645736694, "step": 14290 }, { "ce_loss": 0.23585091531276703, "epoch": 4.7665110073382255, "step": 14290 }, { "distill_loss": 0.13959118723869324, "epoch": 4.7665110073382255, "step": 14290 }, { "epoch": 4.7665110073382255, "ref_ce_loss": 0.10354238748550415, "step": 14290 }, { "epoch": 4.769846564376251, "loss": 0.4744, "step": 14300 }, { "epoch": 4.769846564376251, "grad_norm": 3.3194260597229004, "step": 14300 }, { "epoch": 4.769846564376251, "learning_rate": 0.0001109945587619724, "step": 14300 }, { "epoch": 4.769846564376251, "loss": 0.504343569278717, "step": 14300 }, { "ce_loss": 0.16864703595638275, "epoch": 4.769846564376251, "step": 14300 }, { "distill_loss": 0.14713454246520996, "epoch": 4.769846564376251, "step": 14300 }, { "epoch": 4.769846564376251, "ref_ce_loss": 0.13997380435466766, "step": 14300 }, { "epoch": 4.769846564376251, "loss": 0.3732161223888397, "step": 14300 }, { "ce_loss": 0.11053475737571716, "epoch": 4.769846564376251, "step": 14300 }, { "distill_loss": 0.10585125535726547, "epoch": 4.769846564376251, "step": 14300 }, { "epoch": 4.769846564376251, "ref_ce_loss": 0.06673439592123032, "step": 14300 }, { "epoch": 4.769846564376251, "loss": 0.5125171542167664, "step": 14300 }, { "ce_loss": 0.2005772441625595, "epoch": 4.769846564376251, "step": 14300 }, { "distill_loss": 0.1538941115140915, "epoch": 4.769846564376251, "step": 14300 }, { "epoch": 4.769846564376251, "ref_ce_loss": 0.12865741550922394, "step": 14300 }, { "epoch": 4.769846564376251, "loss": 0.5521990060806274, "step": 14300 }, { "ce_loss": 0.16355322301387787, "epoch": 4.769846564376251, "step": 14300 }, { "distill_loss": 0.12216568738222122, "epoch": 4.769846564376251, "step": 14300 }, { "epoch": 4.769846564376251, "ref_ce_loss": 0.10937811434268951, "step": 14300 }, { "epoch": 4.773182121414276, "loss": 0.4654, "step": 14310 }, { "epoch": 4.773182121414276, "grad_norm": 1.955685019493103, "step": 14310 }, { "epoch": 4.773182121414276, "learning_rate": 0.00011079900130845976, "step": 14310 }, { "epoch": 4.773182121414276, "loss": 0.45663851499557495, "step": 14310 }, { "ce_loss": 0.13172826170921326, "epoch": 4.773182121414276, "step": 14310 }, { "distill_loss": 0.10975062847137451, "epoch": 4.773182121414276, "step": 14310 }, { "epoch": 4.773182121414276, "ref_ce_loss": 0.08388905972242355, "step": 14310 }, { "epoch": 4.773182121414276, "loss": 0.642504870891571, "step": 14310 }, { "ce_loss": 0.125015527009964, "epoch": 4.773182121414276, "step": 14310 }, { "distill_loss": 0.1834339052438736, "epoch": 4.773182121414276, "step": 14310 }, { "epoch": 4.773182121414276, "ref_ce_loss": 0.1528361737728119, "step": 14310 }, { "epoch": 4.773182121414276, "loss": 0.479583740234375, "step": 14310 }, { "ce_loss": 0.18668422102928162, "epoch": 4.773182121414276, "step": 14310 }, { "distill_loss": 0.10831034928560257, "epoch": 4.773182121414276, "step": 14310 }, { "epoch": 4.773182121414276, "ref_ce_loss": 0.13767296075820923, "step": 14310 }, { "epoch": 4.773182121414276, "loss": 0.45591068267822266, "step": 14310 }, { "ce_loss": 0.1843109130859375, "epoch": 4.773182121414276, "step": 14310 }, { "distill_loss": 0.16249501705169678, "epoch": 4.773182121414276, "step": 14310 }, { "epoch": 4.773182121414276, "ref_ce_loss": 0.10894269496202469, "step": 14310 }, { "epoch": 4.776517678452302, "loss": 0.4768, "step": 14320 }, { "epoch": 4.776517678452302, "grad_norm": 2.224634885787964, "step": 14320 }, { "epoch": 4.776517678452302, "learning_rate": 0.00011060351534210522, "step": 14320 }, { "epoch": 4.776517678452302, "loss": 0.36553362011909485, "step": 14320 }, { "ce_loss": 0.16818471252918243, "epoch": 4.776517678452302, "step": 14320 }, { "distill_loss": 0.13930566608905792, "epoch": 4.776517678452302, "step": 14320 }, { "epoch": 4.776517678452302, "ref_ce_loss": 0.05784922093153, "step": 14320 }, { "epoch": 4.776517678452302, "loss": 0.361936092376709, "step": 14320 }, { "ce_loss": 0.09541445970535278, "epoch": 4.776517678452302, "step": 14320 }, { "distill_loss": 0.11125258356332779, "epoch": 4.776517678452302, "step": 14320 }, { "epoch": 4.776517678452302, "ref_ce_loss": 0.09528297185897827, "step": 14320 }, { "epoch": 4.776517678452302, "loss": 0.40957027673721313, "step": 14320 }, { "ce_loss": 0.07518623024225235, "epoch": 4.776517678452302, "step": 14320 }, { "distill_loss": 0.12784458696842194, "epoch": 4.776517678452302, "step": 14320 }, { "epoch": 4.776517678452302, "ref_ce_loss": 0.07247889041900635, "step": 14320 }, { "epoch": 4.776517678452302, "loss": 0.46657589077949524, "step": 14320 }, { "ce_loss": 0.11209922283887863, "epoch": 4.776517678452302, "step": 14320 }, { "distill_loss": 0.13903628289699554, "epoch": 4.776517678452302, "step": 14320 }, { "epoch": 4.776517678452302, "ref_ce_loss": 0.1548924446105957, "step": 14320 }, { "epoch": 4.779853235490327, "loss": 0.4814, "step": 14330 }, { "epoch": 4.779853235490327, "grad_norm": 6.027482509613037, "step": 14330 }, { "epoch": 4.779853235490327, "learning_rate": 0.00011040810121939803, "step": 14330 }, { "epoch": 4.779853235490327, "loss": 0.42402875423431396, "step": 14330 }, { "ce_loss": 0.16902658343315125, "epoch": 4.779853235490327, "step": 14330 }, { "distill_loss": 0.11825080960988998, "epoch": 4.779853235490327, "step": 14330 }, { "epoch": 4.779853235490327, "ref_ce_loss": 0.08660327643156052, "step": 14330 }, { "epoch": 4.779853235490327, "loss": 0.36984482407569885, "step": 14330 }, { "ce_loss": 0.1259082704782486, "epoch": 4.779853235490327, "step": 14330 }, { "distill_loss": 0.13614816963672638, "epoch": 4.779853235490327, "step": 14330 }, { "epoch": 4.779853235490327, "ref_ce_loss": 0.09089109301567078, "step": 14330 }, { "epoch": 4.779853235490327, "loss": 0.39057061076164246, "step": 14330 }, { "ce_loss": 0.1435929536819458, "epoch": 4.779853235490327, "step": 14330 }, { "distill_loss": 0.13623438775539398, "epoch": 4.779853235490327, "step": 14330 }, { "epoch": 4.779853235490327, "ref_ce_loss": 0.08988281339406967, "step": 14330 }, { "epoch": 4.779853235490327, "loss": 0.541616678237915, "step": 14330 }, { "ce_loss": 0.17811079323291779, "epoch": 4.779853235490327, "step": 14330 }, { "distill_loss": 0.12328089773654938, "epoch": 4.779853235490327, "step": 14330 }, { "epoch": 4.779853235490327, "ref_ce_loss": 0.11967521905899048, "step": 14330 }, { "epoch": 4.783188792528352, "loss": 0.43, "step": 14340 }, { "epoch": 4.783188792528352, "grad_norm": 3.0198564529418945, "step": 14340 }, { "epoch": 4.783188792528352, "learning_rate": 0.00011021275929669648, "step": 14340 }, { "epoch": 4.783188792528352, "loss": 0.37297478318214417, "step": 14340 }, { "ce_loss": 0.10748539865016937, "epoch": 4.783188792528352, "step": 14340 }, { "distill_loss": 0.141023188829422, "epoch": 4.783188792528352, "step": 14340 }, { "epoch": 4.783188792528352, "ref_ce_loss": 0.12405706942081451, "step": 14340 }, { "epoch": 4.783188792528352, "loss": 0.34351977705955505, "step": 14340 }, { "ce_loss": 0.09799874573945999, "epoch": 4.783188792528352, "step": 14340 }, { "distill_loss": 0.1253485083580017, "epoch": 4.783188792528352, "step": 14340 }, { "epoch": 4.783188792528352, "ref_ce_loss": 0.11973781138658524, "step": 14340 }, { "epoch": 4.783188792528352, "loss": 0.991863489151001, "step": 14340 }, { "ce_loss": 0.24459530413150787, "epoch": 4.783188792528352, "step": 14340 }, { "distill_loss": 0.13070832192897797, "epoch": 4.783188792528352, "step": 14340 }, { "epoch": 4.783188792528352, "ref_ce_loss": 0.09614545851945877, "step": 14340 }, { "epoch": 4.783188792528352, "loss": 0.5935628414154053, "step": 14340 }, { "ce_loss": 0.12343177199363708, "epoch": 4.783188792528352, "step": 14340 }, { "distill_loss": 0.14855144917964935, "epoch": 4.783188792528352, "step": 14340 }, { "epoch": 4.783188792528352, "ref_ce_loss": 0.09137774258852005, "step": 14340 }, { "epoch": 4.786524349566378, "loss": 0.5001, "step": 14350 }, { "epoch": 4.786524349566378, "grad_norm": 2.282027006149292, "step": 14350 }, { "epoch": 4.786524349566378, "learning_rate": 0.00011001748993022722, "step": 14350 }, { "epoch": 4.786524349566378, "loss": 0.3946148157119751, "step": 14350 }, { "ce_loss": 0.07618418335914612, "epoch": 4.786524349566378, "step": 14350 }, { "distill_loss": 0.15871164202690125, "epoch": 4.786524349566378, "step": 14350 }, { "epoch": 4.786524349566378, "ref_ce_loss": 0.05771951004862785, "step": 14350 }, { "epoch": 4.786524349566378, "loss": 0.450163871049881, "step": 14350 }, { "ce_loss": 0.07075124979019165, "epoch": 4.786524349566378, "step": 14350 }, { "distill_loss": 0.17303964495658875, "epoch": 4.786524349566378, "step": 14350 }, { "epoch": 4.786524349566378, "ref_ce_loss": 0.0855056643486023, "step": 14350 }, { "epoch": 4.786524349566378, "loss": 0.49539071321487427, "step": 14350 }, { "ce_loss": 0.13586698472499847, "epoch": 4.786524349566378, "step": 14350 }, { "distill_loss": 0.18673719465732574, "epoch": 4.786524349566378, "step": 14350 }, { "epoch": 4.786524349566378, "ref_ce_loss": 0.14078141748905182, "step": 14350 }, { "epoch": 4.786524349566378, "loss": 0.41358691453933716, "step": 14350 }, { "ce_loss": 0.12783963978290558, "epoch": 4.786524349566378, "step": 14350 }, { "distill_loss": 0.1703040450811386, "epoch": 4.786524349566378, "step": 14350 }, { "epoch": 4.786524349566378, "ref_ce_loss": 0.0900261327624321, "step": 14350 }, { "epoch": 4.789859906604403, "loss": 0.4901, "step": 14360 }, { "epoch": 4.789859906604403, "grad_norm": 3.0800037384033203, "step": 14360 }, { "epoch": 4.789859906604403, "learning_rate": 0.00010982229347608446, "step": 14360 }, { "epoch": 4.789859906604403, "loss": 0.412725031375885, "step": 14360 }, { "ce_loss": 0.1197974905371666, "epoch": 4.789859906604403, "step": 14360 }, { "distill_loss": 0.18120068311691284, "epoch": 4.789859906604403, "step": 14360 }, { "epoch": 4.789859906604403, "ref_ce_loss": 0.08914639800786972, "step": 14360 }, { "epoch": 4.789859906604403, "loss": 0.4203656017780304, "step": 14360 }, { "ce_loss": 0.08861410617828369, "epoch": 4.789859906604403, "step": 14360 }, { "distill_loss": 0.1394980102777481, "epoch": 4.789859906604403, "step": 14360 }, { "epoch": 4.789859906604403, "ref_ce_loss": 0.08281680941581726, "step": 14360 }, { "epoch": 4.789859906604403, "loss": 0.4946492612361908, "step": 14360 }, { "ce_loss": 0.11306752264499664, "epoch": 4.789859906604403, "step": 14360 }, { "distill_loss": 0.17083683609962463, "epoch": 4.789859906604403, "step": 14360 }, { "epoch": 4.789859906604403, "ref_ce_loss": 0.1248176321387291, "step": 14360 }, { "epoch": 4.789859906604403, "loss": 0.5337336659431458, "step": 14360 }, { "ce_loss": 0.166512593626976, "epoch": 4.789859906604403, "step": 14360 }, { "distill_loss": 0.18739216029644012, "epoch": 4.789859906604403, "step": 14360 }, { "epoch": 4.789859906604403, "ref_ce_loss": 0.121116504073143, "step": 14360 }, { "epoch": 4.793195463642428, "loss": 0.4662, "step": 14370 }, { "epoch": 4.793195463642428, "grad_norm": 4.17993688583374, "step": 14370 }, { "epoch": 4.793195463642428, "learning_rate": 0.00010962717029022967, "step": 14370 }, { "epoch": 4.793195463642428, "loss": 0.46002838015556335, "step": 14370 }, { "ce_loss": 0.13797122240066528, "epoch": 4.793195463642428, "step": 14370 }, { "distill_loss": 0.17406927049160004, "epoch": 4.793195463642428, "step": 14370 }, { "epoch": 4.793195463642428, "ref_ce_loss": 0.12049631774425507, "step": 14370 }, { "epoch": 4.793195463642428, "loss": 0.2839353084564209, "step": 14370 }, { "ce_loss": 0.06530182808637619, "epoch": 4.793195463642428, "step": 14370 }, { "distill_loss": 0.14406593143939972, "epoch": 4.793195463642428, "step": 14370 }, { "epoch": 4.793195463642428, "ref_ce_loss": 0.0744338408112526, "step": 14370 }, { "epoch": 4.793195463642428, "loss": 0.5167728662490845, "step": 14370 }, { "ce_loss": 0.05427071079611778, "epoch": 4.793195463642428, "step": 14370 }, { "distill_loss": 0.19449418783187866, "epoch": 4.793195463642428, "step": 14370 }, { "epoch": 4.793195463642428, "ref_ce_loss": 0.10985428839921951, "step": 14370 }, { "epoch": 4.793195463642428, "loss": 0.5486382246017456, "step": 14370 }, { "ce_loss": 0.2093065083026886, "epoch": 4.793195463642428, "step": 14370 }, { "distill_loss": 0.19122518599033356, "epoch": 4.793195463642428, "step": 14370 }, { "epoch": 4.793195463642428, "ref_ce_loss": 0.08407463133335114, "step": 14370 }, { "epoch": 4.796531020680454, "loss": 0.4607, "step": 14380 }, { "epoch": 4.796531020680454, "grad_norm": 2.9548277854919434, "step": 14380 }, { "epoch": 4.796531020680454, "learning_rate": 0.00010943212072849042, "step": 14380 }, { "epoch": 4.796531020680454, "loss": 0.3544643819332123, "step": 14380 }, { "ce_loss": 0.07386091351509094, "epoch": 4.796531020680454, "step": 14380 }, { "distill_loss": 0.10469326376914978, "epoch": 4.796531020680454, "step": 14380 }, { "epoch": 4.796531020680454, "ref_ce_loss": 0.12491313368082047, "step": 14380 }, { "epoch": 4.796531020680454, "loss": 0.4699355959892273, "step": 14380 }, { "ce_loss": 0.11637619882822037, "epoch": 4.796531020680454, "step": 14380 }, { "distill_loss": 0.17287704348564148, "epoch": 4.796531020680454, "step": 14380 }, { "epoch": 4.796531020680454, "ref_ce_loss": 0.07765893638134003, "step": 14380 }, { "epoch": 4.796531020680454, "loss": 0.47565996646881104, "step": 14380 }, { "ce_loss": 0.1342860609292984, "epoch": 4.796531020680454, "step": 14380 }, { "distill_loss": 0.1429038941860199, "epoch": 4.796531020680454, "step": 14380 }, { "epoch": 4.796531020680454, "ref_ce_loss": 0.06292593479156494, "step": 14380 }, { "epoch": 4.796531020680454, "loss": 0.35165899991989136, "step": 14380 }, { "ce_loss": 0.05011136829853058, "epoch": 4.796531020680454, "step": 14380 }, { "distill_loss": 0.10166673362255096, "epoch": 4.796531020680454, "step": 14380 }, { "epoch": 4.796531020680454, "ref_ce_loss": 0.05985415354371071, "step": 14380 }, { "epoch": 4.799866577718479, "loss": 0.4348, "step": 14390 }, { "epoch": 4.799866577718479, "grad_norm": 1.8150907754898071, "step": 14390 }, { "epoch": 4.799866577718479, "learning_rate": 0.00010923714514656023, "step": 14390 }, { "epoch": 4.799866577718479, "loss": 0.4065036475658417, "step": 14390 }, { "ce_loss": 0.162733793258667, "epoch": 4.799866577718479, "step": 14390 }, { "distill_loss": 0.15392005443572998, "epoch": 4.799866577718479, "step": 14390 }, { "epoch": 4.799866577718479, "ref_ce_loss": 0.0887485072016716, "step": 14390 }, { "epoch": 4.799866577718479, "loss": 0.3206755220890045, "step": 14390 }, { "ce_loss": 0.1351880133152008, "epoch": 4.799866577718479, "step": 14390 }, { "distill_loss": 0.10881204158067703, "epoch": 4.799866577718479, "step": 14390 }, { "epoch": 4.799866577718479, "ref_ce_loss": 0.07634250819683075, "step": 14390 }, { "epoch": 4.799866577718479, "loss": 0.30655360221862793, "step": 14390 }, { "ce_loss": 0.07555373758077621, "epoch": 4.799866577718479, "step": 14390 }, { "distill_loss": 0.1615779846906662, "epoch": 4.799866577718479, "step": 14390 }, { "epoch": 4.799866577718479, "ref_ce_loss": 0.0693129375576973, "step": 14390 }, { "epoch": 4.799866577718479, "loss": 0.3501516580581665, "step": 14390 }, { "ce_loss": 0.08676886558532715, "epoch": 4.799866577718479, "step": 14390 }, { "distill_loss": 0.10866406559944153, "epoch": 4.799866577718479, "step": 14390 }, { "epoch": 4.799866577718479, "ref_ce_loss": 0.0950222760438919, "step": 14390 }, { "epoch": 4.803202134756504, "loss": 0.4181, "step": 14400 }, { "epoch": 4.803202134756504, "grad_norm": 2.890615463256836, "step": 14400 }, { "epoch": 4.803202134756504, "learning_rate": 0.00010904224389999772, "step": 14400 }, { "epoch": 4.803202134756504, "loss": 0.5171407461166382, "step": 14400 }, { "ce_loss": 0.07824192941188812, "epoch": 4.803202134756504, "step": 14400 }, { "distill_loss": 0.1526021808385849, "epoch": 4.803202134756504, "step": 14400 }, { "epoch": 4.803202134756504, "ref_ce_loss": 0.09420009702444077, "step": 14400 }, { "epoch": 4.803202134756504, "loss": 0.36038991808891296, "step": 14400 }, { "ce_loss": 0.12154200673103333, "epoch": 4.803202134756504, "step": 14400 }, { "distill_loss": 0.14514140784740448, "epoch": 4.803202134756504, "step": 14400 }, { "epoch": 4.803202134756504, "ref_ce_loss": 0.07570644468069077, "step": 14400 }, { "epoch": 4.803202134756504, "loss": 0.44446617364883423, "step": 14400 }, { "ce_loss": 0.155522882938385, "epoch": 4.803202134756504, "step": 14400 }, { "distill_loss": 0.16018736362457275, "epoch": 4.803202134756504, "step": 14400 }, { "epoch": 4.803202134756504, "ref_ce_loss": 0.09933032840490341, "step": 14400 }, { "epoch": 4.803202134756504, "loss": 0.43344542384147644, "step": 14400 }, { "ce_loss": 0.11573299020528793, "epoch": 4.803202134756504, "step": 14400 }, { "distill_loss": 0.1858031451702118, "epoch": 4.803202134756504, "step": 14400 }, { "epoch": 4.803202134756504, "ref_ce_loss": 0.13141514360904694, "step": 14400 }, { "epoch": 4.80653769179453, "loss": 0.3817, "step": 14410 }, { "epoch": 4.80653769179453, "grad_norm": 2.4496381282806396, "step": 14410 }, { "epoch": 4.80653769179453, "learning_rate": 0.00010884741734422578, "step": 14410 }, { "epoch": 4.80653769179453, "loss": 0.7833337783813477, "step": 14410 }, { "ce_loss": 0.11659690737724304, "epoch": 4.80653769179453, "step": 14410 }, { "distill_loss": 0.12998512387275696, "epoch": 4.80653769179453, "step": 14410 }, { "epoch": 4.80653769179453, "ref_ce_loss": 0.09738679975271225, "step": 14410 }, { "epoch": 4.80653769179453, "loss": 0.38516169786453247, "step": 14410 }, { "ce_loss": 0.13952520489692688, "epoch": 4.80653769179453, "step": 14410 }, { "distill_loss": 0.15259574353694916, "epoch": 4.80653769179453, "step": 14410 }, { "epoch": 4.80653769179453, "ref_ce_loss": 0.09262451529502869, "step": 14410 }, { "epoch": 4.80653769179453, "loss": 0.46893495321273804, "step": 14410 }, { "ce_loss": 0.10217462480068207, "epoch": 4.80653769179453, "step": 14410 }, { "distill_loss": 0.18115559220314026, "epoch": 4.80653769179453, "step": 14410 }, { "epoch": 4.80653769179453, "ref_ce_loss": 0.07953232526779175, "step": 14410 }, { "epoch": 4.80653769179453, "loss": 0.3491207957267761, "step": 14410 }, { "ce_loss": 0.11116782575845718, "epoch": 4.80653769179453, "step": 14410 }, { "distill_loss": 0.11189278960227966, "epoch": 4.80653769179453, "step": 14410 }, { "epoch": 4.80653769179453, "ref_ce_loss": 0.0821090117096901, "step": 14410 }, { "epoch": 4.809873248832555, "loss": 0.4266, "step": 14420 }, { "epoch": 4.809873248832555, "grad_norm": 3.1390187740325928, "step": 14420 }, { "epoch": 4.809873248832555, "learning_rate": 0.00010865266583453127, "step": 14420 }, { "epoch": 4.809873248832555, "loss": 0.22609755396842957, "step": 14420 }, { "ce_loss": 0.03655946999788284, "epoch": 4.809873248832555, "step": 14420 }, { "distill_loss": 0.09298989176750183, "epoch": 4.809873248832555, "step": 14420 }, { "epoch": 4.809873248832555, "ref_ce_loss": 0.0591839924454689, "step": 14420 }, { "epoch": 4.809873248832555, "loss": 0.3537759780883789, "step": 14420 }, { "ce_loss": 0.06846825778484344, "epoch": 4.809873248832555, "step": 14420 }, { "distill_loss": 0.10115021467208862, "epoch": 4.809873248832555, "step": 14420 }, { "epoch": 4.809873248832555, "ref_ce_loss": 0.0874619260430336, "step": 14420 }, { "epoch": 4.809873248832555, "loss": 0.3921999931335449, "step": 14420 }, { "ce_loss": 0.09556600451469421, "epoch": 4.809873248832555, "step": 14420 }, { "distill_loss": 0.13231655955314636, "epoch": 4.809873248832555, "step": 14420 }, { "epoch": 4.809873248832555, "ref_ce_loss": 0.05044102296233177, "step": 14420 }, { "epoch": 4.809873248832555, "loss": 0.6851783990859985, "step": 14420 }, { "ce_loss": 0.20700956881046295, "epoch": 4.809873248832555, "step": 14420 }, { "distill_loss": 0.2140367180109024, "epoch": 4.809873248832555, "step": 14420 }, { "epoch": 4.809873248832555, "ref_ce_loss": 0.15442690253257751, "step": 14420 }, { "epoch": 4.81320880587058, "loss": 0.3931, "step": 14430 }, { "epoch": 4.81320880587058, "grad_norm": 2.6131036281585693, "step": 14430 }, { "epoch": 4.81320880587058, "learning_rate": 0.00010845798972606404, "step": 14430 }, { "epoch": 4.81320880587058, "loss": 0.21408505737781525, "step": 14430 }, { "ce_loss": 0.028935782611370087, "epoch": 4.81320880587058, "step": 14430 }, { "distill_loss": 0.11932484805583954, "epoch": 4.81320880587058, "step": 14430 }, { "epoch": 4.81320880587058, "ref_ce_loss": 0.06558303534984589, "step": 14430 }, { "epoch": 4.81320880587058, "loss": 1.2338591814041138, "step": 14430 }, { "ce_loss": 0.20345933735370636, "epoch": 4.81320880587058, "step": 14430 }, { "distill_loss": 0.17151473462581635, "epoch": 4.81320880587058, "step": 14430 }, { "epoch": 4.81320880587058, "ref_ce_loss": 0.11353405565023422, "step": 14430 }, { "epoch": 4.81320880587058, "loss": 0.23042449355125427, "step": 14430 }, { "ce_loss": 0.08617976307868958, "epoch": 4.81320880587058, "step": 14430 }, { "distill_loss": 0.09825091063976288, "epoch": 4.81320880587058, "step": 14430 }, { "epoch": 4.81320880587058, "ref_ce_loss": 0.04588671773672104, "step": 14430 }, { "epoch": 4.81320880587058, "loss": 0.2805485725402832, "step": 14430 }, { "ce_loss": 0.04742460697889328, "epoch": 4.81320880587058, "step": 14430 }, { "distill_loss": 0.1001681461930275, "epoch": 4.81320880587058, "step": 14430 }, { "epoch": 4.81320880587058, "ref_ce_loss": 0.0729212611913681, "step": 14430 }, { "epoch": 4.816544362908606, "loss": 0.4052, "step": 14440 }, { "epoch": 4.816544362908606, "grad_norm": 6.898608684539795, "step": 14440 }, { "epoch": 4.816544362908606, "learning_rate": 0.00010826338937383656, "step": 14440 }, { "epoch": 4.816544362908606, "loss": 0.2627457082271576, "step": 14440 }, { "ce_loss": 0.05536928027868271, "epoch": 4.816544362908606, "step": 14440 }, { "distill_loss": 0.11648040264844894, "epoch": 4.816544362908606, "step": 14440 }, { "epoch": 4.816544362908606, "ref_ce_loss": 0.09046781063079834, "step": 14440 }, { "epoch": 4.816544362908606, "loss": 0.5691077709197998, "step": 14440 }, { "ce_loss": 0.17502161860466003, "epoch": 4.816544362908606, "step": 14440 }, { "distill_loss": 0.20319420099258423, "epoch": 4.816544362908606, "step": 14440 }, { "epoch": 4.816544362908606, "ref_ce_loss": 0.09941479563713074, "step": 14440 }, { "epoch": 4.816544362908606, "loss": 0.2933313846588135, "step": 14440 }, { "ce_loss": 0.04782741516828537, "epoch": 4.816544362908606, "step": 14440 }, { "distill_loss": 0.11192759871482849, "epoch": 4.816544362908606, "step": 14440 }, { "epoch": 4.816544362908606, "ref_ce_loss": 0.07869847863912582, "step": 14440 }, { "epoch": 4.816544362908606, "loss": 0.32633116841316223, "step": 14440 }, { "ce_loss": 0.13065771758556366, "epoch": 4.816544362908606, "step": 14440 }, { "distill_loss": 0.12477478384971619, "epoch": 4.816544362908606, "step": 14440 }, { "epoch": 4.816544362908606, "ref_ce_loss": 0.07074069231748581, "step": 14440 }, { "epoch": 4.819879919946631, "loss": 0.4009, "step": 14450 }, { "epoch": 4.819879919946631, "grad_norm": 1.7006696462631226, "step": 14450 }, { "epoch": 4.819879919946631, "learning_rate": 0.00010806886513272319, "step": 14450 }, { "epoch": 4.819879919946631, "loss": 0.5183548331260681, "step": 14450 }, { "ce_loss": 0.14206622540950775, "epoch": 4.819879919946631, "step": 14450 }, { "distill_loss": 0.10981175303459167, "epoch": 4.819879919946631, "step": 14450 }, { "epoch": 4.819879919946631, "ref_ce_loss": 0.10474380850791931, "step": 14450 }, { "epoch": 4.819879919946631, "loss": 0.37155765295028687, "step": 14450 }, { "ce_loss": 0.10685473680496216, "epoch": 4.819879919946631, "step": 14450 }, { "distill_loss": 0.12410110980272293, "epoch": 4.819879919946631, "step": 14450 }, { "epoch": 4.819879919946631, "ref_ce_loss": 0.09817085415124893, "step": 14450 }, { "epoch": 4.819879919946631, "loss": 0.5181396007537842, "step": 14450 }, { "ce_loss": 0.13136331737041473, "epoch": 4.819879919946631, "step": 14450 }, { "distill_loss": 0.16278839111328125, "epoch": 4.819879919946631, "step": 14450 }, { "epoch": 4.819879919946631, "ref_ce_loss": 0.11400038003921509, "step": 14450 }, { "epoch": 4.819879919946631, "loss": 0.31124332547187805, "step": 14450 }, { "ce_loss": 0.08205088973045349, "epoch": 4.819879919946631, "step": 14450 }, { "distill_loss": 0.17146164178848267, "epoch": 4.819879919946631, "step": 14450 }, { "epoch": 4.819879919946631, "ref_ce_loss": 0.05737442150712013, "step": 14450 }, { "epoch": 4.8232154769846565, "loss": 0.386, "step": 14460 }, { "epoch": 4.8232154769846565, "grad_norm": 3.951909065246582, "step": 14460 }, { "epoch": 4.8232154769846565, "learning_rate": 0.00010787441735745924, "step": 14460 }, { "epoch": 4.8232154769846565, "loss": 0.47462722659111023, "step": 14460 }, { "ce_loss": 0.1653720587491989, "epoch": 4.8232154769846565, "step": 14460 }, { "distill_loss": 0.19095370173454285, "epoch": 4.8232154769846565, "step": 14460 }, { "epoch": 4.8232154769846565, "ref_ce_loss": 0.07604599744081497, "step": 14460 }, { "epoch": 4.8232154769846565, "loss": 0.43112313747406006, "step": 14460 }, { "ce_loss": 0.08467692136764526, "epoch": 4.8232154769846565, "step": 14460 }, { "distill_loss": 0.12960241734981537, "epoch": 4.8232154769846565, "step": 14460 }, { "epoch": 4.8232154769846565, "ref_ce_loss": 0.07011184096336365, "step": 14460 }, { "epoch": 4.8232154769846565, "loss": 0.42976704239845276, "step": 14460 }, { "ce_loss": 0.11868524551391602, "epoch": 4.8232154769846565, "step": 14460 }, { "distill_loss": 0.1772325485944748, "epoch": 4.8232154769846565, "step": 14460 }, { "epoch": 4.8232154769846565, "ref_ce_loss": 0.09218654036521912, "step": 14460 }, { "epoch": 4.8232154769846565, "loss": 0.3322980999946594, "step": 14460 }, { "ce_loss": 0.09327730536460876, "epoch": 4.8232154769846565, "step": 14460 }, { "distill_loss": 0.11820797622203827, "epoch": 4.8232154769846565, "step": 14460 }, { "epoch": 4.8232154769846565, "ref_ce_loss": 0.12056463956832886, "step": 14460 }, { "epoch": 4.826551034022682, "loss": 0.4343, "step": 14470 }, { "epoch": 4.826551034022682, "grad_norm": 3.124202013015747, "step": 14470 }, { "epoch": 4.826551034022682, "learning_rate": 0.00010768004640264087, "step": 14470 }, { "epoch": 4.826551034022682, "loss": 0.5490837097167969, "step": 14470 }, { "ce_loss": 0.13129980862140656, "epoch": 4.826551034022682, "step": 14470 }, { "distill_loss": 0.15426191687583923, "epoch": 4.826551034022682, "step": 14470 }, { "epoch": 4.826551034022682, "ref_ce_loss": 0.09926041960716248, "step": 14470 }, { "epoch": 4.826551034022682, "loss": 0.4295900762081146, "step": 14470 }, { "ce_loss": 0.07018239051103592, "epoch": 4.826551034022682, "step": 14470 }, { "distill_loss": 0.19665493071079254, "epoch": 4.826551034022682, "step": 14470 }, { "epoch": 4.826551034022682, "ref_ce_loss": 0.12419581413269043, "step": 14470 }, { "epoch": 4.826551034022682, "loss": 0.445414662361145, "step": 14470 }, { "ce_loss": 0.15395063161849976, "epoch": 4.826551034022682, "step": 14470 }, { "distill_loss": 0.16152891516685486, "epoch": 4.826551034022682, "step": 14470 }, { "epoch": 4.826551034022682, "ref_ce_loss": 0.08925561606884003, "step": 14470 }, { "epoch": 4.826551034022682, "loss": 0.37601250410079956, "step": 14470 }, { "ce_loss": 0.08798770606517792, "epoch": 4.826551034022682, "step": 14470 }, { "distill_loss": 0.12041836231946945, "epoch": 4.826551034022682, "step": 14470 }, { "epoch": 4.826551034022682, "ref_ce_loss": 0.0818629190325737, "step": 14470 }, { "epoch": 4.829886591060707, "loss": 0.4342, "step": 14480 }, { "epoch": 4.829886591060707, "grad_norm": 2.767193078994751, "step": 14480 }, { "epoch": 4.829886591060707, "learning_rate": 0.00010748575262272406, "step": 14480 }, { "epoch": 4.829886591060707, "loss": 0.29959261417388916, "step": 14480 }, { "ce_loss": 0.08880975842475891, "epoch": 4.829886591060707, "step": 14480 }, { "distill_loss": 0.13206779956817627, "epoch": 4.829886591060707, "step": 14480 }, { "epoch": 4.829886591060707, "ref_ce_loss": 0.048692721873521805, "step": 14480 }, { "epoch": 4.829886591060707, "loss": 0.5737348794937134, "step": 14480 }, { "ce_loss": 0.11966817826032639, "epoch": 4.829886591060707, "step": 14480 }, { "distill_loss": 0.1964222639799118, "epoch": 4.829886591060707, "step": 14480 }, { "epoch": 4.829886591060707, "ref_ce_loss": 0.07506822049617767, "step": 14480 }, { "epoch": 4.829886591060707, "loss": 0.6172971725463867, "step": 14480 }, { "ce_loss": 0.15250805020332336, "epoch": 4.829886591060707, "step": 14480 }, { "distill_loss": 0.179644376039505, "epoch": 4.829886591060707, "step": 14480 }, { "epoch": 4.829886591060707, "ref_ce_loss": 0.09086277335882187, "step": 14480 }, { "epoch": 4.829886591060707, "loss": 0.3475439250469208, "step": 14480 }, { "ce_loss": 0.06675507873296738, "epoch": 4.829886591060707, "step": 14480 }, { "distill_loss": 0.14787790179252625, "epoch": 4.829886591060707, "step": 14480 }, { "epoch": 4.829886591060707, "ref_ce_loss": 0.0421852171421051, "step": 14480 }, { "epoch": 4.8332221480987325, "loss": 0.3986, "step": 14490 }, { "epoch": 4.8332221480987325, "grad_norm": 1.9744664430618286, "step": 14490 }, { "epoch": 4.8332221480987325, "learning_rate": 0.00010729153637202389, "step": 14490 }, { "epoch": 4.8332221480987325, "loss": 0.5732771158218384, "step": 14490 }, { "ce_loss": 0.11101856082677841, "epoch": 4.8332221480987325, "step": 14490 }, { "distill_loss": 0.1596483588218689, "epoch": 4.8332221480987325, "step": 14490 }, { "epoch": 4.8332221480987325, "ref_ce_loss": 0.13086585700511932, "step": 14490 }, { "epoch": 4.8332221480987325, "loss": 0.5659236907958984, "step": 14490 }, { "ce_loss": 0.10885528475046158, "epoch": 4.8332221480987325, "step": 14490 }, { "distill_loss": 0.13874022662639618, "epoch": 4.8332221480987325, "step": 14490 }, { "epoch": 4.8332221480987325, "ref_ce_loss": 0.06971140950918198, "step": 14490 }, { "epoch": 4.8332221480987325, "loss": 0.26478707790374756, "step": 14490 }, { "ce_loss": 0.05931922420859337, "epoch": 4.8332221480987325, "step": 14490 }, { "distill_loss": 0.13022437691688538, "epoch": 4.8332221480987325, "step": 14490 }, { "epoch": 4.8332221480987325, "ref_ce_loss": 0.0495450533926487, "step": 14490 }, { "epoch": 4.8332221480987325, "loss": 0.46827933192253113, "step": 14490 }, { "ce_loss": 0.07739992439746857, "epoch": 4.8332221480987325, "step": 14490 }, { "distill_loss": 0.14634524285793304, "epoch": 4.8332221480987325, "step": 14490 }, { "epoch": 4.8332221480987325, "ref_ce_loss": 0.05592044070363045, "step": 14490 }, { "epoch": 4.836557705136758, "loss": 0.4342, "step": 14500 }, { "epoch": 4.836557705136758, "grad_norm": 2.3469674587249756, "step": 14500 }, { "epoch": 4.836557705136758, "learning_rate": 0.00010709739800471433, "step": 14500 }, { "epoch": 4.836557705136758, "loss": 0.3189893662929535, "step": 14500 }, { "ce_loss": 0.07006145268678665, "epoch": 4.836557705136758, "step": 14500 }, { "distill_loss": 0.12909801304340363, "epoch": 4.836557705136758, "step": 14500 }, { "epoch": 4.836557705136758, "ref_ce_loss": 0.08902038633823395, "step": 14500 }, { "epoch": 4.836557705136758, "loss": 0.4594857394695282, "step": 14500 }, { "ce_loss": 0.12085427343845367, "epoch": 4.836557705136758, "step": 14500 }, { "distill_loss": 0.15103530883789062, "epoch": 4.836557705136758, "step": 14500 }, { "epoch": 4.836557705136758, "ref_ce_loss": 0.09070585668087006, "step": 14500 }, { "epoch": 4.836557705136758, "loss": 0.33890047669410706, "step": 14500 }, { "ce_loss": 0.09388411045074463, "epoch": 4.836557705136758, "step": 14500 }, { "distill_loss": 0.12131661921739578, "epoch": 4.836557705136758, "step": 14500 }, { "epoch": 4.836557705136758, "ref_ce_loss": 0.08646147698163986, "step": 14500 }, { "epoch": 4.836557705136758, "loss": 0.3899589776992798, "step": 14500 }, { "ce_loss": 0.11472490429878235, "epoch": 4.836557705136758, "step": 14500 }, { "distill_loss": 0.17198053002357483, "epoch": 4.836557705136758, "step": 14500 }, { "epoch": 4.836557705136758, "ref_ce_loss": 0.07948368787765503, "step": 14500 }, { "epoch": 4.839893262174783, "loss": 0.3939, "step": 14510 }, { "epoch": 4.839893262174783, "grad_norm": 2.1642537117004395, "step": 14510 }, { "epoch": 4.839893262174783, "learning_rate": 0.00010690333787482708, "step": 14510 }, { "epoch": 4.839893262174783, "loss": 0.5003763437271118, "step": 14510 }, { "ce_loss": 0.0803409069776535, "epoch": 4.839893262174783, "step": 14510 }, { "distill_loss": 0.13409990072250366, "epoch": 4.839893262174783, "step": 14510 }, { "epoch": 4.839893262174783, "ref_ce_loss": 0.10229332000017166, "step": 14510 }, { "epoch": 4.839893262174783, "loss": 0.3218827545642853, "step": 14510 }, { "ce_loss": 0.12027572095394135, "epoch": 4.839893262174783, "step": 14510 }, { "distill_loss": 0.12687276303768158, "epoch": 4.839893262174783, "step": 14510 }, { "epoch": 4.839893262174783, "ref_ce_loss": 0.07463307678699493, "step": 14510 }, { "epoch": 4.839893262174783, "loss": 0.43994343280792236, "step": 14510 }, { "ce_loss": 0.13173320889472961, "epoch": 4.839893262174783, "step": 14510 }, { "distill_loss": 0.174768328666687, "epoch": 4.839893262174783, "step": 14510 }, { "epoch": 4.839893262174783, "ref_ce_loss": 0.0962105318903923, "step": 14510 }, { "epoch": 4.839893262174783, "loss": 0.3595537841320038, "step": 14510 }, { "ce_loss": 0.11435595899820328, "epoch": 4.839893262174783, "step": 14510 }, { "distill_loss": 0.1392708569765091, "epoch": 4.839893262174783, "step": 14510 }, { "epoch": 4.839893262174783, "ref_ce_loss": 0.10565324127674103, "step": 14510 }, { "epoch": 4.843228819212809, "loss": 0.4273, "step": 14520 }, { "epoch": 4.843228819212809, "grad_norm": 4.729673385620117, "step": 14520 }, { "epoch": 4.843228819212809, "learning_rate": 0.00010670935633625125, "step": 14520 }, { "epoch": 4.843228819212809, "loss": 0.39400550723075867, "step": 14520 }, { "ce_loss": 0.12447620928287506, "epoch": 4.843228819212809, "step": 14520 }, { "distill_loss": 0.1449531614780426, "epoch": 4.843228819212809, "step": 14520 }, { "epoch": 4.843228819212809, "ref_ce_loss": 0.0862637460231781, "step": 14520 }, { "epoch": 4.843228819212809, "loss": 0.3979857563972473, "step": 14520 }, { "ce_loss": 0.11020766198635101, "epoch": 4.843228819212809, "step": 14520 }, { "distill_loss": 0.1860402375459671, "epoch": 4.843228819212809, "step": 14520 }, { "epoch": 4.843228819212809, "ref_ce_loss": 0.07654957473278046, "step": 14520 }, { "epoch": 4.843228819212809, "loss": 0.4634833335876465, "step": 14520 }, { "ce_loss": 0.12632647156715393, "epoch": 4.843228819212809, "step": 14520 }, { "distill_loss": 0.14024150371551514, "epoch": 4.843228819212809, "step": 14520 }, { "epoch": 4.843228819212809, "ref_ce_loss": 0.13895127177238464, "step": 14520 }, { "epoch": 4.843228819212809, "loss": 0.2567285895347595, "step": 14520 }, { "ce_loss": 0.04383685067296028, "epoch": 4.843228819212809, "step": 14520 }, { "distill_loss": 0.147134467959404, "epoch": 4.843228819212809, "step": 14520 }, { "epoch": 4.843228819212809, "ref_ce_loss": 0.04801378771662712, "step": 14520 }, { "epoch": 4.846564376250834, "loss": 0.43, "step": 14530 }, { "epoch": 4.846564376250834, "grad_norm": 3.172746419906616, "step": 14530 }, { "epoch": 4.846564376250834, "learning_rate": 0.0001065154537427328, "step": 14530 }, { "epoch": 4.846564376250834, "loss": 0.3927261233329773, "step": 14530 }, { "ce_loss": 0.09357485920190811, "epoch": 4.846564376250834, "step": 14530 }, { "distill_loss": 0.11319194734096527, "epoch": 4.846564376250834, "step": 14530 }, { "epoch": 4.846564376250834, "ref_ce_loss": 0.09334545582532883, "step": 14530 }, { "epoch": 4.846564376250834, "loss": 0.3599247336387634, "step": 14530 }, { "ce_loss": 0.05117193982005119, "epoch": 4.846564376250834, "step": 14530 }, { "distill_loss": 0.12257934361696243, "epoch": 4.846564376250834, "step": 14530 }, { "epoch": 4.846564376250834, "ref_ce_loss": 0.05467826500535011, "step": 14530 }, { "epoch": 4.846564376250834, "loss": 0.6843513250350952, "step": 14530 }, { "ce_loss": 0.10040271282196045, "epoch": 4.846564376250834, "step": 14530 }, { "distill_loss": 0.14522188901901245, "epoch": 4.846564376250834, "step": 14530 }, { "epoch": 4.846564376250834, "ref_ce_loss": 0.08244102448225021, "step": 14530 }, { "epoch": 4.846564376250834, "loss": 0.4031752943992615, "step": 14530 }, { "ce_loss": 0.14085879921913147, "epoch": 4.846564376250834, "step": 14530 }, { "distill_loss": 0.16135947406291962, "epoch": 4.846564376250834, "step": 14530 }, { "epoch": 4.846564376250834, "ref_ce_loss": 0.06957376003265381, "step": 14530 }, { "epoch": 4.849899933288859, "loss": 0.4429, "step": 14540 }, { "epoch": 4.849899933288859, "grad_norm": 3.005507230758667, "step": 14540 }, { "epoch": 4.849899933288859, "learning_rate": 0.0001063216304478734, "step": 14540 }, { "epoch": 4.849899933288859, "loss": 0.291361004114151, "step": 14540 }, { "ce_loss": 0.042861636728048325, "epoch": 4.849899933288859, "step": 14540 }, { "distill_loss": 0.10996908694505692, "epoch": 4.849899933288859, "step": 14540 }, { "epoch": 4.849899933288859, "ref_ce_loss": 0.09997054189443588, "step": 14540 }, { "epoch": 4.849899933288859, "loss": 0.4132578372955322, "step": 14540 }, { "ce_loss": 0.11565080285072327, "epoch": 4.849899933288859, "step": 14540 }, { "distill_loss": 0.18525856733322144, "epoch": 4.849899933288859, "step": 14540 }, { "epoch": 4.849899933288859, "ref_ce_loss": 0.08986419439315796, "step": 14540 }, { "epoch": 4.849899933288859, "loss": 0.5397231578826904, "step": 14540 }, { "ce_loss": 0.09195056557655334, "epoch": 4.849899933288859, "step": 14540 }, { "distill_loss": 0.12949776649475098, "epoch": 4.849899933288859, "step": 14540 }, { "epoch": 4.849899933288859, "ref_ce_loss": 0.1093427911400795, "step": 14540 }, { "epoch": 4.849899933288859, "loss": 0.5233044028282166, "step": 14540 }, { "ce_loss": 0.1069001853466034, "epoch": 4.849899933288859, "step": 14540 }, { "distill_loss": 0.17089305818080902, "epoch": 4.849899933288859, "step": 14540 }, { "epoch": 4.849899933288859, "ref_ce_loss": 0.11540444940328598, "step": 14540 }, { "epoch": 4.853235490326885, "loss": 0.4654, "step": 14550 }, { "epoch": 4.853235490326885, "grad_norm": 2.990715265274048, "step": 14550 }, { "epoch": 4.853235490326885, "learning_rate": 0.00010612788680513038, "step": 14550 }, { "epoch": 4.853235490326885, "loss": 0.40204113721847534, "step": 14550 }, { "ce_loss": 0.1541042923927307, "epoch": 4.853235490326885, "step": 14550 }, { "distill_loss": 0.14879141747951508, "epoch": 4.853235490326885, "step": 14550 }, { "epoch": 4.853235490326885, "ref_ce_loss": 0.09888723492622375, "step": 14550 }, { "epoch": 4.853235490326885, "loss": 0.35200127959251404, "step": 14550 }, { "ce_loss": 0.11289265751838684, "epoch": 4.853235490326885, "step": 14550 }, { "distill_loss": 0.15979474782943726, "epoch": 4.853235490326885, "step": 14550 }, { "epoch": 4.853235490326885, "ref_ce_loss": 0.07536427676677704, "step": 14550 }, { "epoch": 4.853235490326885, "loss": 0.3538520038127899, "step": 14550 }, { "ce_loss": 0.09295067936182022, "epoch": 4.853235490326885, "step": 14550 }, { "distill_loss": 0.1400989592075348, "epoch": 4.853235490326885, "step": 14550 }, { "epoch": 4.853235490326885, "ref_ce_loss": 0.08571676164865494, "step": 14550 }, { "epoch": 4.853235490326885, "loss": 0.43285155296325684, "step": 14550 }, { "ce_loss": 0.09952595084905624, "epoch": 4.853235490326885, "step": 14550 }, { "distill_loss": 0.1657617688179016, "epoch": 4.853235490326885, "step": 14550 }, { "epoch": 4.853235490326885, "ref_ce_loss": 0.09958847612142563, "step": 14550 }, { "epoch": 4.85657104736491, "loss": 0.4291, "step": 14560 }, { "epoch": 4.85657104736491, "grad_norm": 2.8867013454437256, "step": 14560 }, { "epoch": 4.85657104736491, "learning_rate": 0.00010593422316781567, "step": 14560 }, { "epoch": 4.85657104736491, "loss": 0.4896565079689026, "step": 14560 }, { "ce_loss": 0.1618562489748001, "epoch": 4.85657104736491, "step": 14560 }, { "distill_loss": 0.15716411173343658, "epoch": 4.85657104736491, "step": 14560 }, { "epoch": 4.85657104736491, "ref_ce_loss": 0.09468694031238556, "step": 14560 }, { "epoch": 4.85657104736491, "loss": 0.4915570318698883, "step": 14560 }, { "ce_loss": 0.18240197002887726, "epoch": 4.85657104736491, "step": 14560 }, { "distill_loss": 0.15938079357147217, "epoch": 4.85657104736491, "step": 14560 }, { "epoch": 4.85657104736491, "ref_ce_loss": 0.10698254406452179, "step": 14560 }, { "epoch": 4.85657104736491, "loss": 0.39248335361480713, "step": 14560 }, { "ce_loss": 0.11607158184051514, "epoch": 4.85657104736491, "step": 14560 }, { "distill_loss": 0.15520218014717102, "epoch": 4.85657104736491, "step": 14560 }, { "epoch": 4.85657104736491, "ref_ce_loss": 0.09117502719163895, "step": 14560 }, { "epoch": 4.85657104736491, "loss": 0.3708113133907318, "step": 14560 }, { "ce_loss": 0.053829822689294815, "epoch": 4.85657104736491, "step": 14560 }, { "distill_loss": 0.1012469157576561, "epoch": 4.85657104736491, "step": 14560 }, { "epoch": 4.85657104736491, "ref_ce_loss": 0.11660492420196533, "step": 14560 }, { "epoch": 4.859906604402935, "loss": 0.4288, "step": 14570 }, { "epoch": 4.859906604402935, "grad_norm": 2.4022624492645264, "step": 14570 }, { "epoch": 4.859906604402935, "learning_rate": 0.00010574063988909538, "step": 14570 }, { "epoch": 4.859906604402935, "loss": 0.3762693405151367, "step": 14570 }, { "ce_loss": 0.09208332747220993, "epoch": 4.859906604402935, "step": 14570 }, { "distill_loss": 0.1544317752122879, "epoch": 4.859906604402935, "step": 14570 }, { "epoch": 4.859906604402935, "ref_ce_loss": 0.0855952724814415, "step": 14570 }, { "epoch": 4.859906604402935, "loss": 0.40494173765182495, "step": 14570 }, { "ce_loss": 0.09856753051280975, "epoch": 4.859906604402935, "step": 14570 }, { "distill_loss": 0.13378620147705078, "epoch": 4.859906604402935, "step": 14570 }, { "epoch": 4.859906604402935, "ref_ce_loss": 0.0777980238199234, "step": 14570 }, { "epoch": 4.859906604402935, "loss": 0.31158962845802307, "step": 14570 }, { "ce_loss": 0.08274923264980316, "epoch": 4.859906604402935, "step": 14570 }, { "distill_loss": 0.144447922706604, "epoch": 4.859906604402935, "step": 14570 }, { "epoch": 4.859906604402935, "ref_ce_loss": 0.0840868353843689, "step": 14570 }, { "epoch": 4.859906604402935, "loss": 0.3044297993183136, "step": 14570 }, { "ce_loss": 0.06392364948987961, "epoch": 4.859906604402935, "step": 14570 }, { "distill_loss": 0.13459810614585876, "epoch": 4.859906604402935, "step": 14570 }, { "epoch": 4.859906604402935, "ref_ce_loss": 0.08571823686361313, "step": 14570 }, { "epoch": 4.863242161440961, "loss": 0.3983, "step": 14580 }, { "epoch": 4.863242161440961, "grad_norm": 2.0568363666534424, "step": 14580 }, { "epoch": 4.863242161440961, "learning_rate": 0.00010554713732198905, "step": 14580 }, { "epoch": 4.863242161440961, "loss": 0.6924458742141724, "step": 14580 }, { "ce_loss": 0.2798740565776825, "epoch": 4.863242161440961, "step": 14580 }, { "distill_loss": 0.16391947865486145, "epoch": 4.863242161440961, "step": 14580 }, { "epoch": 4.863242161440961, "ref_ce_loss": 0.15253989398479462, "step": 14580 }, { "epoch": 4.863242161440961, "loss": 0.4279673397541046, "step": 14580 }, { "ce_loss": 0.11625529825687408, "epoch": 4.863242161440961, "step": 14580 }, { "distill_loss": 0.13469599187374115, "epoch": 4.863242161440961, "step": 14580 }, { "epoch": 4.863242161440961, "ref_ce_loss": 0.0896773636341095, "step": 14580 }, { "epoch": 4.863242161440961, "loss": 0.3884398341178894, "step": 14580 }, { "ce_loss": 0.13770216703414917, "epoch": 4.863242161440961, "step": 14580 }, { "distill_loss": 0.13542982935905457, "epoch": 4.863242161440961, "step": 14580 }, { "epoch": 4.863242161440961, "ref_ce_loss": 0.09085490554571152, "step": 14580 }, { "epoch": 4.863242161440961, "loss": 0.27767953276634216, "step": 14580 }, { "ce_loss": 0.06366454809904099, "epoch": 4.863242161440961, "step": 14580 }, { "distill_loss": 0.09851067513227463, "epoch": 4.863242161440961, "step": 14580 }, { "epoch": 4.863242161440961, "ref_ce_loss": 0.06787992268800735, "step": 14580 }, { "epoch": 4.866577718478986, "loss": 0.3815, "step": 14590 }, { "epoch": 4.866577718478986, "grad_norm": 2.244171380996704, "step": 14590 }, { "epoch": 4.866577718478986, "learning_rate": 0.000105353715819369, "step": 14590 }, { "epoch": 4.866577718478986, "loss": 0.40847817063331604, "step": 14590 }, { "ce_loss": 0.14665713906288147, "epoch": 4.866577718478986, "step": 14590 }, { "distill_loss": 0.11157718300819397, "epoch": 4.866577718478986, "step": 14590 }, { "epoch": 4.866577718478986, "ref_ce_loss": 0.12388044595718384, "step": 14590 }, { "epoch": 4.866577718478986, "loss": 0.39813506603240967, "step": 14590 }, { "ce_loss": 0.11959546059370041, "epoch": 4.866577718478986, "step": 14590 }, { "distill_loss": 0.12179628759622574, "epoch": 4.866577718478986, "step": 14590 }, { "epoch": 4.866577718478986, "ref_ce_loss": 0.11253191530704498, "step": 14590 }, { "epoch": 4.866577718478986, "loss": 0.2991400361061096, "step": 14590 }, { "ce_loss": 0.06128731742501259, "epoch": 4.866577718478986, "step": 14590 }, { "distill_loss": 0.1151864305138588, "epoch": 4.866577718478986, "step": 14590 }, { "epoch": 4.866577718478986, "ref_ce_loss": 0.07057370245456696, "step": 14590 }, { "epoch": 4.866577718478986, "loss": 0.39899754524230957, "step": 14590 }, { "ce_loss": 0.10590605437755585, "epoch": 4.866577718478986, "step": 14590 }, { "distill_loss": 0.13386231660842896, "epoch": 4.866577718478986, "step": 14590 }, { "epoch": 4.866577718478986, "ref_ce_loss": 0.09403873980045319, "step": 14590 }, { "epoch": 4.869913275517011, "loss": 0.3918, "step": 14600 }, { "epoch": 4.869913275517011, "grad_norm": 3.4217731952667236, "step": 14600 }, { "epoch": 4.869913275517011, "learning_rate": 0.00010516037573395978, "step": 14600 }, { "epoch": 4.869913275517011, "loss": 0.3474060893058777, "step": 14600 }, { "ce_loss": 0.07884716987609863, "epoch": 4.869913275517011, "step": 14600 }, { "distill_loss": 0.13276150822639465, "epoch": 4.869913275517011, "step": 14600 }, { "epoch": 4.869913275517011, "ref_ce_loss": 0.08149795234203339, "step": 14600 }, { "epoch": 4.869913275517011, "loss": 0.4022737741470337, "step": 14600 }, { "ce_loss": 0.16848134994506836, "epoch": 4.869913275517011, "step": 14600 }, { "distill_loss": 0.11421038955450058, "epoch": 4.869913275517011, "step": 14600 }, { "epoch": 4.869913275517011, "ref_ce_loss": 0.10647285729646683, "step": 14600 }, { "epoch": 4.869913275517011, "loss": 0.6741193532943726, "step": 14600 }, { "ce_loss": 0.1266450732946396, "epoch": 4.869913275517011, "step": 14600 }, { "distill_loss": 0.14333488047122955, "epoch": 4.869913275517011, "step": 14600 }, { "epoch": 4.869913275517011, "ref_ce_loss": 0.08235146850347519, "step": 14600 }, { "epoch": 4.869913275517011, "loss": 0.28810229897499084, "step": 14600 }, { "ce_loss": 0.0736110657453537, "epoch": 4.869913275517011, "step": 14600 }, { "distill_loss": 0.11870429664850235, "epoch": 4.869913275517011, "step": 14600 }, { "epoch": 4.869913275517011, "ref_ce_loss": 0.048248760402202606, "step": 14600 }, { "epoch": 4.873248832555037, "loss": 0.4, "step": 14610 }, { "epoch": 4.873248832555037, "grad_norm": 2.2944371700286865, "step": 14610 }, { "epoch": 4.873248832555037, "learning_rate": 0.00010496711741833745, "step": 14610 }, { "epoch": 4.873248832555037, "loss": 0.39695531129837036, "step": 14610 }, { "ce_loss": 0.07801104336977005, "epoch": 4.873248832555037, "step": 14610 }, { "distill_loss": 0.10177255421876907, "epoch": 4.873248832555037, "step": 14610 }, { "epoch": 4.873248832555037, "ref_ce_loss": 0.08016975969076157, "step": 14610 }, { "epoch": 4.873248832555037, "loss": 0.46816474199295044, "step": 14610 }, { "ce_loss": 0.15950001776218414, "epoch": 4.873248832555037, "step": 14610 }, { "distill_loss": 0.14450381696224213, "epoch": 4.873248832555037, "step": 14610 }, { "epoch": 4.873248832555037, "ref_ce_loss": 0.13032293319702148, "step": 14610 }, { "epoch": 4.873248832555037, "loss": 0.3655878007411957, "step": 14610 }, { "ce_loss": 0.0782846063375473, "epoch": 4.873248832555037, "step": 14610 }, { "distill_loss": 0.11802016943693161, "epoch": 4.873248832555037, "step": 14610 }, { "epoch": 4.873248832555037, "ref_ce_loss": 0.07457852363586426, "step": 14610 }, { "epoch": 4.873248832555037, "loss": 0.2640596926212311, "step": 14610 }, { "ce_loss": 0.06261890381574631, "epoch": 4.873248832555037, "step": 14610 }, { "distill_loss": 0.1388099044561386, "epoch": 4.873248832555037, "step": 14610 }, { "epoch": 4.873248832555037, "ref_ce_loss": 0.06244688853621483, "step": 14610 }, { "epoch": 4.876584389593062, "loss": 0.3597, "step": 14620 }, { "epoch": 4.876584389593062, "grad_norm": 8.353504180908203, "step": 14620 }, { "epoch": 4.876584389593062, "learning_rate": 0.0001047739412249289, "step": 14620 }, { "epoch": 4.876584389593062, "loss": 0.35462868213653564, "step": 14620 }, { "ce_loss": 0.05959922820329666, "epoch": 4.876584389593062, "step": 14620 }, { "distill_loss": 0.09392604976892471, "epoch": 4.876584389593062, "step": 14620 }, { "epoch": 4.876584389593062, "ref_ce_loss": 0.06135998293757439, "step": 14620 }, { "epoch": 4.876584389593062, "loss": 0.4801032543182373, "step": 14620 }, { "ce_loss": 0.1412152349948883, "epoch": 4.876584389593062, "step": 14620 }, { "distill_loss": 0.15411214530467987, "epoch": 4.876584389593062, "step": 14620 }, { "epoch": 4.876584389593062, "ref_ce_loss": 0.08248871564865112, "step": 14620 }, { "epoch": 4.876584389593062, "loss": 0.5009456872940063, "step": 14620 }, { "ce_loss": 0.06161312758922577, "epoch": 4.876584389593062, "step": 14620 }, { "distill_loss": 0.09107838571071625, "epoch": 4.876584389593062, "step": 14620 }, { "epoch": 4.876584389593062, "ref_ce_loss": 0.05931251123547554, "step": 14620 }, { "epoch": 4.876584389593062, "loss": 0.29225414991378784, "step": 14620 }, { "ce_loss": 0.06127912178635597, "epoch": 4.876584389593062, "step": 14620 }, { "distill_loss": 0.12588757276535034, "epoch": 4.876584389593062, "step": 14620 }, { "epoch": 4.876584389593062, "ref_ce_loss": 0.10451364517211914, "step": 14620 }, { "epoch": 4.879919946631087, "loss": 0.3968, "step": 14630 }, { "epoch": 4.879919946631087, "grad_norm": 2.361440420150757, "step": 14630 }, { "epoch": 4.879919946631087, "learning_rate": 0.00010458084750601137, "step": 14630 }, { "epoch": 4.879919946631087, "loss": 0.34836554527282715, "step": 14630 }, { "ce_loss": 0.1210658922791481, "epoch": 4.879919946631087, "step": 14630 }, { "distill_loss": 0.11962322890758514, "epoch": 4.879919946631087, "step": 14630 }, { "epoch": 4.879919946631087, "ref_ce_loss": 0.10724155604839325, "step": 14630 }, { "epoch": 4.879919946631087, "loss": 0.42567917704582214, "step": 14630 }, { "ce_loss": 0.08495273441076279, "epoch": 4.879919946631087, "step": 14630 }, { "distill_loss": 0.09693010151386261, "epoch": 4.879919946631087, "step": 14630 }, { "epoch": 4.879919946631087, "ref_ce_loss": 0.08949395269155502, "step": 14630 }, { "epoch": 4.879919946631087, "loss": 0.4357303977012634, "step": 14630 }, { "ce_loss": 0.0803510919213295, "epoch": 4.879919946631087, "step": 14630 }, { "distill_loss": 0.13041549921035767, "epoch": 4.879919946631087, "step": 14630 }, { "epoch": 4.879919946631087, "ref_ce_loss": 0.12345951050519943, "step": 14630 }, { "epoch": 4.879919946631087, "loss": 0.26853373646736145, "step": 14630 }, { "ce_loss": 0.05560803785920143, "epoch": 4.879919946631087, "step": 14630 }, { "distill_loss": 0.10235027968883514, "epoch": 4.879919946631087, "step": 14630 }, { "epoch": 4.879919946631087, "ref_ce_loss": 0.11022245138883591, "step": 14630 }, { "epoch": 4.883255503669113, "loss": 0.3832, "step": 14640 }, { "epoch": 4.883255503669113, "grad_norm": 2.1481926441192627, "step": 14640 }, { "epoch": 4.883255503669113, "learning_rate": 0.00010438783661371154, "step": 14640 }, { "epoch": 4.883255503669113, "loss": 0.35740113258361816, "step": 14640 }, { "ce_loss": 0.091608926653862, "epoch": 4.883255503669113, "step": 14640 }, { "distill_loss": 0.10623601078987122, "epoch": 4.883255503669113, "step": 14640 }, { "epoch": 4.883255503669113, "ref_ce_loss": 0.09509522467851639, "step": 14640 }, { "epoch": 4.883255503669113, "loss": 0.4130536615848541, "step": 14640 }, { "ce_loss": 0.11907906085252762, "epoch": 4.883255503669113, "step": 14640 }, { "distill_loss": 0.09680334478616714, "epoch": 4.883255503669113, "step": 14640 }, { "epoch": 4.883255503669113, "ref_ce_loss": 0.10953420400619507, "step": 14640 }, { "epoch": 4.883255503669113, "loss": 0.33304738998413086, "step": 14640 }, { "ce_loss": 0.0939435064792633, "epoch": 4.883255503669113, "step": 14640 }, { "distill_loss": 0.12896057963371277, "epoch": 4.883255503669113, "step": 14640 }, { "epoch": 4.883255503669113, "ref_ce_loss": 0.0519283190369606, "step": 14640 }, { "epoch": 4.883255503669113, "loss": 0.2252245545387268, "step": 14640 }, { "ce_loss": 0.054917193949222565, "epoch": 4.883255503669113, "step": 14640 }, { "distill_loss": 0.0945071280002594, "epoch": 4.883255503669113, "step": 14640 }, { "epoch": 4.883255503669113, "ref_ce_loss": 0.07565527409315109, "step": 14640 }, { "epoch": 4.886591060707138, "loss": 0.3909, "step": 14650 }, { "epoch": 4.886591060707138, "grad_norm": 2.6093320846557617, "step": 14650 }, { "epoch": 4.886591060707138, "learning_rate": 0.00010419490890000523, "step": 14650 }, { "epoch": 4.886591060707138, "loss": 0.2625221908092499, "step": 14650 }, { "ce_loss": 0.07245766371488571, "epoch": 4.886591060707138, "step": 14650 }, { "distill_loss": 0.11005422472953796, "epoch": 4.886591060707138, "step": 14650 }, { "epoch": 4.886591060707138, "ref_ce_loss": 0.07978059351444244, "step": 14650 }, { "epoch": 4.886591060707138, "loss": 0.3463577330112457, "step": 14650 }, { "ce_loss": 0.11487383395433426, "epoch": 4.886591060707138, "step": 14650 }, { "distill_loss": 0.12385935336351395, "epoch": 4.886591060707138, "step": 14650 }, { "epoch": 4.886591060707138, "ref_ce_loss": 0.08501281589269638, "step": 14650 }, { "epoch": 4.886591060707138, "loss": 0.4161044955253601, "step": 14650 }, { "ce_loss": 0.14107713103294373, "epoch": 4.886591060707138, "step": 14650 }, { "distill_loss": 0.13545288145542145, "epoch": 4.886591060707138, "step": 14650 }, { "epoch": 4.886591060707138, "ref_ce_loss": 0.09442567825317383, "step": 14650 }, { "epoch": 4.886591060707138, "loss": 0.407216340303421, "step": 14650 }, { "ce_loss": 0.09006106853485107, "epoch": 4.886591060707138, "step": 14650 }, { "distill_loss": 0.10209393501281738, "epoch": 4.886591060707138, "step": 14650 }, { "epoch": 4.886591060707138, "ref_ce_loss": 0.09535461664199829, "step": 14650 }, { "epoch": 4.8899266177451635, "loss": 0.4326, "step": 14660 }, { "epoch": 4.8899266177451635, "grad_norm": 2.7057816982269287, "step": 14660 }, { "epoch": 4.8899266177451635, "learning_rate": 0.00010400206471671645, "step": 14660 }, { "epoch": 4.8899266177451635, "loss": 0.48617222905158997, "step": 14660 }, { "ce_loss": 0.16330264508724213, "epoch": 4.8899266177451635, "step": 14660 }, { "distill_loss": 0.13402962684631348, "epoch": 4.8899266177451635, "step": 14660 }, { "epoch": 4.8899266177451635, "ref_ce_loss": 0.1386226862668991, "step": 14660 }, { "epoch": 4.8899266177451635, "loss": 0.5076068639755249, "step": 14660 }, { "ce_loss": 0.12294797599315643, "epoch": 4.8899266177451635, "step": 14660 }, { "distill_loss": 0.1680523306131363, "epoch": 4.8899266177451635, "step": 14660 }, { "epoch": 4.8899266177451635, "ref_ce_loss": 0.1611853539943695, "step": 14660 }, { "epoch": 4.8899266177451635, "loss": 0.42812928557395935, "step": 14660 }, { "ce_loss": 0.10041183978319168, "epoch": 4.8899266177451635, "step": 14660 }, { "distill_loss": 0.11321305483579636, "epoch": 4.8899266177451635, "step": 14660 }, { "epoch": 4.8899266177451635, "ref_ce_loss": 0.07629314810037613, "step": 14660 }, { "epoch": 4.8899266177451635, "loss": 0.25367259979248047, "step": 14660 }, { "ce_loss": 0.06785446405410767, "epoch": 4.8899266177451635, "step": 14660 }, { "distill_loss": 0.09600482881069183, "epoch": 4.8899266177451635, "step": 14660 }, { "epoch": 4.8899266177451635, "ref_ce_loss": 0.06254395842552185, "step": 14660 }, { "epoch": 4.893262174783189, "loss": 0.4031, "step": 14670 }, { "epoch": 4.893262174783189, "grad_norm": 2.916531562805176, "step": 14670 }, { "epoch": 4.893262174783189, "learning_rate": 0.00010380930441551692, "step": 14670 }, { "epoch": 4.893262174783189, "loss": 0.9963822364807129, "step": 14670 }, { "ce_loss": 0.1517850160598755, "epoch": 4.893262174783189, "step": 14670 }, { "distill_loss": 0.11565428227186203, "epoch": 4.893262174783189, "step": 14670 }, { "epoch": 4.893262174783189, "ref_ce_loss": 0.07845375686883926, "step": 14670 }, { "epoch": 4.893262174783189, "loss": 0.36983203887939453, "step": 14670 }, { "ce_loss": 0.07163172960281372, "epoch": 4.893262174783189, "step": 14670 }, { "distill_loss": 0.1265677660703659, "epoch": 4.893262174783189, "step": 14670 }, { "epoch": 4.893262174783189, "ref_ce_loss": 0.08316890150308609, "step": 14670 }, { "epoch": 4.893262174783189, "loss": 0.2734651267528534, "step": 14670 }, { "ce_loss": 0.08729465305805206, "epoch": 4.893262174783189, "step": 14670 }, { "distill_loss": 0.10305607318878174, "epoch": 4.893262174783189, "step": 14670 }, { "epoch": 4.893262174783189, "ref_ce_loss": 0.06024034693837166, "step": 14670 }, { "epoch": 4.893262174783189, "loss": 0.3107224404811859, "step": 14670 }, { "ce_loss": 0.0668327808380127, "epoch": 4.893262174783189, "step": 14670 }, { "distill_loss": 0.1244446337223053, "epoch": 4.893262174783189, "step": 14670 }, { "epoch": 4.893262174783189, "ref_ce_loss": 0.05027075111865997, "step": 14670 }, { "epoch": 4.896597731821214, "loss": 0.4134, "step": 14680 }, { "epoch": 4.896597731821214, "grad_norm": 3.499980926513672, "step": 14680 }, { "epoch": 4.896597731821214, "learning_rate": 0.00010361662834792541, "step": 14680 }, { "epoch": 4.896597731821214, "loss": 0.6011641025543213, "step": 14680 }, { "ce_loss": 0.06787215173244476, "epoch": 4.896597731821214, "step": 14680 }, { "distill_loss": 0.10903525352478027, "epoch": 4.896597731821214, "step": 14680 }, { "epoch": 4.896597731821214, "ref_ce_loss": 0.07811160385608673, "step": 14680 }, { "epoch": 4.896597731821214, "loss": 0.3264457583427429, "step": 14680 }, { "ce_loss": 0.11346080899238586, "epoch": 4.896597731821214, "step": 14680 }, { "distill_loss": 0.08637680113315582, "epoch": 4.896597731821214, "step": 14680 }, { "epoch": 4.896597731821214, "ref_ce_loss": 0.09686467051506042, "step": 14680 }, { "epoch": 4.896597731821214, "loss": 0.577754020690918, "step": 14680 }, { "ce_loss": 0.10299709439277649, "epoch": 4.896597731821214, "step": 14680 }, { "distill_loss": 0.12928466498851776, "epoch": 4.896597731821214, "step": 14680 }, { "epoch": 4.896597731821214, "ref_ce_loss": 0.15525569021701813, "step": 14680 }, { "epoch": 4.896597731821214, "loss": 0.3560793995857239, "step": 14680 }, { "ce_loss": 0.11099941283464432, "epoch": 4.896597731821214, "step": 14680 }, { "distill_loss": 0.11336791515350342, "epoch": 4.896597731821214, "step": 14680 }, { "epoch": 4.896597731821214, "ref_ce_loss": 0.09310568124055862, "step": 14680 }, { "epoch": 4.8999332888592395, "loss": 0.4816, "step": 14690 }, { "epoch": 4.8999332888592395, "grad_norm": 2.3682327270507812, "step": 14690 }, { "epoch": 4.8999332888592395, "learning_rate": 0.00010342403686530702, "step": 14690 }, { "epoch": 4.8999332888592395, "loss": 0.3210378587245941, "step": 14690 }, { "ce_loss": 0.09722641110420227, "epoch": 4.8999332888592395, "step": 14690 }, { "distill_loss": 0.10519842803478241, "epoch": 4.8999332888592395, "step": 14690 }, { "epoch": 4.8999332888592395, "ref_ce_loss": 0.11840209364891052, "step": 14690 }, { "epoch": 4.8999332888592395, "loss": 0.33558833599090576, "step": 14690 }, { "ce_loss": 0.10728945583105087, "epoch": 4.8999332888592395, "step": 14690 }, { "distill_loss": 0.1204787865281105, "epoch": 4.8999332888592395, "step": 14690 }, { "epoch": 4.8999332888592395, "ref_ce_loss": 0.10764899104833603, "step": 14690 }, { "epoch": 4.8999332888592395, "loss": 0.369645893573761, "step": 14690 }, { "ce_loss": 0.0854637622833252, "epoch": 4.8999332888592395, "step": 14690 }, { "distill_loss": 0.0883425697684288, "epoch": 4.8999332888592395, "step": 14690 }, { "epoch": 4.8999332888592395, "ref_ce_loss": 0.07716590911149979, "step": 14690 }, { "epoch": 4.8999332888592395, "loss": 0.5961611866950989, "step": 14690 }, { "ce_loss": 0.15953610837459564, "epoch": 4.8999332888592395, "step": 14690 }, { "distill_loss": 0.14236530661582947, "epoch": 4.8999332888592395, "step": 14690 }, { "epoch": 4.8999332888592395, "ref_ce_loss": 0.1169867068529129, "step": 14690 }, { "epoch": 4.903268845897265, "loss": 0.4148, "step": 14700 }, { "epoch": 4.903268845897265, "grad_norm": 3.376802444458008, "step": 14700 }, { "epoch": 4.903268845897265, "learning_rate": 0.00010323153031887267, "step": 14700 }, { "epoch": 4.903268845897265, "loss": 0.34086325764656067, "step": 14700 }, { "ce_loss": 0.09868812561035156, "epoch": 4.903268845897265, "step": 14700 }, { "distill_loss": 0.09084837138652802, "epoch": 4.903268845897265, "step": 14700 }, { "epoch": 4.903268845897265, "ref_ce_loss": 0.07390137016773224, "step": 14700 }, { "epoch": 4.903268845897265, "loss": 0.4819284975528717, "step": 14700 }, { "ce_loss": 0.13006484508514404, "epoch": 4.903268845897265, "step": 14700 }, { "distill_loss": 0.1349356472492218, "epoch": 4.903268845897265, "step": 14700 }, { "epoch": 4.903268845897265, "ref_ce_loss": 0.11490657180547714, "step": 14700 }, { "epoch": 4.903268845897265, "loss": 0.7943411469459534, "step": 14700 }, { "ce_loss": 0.1040106862783432, "epoch": 4.903268845897265, "step": 14700 }, { "distill_loss": 0.11534878611564636, "epoch": 4.903268845897265, "step": 14700 }, { "epoch": 4.903268845897265, "ref_ce_loss": 0.11220097541809082, "step": 14700 }, { "epoch": 4.903268845897265, "loss": 0.3427712619304657, "step": 14700 }, { "ce_loss": 0.11044802516698837, "epoch": 4.903268845897265, "step": 14700 }, { "distill_loss": 0.12651844322681427, "epoch": 4.903268845897265, "step": 14700 }, { "epoch": 4.903268845897265, "ref_ce_loss": 0.10573562979698181, "step": 14700 }, { "epoch": 4.90660440293529, "loss": 0.4514, "step": 14710 }, { "epoch": 4.90660440293529, "grad_norm": 2.766533851623535, "step": 14710 }, { "epoch": 4.90660440293529, "learning_rate": 0.0001030391090596784, "step": 14710 }, { "epoch": 4.90660440293529, "loss": 0.2449745088815689, "step": 14710 }, { "ce_loss": 0.0558619424700737, "epoch": 4.90660440293529, "step": 14710 }, { "distill_loss": 0.10561040788888931, "epoch": 4.90660440293529, "step": 14710 }, { "epoch": 4.90660440293529, "ref_ce_loss": 0.08319824934005737, "step": 14710 }, { "epoch": 4.90660440293529, "loss": 0.3997700810432434, "step": 14710 }, { "ce_loss": 0.13185864686965942, "epoch": 4.90660440293529, "step": 14710 }, { "distill_loss": 0.11414431780576706, "epoch": 4.90660440293529, "step": 14710 }, { "epoch": 4.90660440293529, "ref_ce_loss": 0.07824277877807617, "step": 14710 }, { "epoch": 4.90660440293529, "loss": 0.6474665403366089, "step": 14710 }, { "ce_loss": 0.14329788088798523, "epoch": 4.90660440293529, "step": 14710 }, { "distill_loss": 0.12736766040325165, "epoch": 4.90660440293529, "step": 14710 }, { "epoch": 4.90660440293529, "ref_ce_loss": 0.07725608348846436, "step": 14710 }, { "epoch": 4.90660440293529, "loss": 0.2920524775981903, "step": 14710 }, { "ce_loss": 0.11028847098350525, "epoch": 4.90660440293529, "step": 14710 }, { "distill_loss": 0.10173401981592178, "epoch": 4.90660440293529, "step": 14710 }, { "epoch": 4.90660440293529, "ref_ce_loss": 0.06018834933638573, "step": 14710 }, { "epoch": 4.909939959973316, "loss": 0.4032, "step": 14720 }, { "epoch": 4.909939959973316, "grad_norm": 2.0447373390197754, "step": 14720 }, { "epoch": 4.909939959973316, "learning_rate": 0.00010284677343862461, "step": 14720 }, { "epoch": 4.909939959973316, "loss": 0.5206084251403809, "step": 14720 }, { "ce_loss": 0.11765378713607788, "epoch": 4.909939959973316, "step": 14720 }, { "distill_loss": 0.11574839055538177, "epoch": 4.909939959973316, "step": 14720 }, { "epoch": 4.909939959973316, "ref_ce_loss": 0.09627361595630646, "step": 14720 }, { "epoch": 4.909939959973316, "loss": 0.26153698563575745, "step": 14720 }, { "ce_loss": 0.056028977036476135, "epoch": 4.909939959973316, "step": 14720 }, { "distill_loss": 0.12160011380910873, "epoch": 4.909939959973316, "step": 14720 }, { "epoch": 4.909939959973316, "ref_ce_loss": 0.062378816306591034, "step": 14720 }, { "epoch": 4.909939959973316, "loss": 0.40411004424095154, "step": 14720 }, { "ce_loss": 0.09394532442092896, "epoch": 4.909939959973316, "step": 14720 }, { "distill_loss": 0.12955696880817413, "epoch": 4.909939959973316, "step": 14720 }, { "epoch": 4.909939959973316, "ref_ce_loss": 0.11500399559736252, "step": 14720 }, { "epoch": 4.909939959973316, "loss": 0.8402743339538574, "step": 14720 }, { "ce_loss": 0.08294497430324554, "epoch": 4.909939959973316, "step": 14720 }, { "distill_loss": 0.10579205304384232, "epoch": 4.909939959973316, "step": 14720 }, { "epoch": 4.909939959973316, "ref_ce_loss": 0.0730375424027443, "step": 14720 }, { "epoch": 4.913275517011341, "loss": 0.375, "step": 14730 }, { "epoch": 4.913275517011341, "grad_norm": 2.3185336589813232, "step": 14730 }, { "epoch": 4.913275517011341, "learning_rate": 0.0001026545238064557, "step": 14730 }, { "epoch": 4.913275517011341, "loss": 0.31869810819625854, "step": 14730 }, { "ce_loss": 0.06146164610981941, "epoch": 4.913275517011341, "step": 14730 }, { "distill_loss": 0.10617264360189438, "epoch": 4.913275517011341, "step": 14730 }, { "epoch": 4.913275517011341, "ref_ce_loss": 0.11182975023984909, "step": 14730 }, { "epoch": 4.913275517011341, "loss": 0.5334634184837341, "step": 14730 }, { "ce_loss": 0.09403307735919952, "epoch": 4.913275517011341, "step": 14730 }, { "distill_loss": 0.105626180768013, "epoch": 4.913275517011341, "step": 14730 }, { "epoch": 4.913275517011341, "ref_ce_loss": 0.09046263247728348, "step": 14730 }, { "epoch": 4.913275517011341, "loss": 0.4704914391040802, "step": 14730 }, { "ce_loss": 0.14943642914295197, "epoch": 4.913275517011341, "step": 14730 }, { "distill_loss": 0.12391936779022217, "epoch": 4.913275517011341, "step": 14730 }, { "epoch": 4.913275517011341, "ref_ce_loss": 0.08655329793691635, "step": 14730 }, { "epoch": 4.913275517011341, "loss": 0.8025875091552734, "step": 14730 }, { "ce_loss": 0.19269277155399323, "epoch": 4.913275517011341, "step": 14730 }, { "distill_loss": 0.1386529505252838, "epoch": 4.913275517011341, "step": 14730 }, { "epoch": 4.913275517011341, "ref_ce_loss": 0.11787714064121246, "step": 14730 }, { "epoch": 4.916611074049366, "loss": 0.3848, "step": 14740 }, { "epoch": 4.916611074049366, "grad_norm": 1.9901200532913208, "step": 14740 }, { "epoch": 4.916611074049366, "learning_rate": 0.00010246236051375899, "step": 14740 }, { "epoch": 4.916611074049366, "loss": 0.35475262999534607, "step": 14740 }, { "ce_loss": 0.1351785510778427, "epoch": 4.916611074049366, "step": 14740 }, { "distill_loss": 0.10362657904624939, "epoch": 4.916611074049366, "step": 14740 }, { "epoch": 4.916611074049366, "ref_ce_loss": 0.11586165428161621, "step": 14740 }, { "epoch": 4.916611074049366, "loss": 0.3570358157157898, "step": 14740 }, { "ce_loss": 0.06367281079292297, "epoch": 4.916611074049366, "step": 14740 }, { "distill_loss": 0.10224097222089767, "epoch": 4.916611074049366, "step": 14740 }, { "epoch": 4.916611074049366, "ref_ce_loss": 0.09009752422571182, "step": 14740 }, { "epoch": 4.916611074049366, "loss": 0.25970083475112915, "step": 14740 }, { "ce_loss": 0.08381228893995285, "epoch": 4.916611074049366, "step": 14740 }, { "distill_loss": 0.08009132742881775, "epoch": 4.916611074049366, "step": 14740 }, { "epoch": 4.916611074049366, "ref_ce_loss": 0.09562067687511444, "step": 14740 }, { "epoch": 4.916611074049366, "loss": 0.31035858392715454, "step": 14740 }, { "ce_loss": 0.03239971399307251, "epoch": 4.916611074049366, "step": 14740 }, { "distill_loss": 0.0916653499007225, "epoch": 4.916611074049366, "step": 14740 }, { "epoch": 4.916611074049366, "ref_ce_loss": 0.07490362226963043, "step": 14740 }, { "epoch": 4.919946631087392, "loss": 0.3686, "step": 14750 }, { "epoch": 4.919946631087392, "grad_norm": 1.433788776397705, "step": 14750 }, { "epoch": 4.919946631087392, "learning_rate": 0.00010227028391096469, "step": 14750 }, { "epoch": 4.919946631087392, "loss": 0.3197961151599884, "step": 14750 }, { "ce_loss": 0.13021771609783173, "epoch": 4.919946631087392, "step": 14750 }, { "distill_loss": 0.1070857048034668, "epoch": 4.919946631087392, "step": 14750 }, { "epoch": 4.919946631087392, "ref_ce_loss": 0.0823674276471138, "step": 14750 }, { "epoch": 4.919946631087392, "loss": 0.4557018280029297, "step": 14750 }, { "ce_loss": 0.17964871227741241, "epoch": 4.919946631087392, "step": 14750 }, { "distill_loss": 0.14461633563041687, "epoch": 4.919946631087392, "step": 14750 }, { "epoch": 4.919946631087392, "ref_ce_loss": 0.09955228120088577, "step": 14750 }, { "epoch": 4.919946631087392, "loss": 0.3339523375034332, "step": 14750 }, { "ce_loss": 0.07346391677856445, "epoch": 4.919946631087392, "step": 14750 }, { "distill_loss": 0.11796528100967407, "epoch": 4.919946631087392, "step": 14750 }, { "epoch": 4.919946631087392, "ref_ce_loss": 0.08985766023397446, "step": 14750 }, { "epoch": 4.919946631087392, "loss": 0.24510349333286285, "step": 14750 }, { "ce_loss": 0.04468422010540962, "epoch": 4.919946631087392, "step": 14750 }, { "distill_loss": 0.09657389670610428, "epoch": 4.919946631087392, "step": 14750 }, { "epoch": 4.919946631087392, "ref_ce_loss": 0.08417684584856033, "step": 14750 }, { "epoch": 4.923282188125417, "loss": 0.388, "step": 14760 }, { "epoch": 4.923282188125417, "grad_norm": 2.640810012817383, "step": 14760 }, { "epoch": 4.923282188125417, "learning_rate": 0.00010207829434834476, "step": 14760 }, { "epoch": 4.923282188125417, "loss": 0.5349435806274414, "step": 14760 }, { "ce_loss": 0.09291098266839981, "epoch": 4.923282188125417, "step": 14760 }, { "distill_loss": 0.14082714915275574, "epoch": 4.923282188125417, "step": 14760 }, { "epoch": 4.923282188125417, "ref_ce_loss": 0.1280236840248108, "step": 14760 }, { "epoch": 4.923282188125417, "loss": 0.6028576493263245, "step": 14760 }, { "ce_loss": 0.08771508932113647, "epoch": 4.923282188125417, "step": 14760 }, { "distill_loss": 0.097988560795784, "epoch": 4.923282188125417, "step": 14760 }, { "epoch": 4.923282188125417, "ref_ce_loss": 0.09704327583312988, "step": 14760 }, { "epoch": 4.923282188125417, "loss": 0.3179771900177002, "step": 14760 }, { "ce_loss": 0.08840013295412064, "epoch": 4.923282188125417, "step": 14760 }, { "distill_loss": 0.11706671863794327, "epoch": 4.923282188125417, "step": 14760 }, { "epoch": 4.923282188125417, "ref_ce_loss": 0.08572734147310257, "step": 14760 }, { "epoch": 4.923282188125417, "loss": 0.27515414357185364, "step": 14760 }, { "ce_loss": 0.07249299436807632, "epoch": 4.923282188125417, "step": 14760 }, { "distill_loss": 0.10312556475400925, "epoch": 4.923282188125417, "step": 14760 }, { "epoch": 4.923282188125417, "ref_ce_loss": 0.06152420863509178, "step": 14760 }, { "epoch": 4.926617745163442, "loss": 0.4136, "step": 14770 }, { "epoch": 4.926617745163442, "grad_norm": 3.415719509124756, "step": 14770 }, { "epoch": 4.926617745163442, "learning_rate": 0.00010188639217601227, "step": 14770 }, { "epoch": 4.926617745163442, "loss": 0.3267880976200104, "step": 14770 }, { "ce_loss": 0.0738874077796936, "epoch": 4.926617745163442, "step": 14770 }, { "distill_loss": 0.13988655805587769, "epoch": 4.926617745163442, "step": 14770 }, { "epoch": 4.926617745163442, "ref_ce_loss": 0.0890965461730957, "step": 14770 }, { "epoch": 4.926617745163442, "loss": 0.607414960861206, "step": 14770 }, { "ce_loss": 0.05598186329007149, "epoch": 4.926617745163442, "step": 14770 }, { "distill_loss": 0.1434912234544754, "epoch": 4.926617745163442, "step": 14770 }, { "epoch": 4.926617745163442, "ref_ce_loss": 0.0997924655675888, "step": 14770 }, { "epoch": 4.926617745163442, "loss": 0.3053884506225586, "step": 14770 }, { "ce_loss": 0.08139229565858841, "epoch": 4.926617745163442, "step": 14770 }, { "distill_loss": 0.10093171149492264, "epoch": 4.926617745163442, "step": 14770 }, { "epoch": 4.926617745163442, "ref_ce_loss": 0.12291386723518372, "step": 14770 }, { "epoch": 4.926617745163442, "loss": 0.3276132047176361, "step": 14770 }, { "ce_loss": 0.0874326154589653, "epoch": 4.926617745163442, "step": 14770 }, { "distill_loss": 0.14592541754245758, "epoch": 4.926617745163442, "step": 14770 }, { "epoch": 4.926617745163442, "ref_ce_loss": 0.0702369436621666, "step": 14770 }, { "epoch": 4.929953302201468, "loss": 0.4035, "step": 14780 }, { "epoch": 4.929953302201468, "grad_norm": 2.4359805583953857, "step": 14780 }, { "epoch": 4.929953302201468, "learning_rate": 0.00010169457774392122, "step": 14780 }, { "epoch": 4.929953302201468, "loss": 0.3487001061439514, "step": 14780 }, { "ce_loss": 0.09250488132238388, "epoch": 4.929953302201468, "step": 14780 }, { "distill_loss": 0.11125314980745316, "epoch": 4.929953302201468, "step": 14780 }, { "epoch": 4.929953302201468, "ref_ce_loss": 0.09922449290752411, "step": 14780 }, { "epoch": 4.929953302201468, "loss": 0.40673667192459106, "step": 14780 }, { "ce_loss": 0.10500854253768921, "epoch": 4.929953302201468, "step": 14780 }, { "distill_loss": 0.11264103651046753, "epoch": 4.929953302201468, "step": 14780 }, { "epoch": 4.929953302201468, "ref_ce_loss": 0.0795854777097702, "step": 14780 }, { "epoch": 4.929953302201468, "loss": 0.32172906398773193, "step": 14780 }, { "ce_loss": 0.1017569825053215, "epoch": 4.929953302201468, "step": 14780 }, { "distill_loss": 0.099338598549366, "epoch": 4.929953302201468, "step": 14780 }, { "epoch": 4.929953302201468, "ref_ce_loss": 0.09416161477565765, "step": 14780 }, { "epoch": 4.929953302201468, "loss": 0.3082839250564575, "step": 14780 }, { "ce_loss": 0.08893798291683197, "epoch": 4.929953302201468, "step": 14780 }, { "distill_loss": 0.12213946878910065, "epoch": 4.929953302201468, "step": 14780 }, { "epoch": 4.929953302201468, "ref_ce_loss": 0.07671026140451431, "step": 14780 }, { "epoch": 4.933288859239493, "loss": 0.3804, "step": 14790 }, { "epoch": 4.933288859239493, "grad_norm": 3.638474941253662, "step": 14790 }, { "epoch": 4.933288859239493, "learning_rate": 0.00010150285140186546, "step": 14790 }, { "epoch": 4.933288859239493, "loss": 0.41449302434921265, "step": 14790 }, { "ce_loss": 0.04984421283006668, "epoch": 4.933288859239493, "step": 14790 }, { "distill_loss": 0.09936082363128662, "epoch": 4.933288859239493, "step": 14790 }, { "epoch": 4.933288859239493, "ref_ce_loss": 0.1158912256360054, "step": 14790 }, { "epoch": 4.933288859239493, "loss": 0.5383901000022888, "step": 14790 }, { "ce_loss": 0.23258216679096222, "epoch": 4.933288859239493, "step": 14790 }, { "distill_loss": 0.1699237823486328, "epoch": 4.933288859239493, "step": 14790 }, { "epoch": 4.933288859239493, "ref_ce_loss": 0.10595542937517166, "step": 14790 }, { "epoch": 4.933288859239493, "loss": 0.3543599247932434, "step": 14790 }, { "ce_loss": 0.14072558283805847, "epoch": 4.933288859239493, "step": 14790 }, { "distill_loss": 0.11986932903528214, "epoch": 4.933288859239493, "step": 14790 }, { "epoch": 4.933288859239493, "ref_ce_loss": 0.0930936187505722, "step": 14790 }, { "epoch": 4.933288859239493, "loss": 0.31760847568511963, "step": 14790 }, { "ce_loss": 0.0677962526679039, "epoch": 4.933288859239493, "step": 14790 }, { "distill_loss": 0.09178701788187027, "epoch": 4.933288859239493, "step": 14790 }, { "epoch": 4.933288859239493, "ref_ce_loss": 0.11888806521892548, "step": 14790 }, { "epoch": 4.936624416277518, "loss": 0.3989, "step": 14800 }, { "epoch": 4.936624416277518, "grad_norm": 5.042342662811279, "step": 14800 }, { "epoch": 4.936624416277518, "learning_rate": 0.00010131121349947811, "step": 14800 }, { "epoch": 4.936624416277518, "loss": 0.5192350745201111, "step": 14800 }, { "ce_loss": 0.22760631144046783, "epoch": 4.936624416277518, "step": 14800 }, { "distill_loss": 0.13159260153770447, "epoch": 4.936624416277518, "step": 14800 }, { "epoch": 4.936624416277518, "ref_ce_loss": 0.12293851375579834, "step": 14800 }, { "epoch": 4.936624416277518, "loss": 0.3765476942062378, "step": 14800 }, { "ce_loss": 0.08823811262845993, "epoch": 4.936624416277518, "step": 14800 }, { "distill_loss": 0.09266003221273422, "epoch": 4.936624416277518, "step": 14800 }, { "epoch": 4.936624416277518, "ref_ce_loss": 0.08648016303777695, "step": 14800 }, { "epoch": 4.936624416277518, "loss": 0.44473010301589966, "step": 14800 }, { "ce_loss": 0.1467081606388092, "epoch": 4.936624416277518, "step": 14800 }, { "distill_loss": 0.12626172602176666, "epoch": 4.936624416277518, "step": 14800 }, { "epoch": 4.936624416277518, "ref_ce_loss": 0.11209473758935928, "step": 14800 }, { "epoch": 4.936624416277518, "loss": 0.2532784938812256, "step": 14800 }, { "ce_loss": 0.08552855998277664, "epoch": 4.936624416277518, "step": 14800 }, { "distill_loss": 0.09357450902462006, "epoch": 4.936624416277518, "step": 14800 }, { "epoch": 4.936624416277518, "ref_ce_loss": 0.07390519231557846, "step": 14800 }, { "epoch": 4.939959973315544, "loss": 0.3884, "step": 14810 }, { "epoch": 4.939959973315544, "grad_norm": 2.153294801712036, "step": 14810 }, { "epoch": 4.939959973315544, "learning_rate": 0.00010111966438623127, "step": 14810 }, { "epoch": 4.939959973315544, "loss": 0.2643769681453705, "step": 14810 }, { "ce_loss": 0.0686085894703865, "epoch": 4.939959973315544, "step": 14810 }, { "distill_loss": 0.09371940046548843, "epoch": 4.939959973315544, "step": 14810 }, { "epoch": 4.939959973315544, "ref_ce_loss": 0.07681851089000702, "step": 14810 }, { "epoch": 4.939959973315544, "loss": 0.5107899904251099, "step": 14810 }, { "ce_loss": 0.058815546333789825, "epoch": 4.939959973315544, "step": 14810 }, { "distill_loss": 0.08406341820955276, "epoch": 4.939959973315544, "step": 14810 }, { "epoch": 4.939959973315544, "ref_ce_loss": 0.09192143380641937, "step": 14810 }, { "epoch": 4.939959973315544, "loss": 0.436542809009552, "step": 14810 }, { "ce_loss": 0.1103186309337616, "epoch": 4.939959973315544, "step": 14810 }, { "distill_loss": 0.10437675565481186, "epoch": 4.939959973315544, "step": 14810 }, { "epoch": 4.939959973315544, "ref_ce_loss": 0.11292348057031631, "step": 14810 }, { "epoch": 4.939959973315544, "loss": 0.6722804307937622, "step": 14810 }, { "ce_loss": 0.1129743903875351, "epoch": 4.939959973315544, "step": 14810 }, { "distill_loss": 0.13765385746955872, "epoch": 4.939959973315544, "step": 14810 }, { "epoch": 4.939959973315544, "ref_ce_loss": 0.13108649849891663, "step": 14810 }, { "epoch": 4.943295530353569, "loss": 0.4369, "step": 14820 }, { "epoch": 4.943295530353569, "grad_norm": 3.259122133255005, "step": 14820 }, { "epoch": 4.943295530353569, "learning_rate": 0.00010092820441143482, "step": 14820 }, { "epoch": 4.943295530353569, "loss": 0.2509721517562866, "step": 14820 }, { "ce_loss": 0.07367528975009918, "epoch": 4.943295530353569, "step": 14820 }, { "distill_loss": 0.0924886018037796, "epoch": 4.943295530353569, "step": 14820 }, { "epoch": 4.943295530353569, "ref_ce_loss": 0.08422941714525223, "step": 14820 }, { "epoch": 4.943295530353569, "loss": 0.3739933669567108, "step": 14820 }, { "ce_loss": 0.14804257452487946, "epoch": 4.943295530353569, "step": 14820 }, { "distill_loss": 0.10976754873991013, "epoch": 4.943295530353569, "step": 14820 }, { "epoch": 4.943295530353569, "ref_ce_loss": 0.11567100137472153, "step": 14820 }, { "epoch": 4.943295530353569, "loss": 0.4644673466682434, "step": 14820 }, { "ce_loss": 0.14605847001075745, "epoch": 4.943295530353569, "step": 14820 }, { "distill_loss": 0.08814170211553574, "epoch": 4.943295530353569, "step": 14820 }, { "epoch": 4.943295530353569, "ref_ce_loss": 0.09028080850839615, "step": 14820 }, { "epoch": 4.943295530353569, "loss": 0.42085397243499756, "step": 14820 }, { "ce_loss": 0.15428964793682098, "epoch": 4.943295530353569, "step": 14820 }, { "distill_loss": 0.12081962078809738, "epoch": 4.943295530353569, "step": 14820 }, { "epoch": 4.943295530353569, "ref_ce_loss": 0.09920281916856766, "step": 14820 }, { "epoch": 4.946631087391594, "loss": 0.3485, "step": 14830 }, { "epoch": 4.946631087391594, "grad_norm": 2.371697187423706, "step": 14830 }, { "epoch": 4.946631087391594, "learning_rate": 0.00010073683392423623, "step": 14830 }, { "epoch": 4.946631087391594, "loss": 0.19433481991291046, "step": 14830 }, { "ce_loss": 0.034791141748428345, "epoch": 4.946631087391594, "step": 14830 }, { "distill_loss": 0.09125552326440811, "epoch": 4.946631087391594, "step": 14830 }, { "epoch": 4.946631087391594, "ref_ce_loss": 0.06818924844264984, "step": 14830 }, { "epoch": 4.946631087391594, "loss": 0.31020668148994446, "step": 14830 }, { "ce_loss": 0.09354670345783234, "epoch": 4.946631087391594, "step": 14830 }, { "distill_loss": 0.10771866887807846, "epoch": 4.946631087391594, "step": 14830 }, { "epoch": 4.946631087391594, "ref_ce_loss": 0.07463540136814117, "step": 14830 }, { "epoch": 4.946631087391594, "loss": 0.5806359648704529, "step": 14830 }, { "ce_loss": 0.14421100914478302, "epoch": 4.946631087391594, "step": 14830 }, { "distill_loss": 0.11986065655946732, "epoch": 4.946631087391594, "step": 14830 }, { "epoch": 4.946631087391594, "ref_ce_loss": 0.11439379304647446, "step": 14830 }, { "epoch": 4.946631087391594, "loss": 0.21567516028881073, "step": 14830 }, { "ce_loss": 0.028816038742661476, "epoch": 4.946631087391594, "step": 14830 }, { "distill_loss": 0.10529807209968567, "epoch": 4.946631087391594, "step": 14830 }, { "epoch": 4.946631087391594, "ref_ce_loss": 0.0813959464430809, "step": 14830 }, { "epoch": 4.94996664442962, "loss": 0.3416, "step": 14840 }, { "epoch": 4.94996664442962, "grad_norm": 4.2593994140625, "step": 14840 }, { "epoch": 4.94996664442962, "learning_rate": 0.00010054555327361993, "step": 14840 }, { "epoch": 4.94996664442962, "loss": 0.19674266874790192, "step": 14840 }, { "ce_loss": 0.033592235296964645, "epoch": 4.94996664442962, "step": 14840 }, { "distill_loss": 0.09351490437984467, "epoch": 4.94996664442962, "step": 14840 }, { "epoch": 4.94996664442962, "ref_ce_loss": 0.06934471428394318, "step": 14840 }, { "epoch": 4.94996664442962, "loss": 0.48189353942871094, "step": 14840 }, { "ce_loss": 0.10756346583366394, "epoch": 4.94996664442962, "step": 14840 }, { "distill_loss": 0.10618402063846588, "epoch": 4.94996664442962, "step": 14840 }, { "epoch": 4.94996664442962, "ref_ce_loss": 0.1061076894402504, "step": 14840 }, { "epoch": 4.94996664442962, "loss": 0.4971357583999634, "step": 14840 }, { "ce_loss": 0.13811559975147247, "epoch": 4.94996664442962, "step": 14840 }, { "distill_loss": 0.12884469330310822, "epoch": 4.94996664442962, "step": 14840 }, { "epoch": 4.94996664442962, "ref_ce_loss": 0.11955190449953079, "step": 14840 }, { "epoch": 4.94996664442962, "loss": 0.6511322855949402, "step": 14840 }, { "ce_loss": 0.184445321559906, "epoch": 4.94996664442962, "step": 14840 }, { "distill_loss": 0.15705184638500214, "epoch": 4.94996664442962, "step": 14840 }, { "epoch": 4.94996664442962, "ref_ce_loss": 0.15733648836612701, "step": 14840 }, { "epoch": 4.953302201467645, "loss": 0.4032, "step": 14850 }, { "epoch": 4.953302201467645, "grad_norm": 3.079538106918335, "step": 14850 }, { "epoch": 4.953302201467645, "learning_rate": 0.00010035436280840621, "step": 14850 }, { "epoch": 4.953302201467645, "loss": 0.2901090383529663, "step": 14850 }, { "ce_loss": 0.03799564763903618, "epoch": 4.953302201467645, "step": 14850 }, { "distill_loss": 0.10415848344564438, "epoch": 4.953302201467645, "step": 14850 }, { "epoch": 4.953302201467645, "ref_ce_loss": 0.09406749904155731, "step": 14850 }, { "epoch": 4.953302201467645, "loss": 0.3220950961112976, "step": 14850 }, { "ce_loss": 0.11926550418138504, "epoch": 4.953302201467645, "step": 14850 }, { "distill_loss": 0.11953283846378326, "epoch": 4.953302201467645, "step": 14850 }, { "epoch": 4.953302201467645, "ref_ce_loss": 0.08315497636795044, "step": 14850 }, { "epoch": 4.953302201467645, "loss": 0.4622819125652313, "step": 14850 }, { "ce_loss": 0.1031547486782074, "epoch": 4.953302201467645, "step": 14850 }, { "distill_loss": 0.09223375469446182, "epoch": 4.953302201467645, "step": 14850 }, { "epoch": 4.953302201467645, "ref_ce_loss": 0.10606669634580612, "step": 14850 }, { "epoch": 4.953302201467645, "loss": 0.33610832691192627, "step": 14850 }, { "ce_loss": 0.06804166734218597, "epoch": 4.953302201467645, "step": 14850 }, { "distill_loss": 0.10635029524564743, "epoch": 4.953302201467645, "step": 14850 }, { "epoch": 4.953302201467645, "ref_ce_loss": 0.07996688038110733, "step": 14850 }, { "epoch": 4.9566377585056705, "loss": 0.381, "step": 14860 }, { "epoch": 4.9566377585056705, "grad_norm": 2.194052219390869, "step": 14860 }, { "epoch": 4.9566377585056705, "learning_rate": 0.00010016326287725116, "step": 14860 }, { "epoch": 4.9566377585056705, "loss": 0.3040575087070465, "step": 14860 }, { "ce_loss": 0.08786673098802567, "epoch": 4.9566377585056705, "step": 14860 }, { "distill_loss": 0.14260633289813995, "epoch": 4.9566377585056705, "step": 14860 }, { "epoch": 4.9566377585056705, "ref_ce_loss": 0.0735347643494606, "step": 14860 }, { "epoch": 4.9566377585056705, "loss": 0.8339674472808838, "step": 14860 }, { "ce_loss": 0.1228819414973259, "epoch": 4.9566377585056705, "step": 14860 }, { "distill_loss": 0.15760846436023712, "epoch": 4.9566377585056705, "step": 14860 }, { "epoch": 4.9566377585056705, "ref_ce_loss": 0.10669492930173874, "step": 14860 }, { "epoch": 4.9566377585056705, "loss": 0.3304751217365265, "step": 14860 }, { "ce_loss": 0.09910108894109726, "epoch": 4.9566377585056705, "step": 14860 }, { "distill_loss": 0.14979346096515656, "epoch": 4.9566377585056705, "step": 14860 }, { "epoch": 4.9566377585056705, "ref_ce_loss": 0.08147692680358887, "step": 14860 }, { "epoch": 4.9566377585056705, "loss": 0.2014983594417572, "step": 14860 }, { "ce_loss": 0.032280270010232925, "epoch": 4.9566377585056705, "step": 14860 }, { "distill_loss": 0.08623968809843063, "epoch": 4.9566377585056705, "step": 14860 }, { "epoch": 4.9566377585056705, "ref_ce_loss": 0.05587891489267349, "step": 14860 }, { "epoch": 4.959973315543696, "loss": 0.3914, "step": 14870 }, { "epoch": 4.959973315543696, "grad_norm": 4.903111934661865, "step": 14870 }, { "epoch": 4.959973315543696, "learning_rate": 9.997225382864559e-05, "step": 14870 }, { "epoch": 4.959973315543696, "loss": 0.2674598693847656, "step": 14870 }, { "ce_loss": 0.056575529277324677, "epoch": 4.959973315543696, "step": 14870 }, { "distill_loss": 0.10776714235544205, "epoch": 4.959973315543696, "step": 14870 }, { "epoch": 4.959973315543696, "ref_ce_loss": 0.057600509375333786, "step": 14870 }, { "epoch": 4.959973315543696, "loss": 0.521746039390564, "step": 14870 }, { "ce_loss": 0.09200442582368851, "epoch": 4.959973315543696, "step": 14870 }, { "distill_loss": 0.13319039344787598, "epoch": 4.959973315543696, "step": 14870 }, { "epoch": 4.959973315543696, "ref_ce_loss": 0.09574585407972336, "step": 14870 }, { "epoch": 4.959973315543696, "loss": 0.33523058891296387, "step": 14870 }, { "ce_loss": 0.06467235833406448, "epoch": 4.959973315543696, "step": 14870 }, { "distill_loss": 0.11361615359783173, "epoch": 4.959973315543696, "step": 14870 }, { "epoch": 4.959973315543696, "ref_ce_loss": 0.09359608590602875, "step": 14870 }, { "epoch": 4.959973315543696, "loss": 0.5538613200187683, "step": 14870 }, { "ce_loss": 0.17402806878089905, "epoch": 4.959973315543696, "step": 14870 }, { "distill_loss": 0.16963981091976166, "epoch": 4.959973315543696, "step": 14870 }, { "epoch": 4.959973315543696, "ref_ce_loss": 0.12446580082178116, "step": 14870 }, { "epoch": 4.963308872581721, "loss": 0.4007, "step": 14880 }, { "epoch": 4.963308872581721, "grad_norm": 2.6322154998779297, "step": 14880 }, { "epoch": 4.963308872581721, "learning_rate": 9.97813360109147e-05, "step": 14880 }, { "epoch": 4.963308872581721, "loss": 0.6043708920478821, "step": 14880 }, { "ce_loss": 0.1652269959449768, "epoch": 4.963308872581721, "step": 14880 }, { "distill_loss": 0.12191038578748703, "epoch": 4.963308872581721, "step": 14880 }, { "epoch": 4.963308872581721, "ref_ce_loss": 0.10965677350759506, "step": 14880 }, { "epoch": 4.963308872581721, "loss": 0.4230524003505707, "step": 14880 }, { "ce_loss": 0.09956231713294983, "epoch": 4.963308872581721, "step": 14880 }, { "distill_loss": 0.1212780550122261, "epoch": 4.963308872581721, "step": 14880 }, { "epoch": 4.963308872581721, "ref_ce_loss": 0.09010248631238937, "step": 14880 }, { "epoch": 4.963308872581721, "loss": 0.713090181350708, "step": 14880 }, { "ce_loss": 0.17262235283851624, "epoch": 4.963308872581721, "step": 14880 }, { "distill_loss": 0.15157324075698853, "epoch": 4.963308872581721, "step": 14880 }, { "epoch": 4.963308872581721, "ref_ce_loss": 0.10012070834636688, "step": 14880 }, { "epoch": 4.963308872581721, "loss": 0.40841516852378845, "step": 14880 }, { "ce_loss": 0.14642496407032013, "epoch": 4.963308872581721, "step": 14880 }, { "distill_loss": 0.11657308787107468, "epoch": 4.963308872581721, "step": 14880 }, { "epoch": 4.963308872581721, "ref_ce_loss": 0.109087735414505, "step": 14880 }, { "epoch": 4.9666444296197465, "loss": 0.4133, "step": 14890 }, { "epoch": 4.9666444296197465, "grad_norm": 1.9244799613952637, "step": 14890 }, { "epoch": 4.9666444296197465, "learning_rate": 9.959050977221732e-05, "step": 14890 }, { "epoch": 4.9666444296197465, "loss": 0.8595991134643555, "step": 14890 }, { "ce_loss": 0.17607881128787994, "epoch": 4.9666444296197465, "step": 14890 }, { "distill_loss": 0.10831549018621445, "epoch": 4.9666444296197465, "step": 14890 }, { "epoch": 4.9666444296197465, "ref_ce_loss": 0.08201535046100616, "step": 14890 }, { "epoch": 4.9666444296197465, "loss": 0.3811852037906647, "step": 14890 }, { "ce_loss": 0.06830538064241409, "epoch": 4.9666444296197465, "step": 14890 }, { "distill_loss": 0.09254138171672821, "epoch": 4.9666444296197465, "step": 14890 }, { "epoch": 4.9666444296197465, "ref_ce_loss": 0.07486068457365036, "step": 14890 }, { "epoch": 4.9666444296197465, "loss": 0.5785902738571167, "step": 14890 }, { "ce_loss": 0.16134631633758545, "epoch": 4.9666444296197465, "step": 14890 }, { "distill_loss": 0.1212388202548027, "epoch": 4.9666444296197465, "step": 14890 }, { "epoch": 4.9666444296197465, "ref_ce_loss": 0.10440519452095032, "step": 14890 }, { "epoch": 4.9666444296197465, "loss": 0.2205401211977005, "step": 14890 }, { "ce_loss": 0.03984503075480461, "epoch": 4.9666444296197465, "step": 14890 }, { "distill_loss": 0.09075970202684402, "epoch": 4.9666444296197465, "step": 14890 }, { "epoch": 4.9666444296197465, "ref_ce_loss": 0.08843724429607391, "step": 14890 }, { "epoch": 4.969979986657772, "loss": 0.3771, "step": 14900 }, { "epoch": 4.969979986657772, "grad_norm": 1.7490321397781372, "step": 14900 }, { "epoch": 4.969979986657772, "learning_rate": 9.939977546054517e-05, "step": 14900 }, { "epoch": 4.969979986657772, "loss": 0.26214566826820374, "step": 14900 }, { "ce_loss": 0.05214730277657509, "epoch": 4.969979986657772, "step": 14900 }, { "distill_loss": 0.11806771904230118, "epoch": 4.969979986657772, "step": 14900 }, { "epoch": 4.969979986657772, "ref_ce_loss": 0.0917915552854538, "step": 14900 }, { "epoch": 4.969979986657772, "loss": 0.4815196990966797, "step": 14900 }, { "ce_loss": 0.08143611997365952, "epoch": 4.969979986657772, "step": 14900 }, { "distill_loss": 0.09976162016391754, "epoch": 4.969979986657772, "step": 14900 }, { "epoch": 4.969979986657772, "ref_ce_loss": 0.08829763531684875, "step": 14900 }, { "epoch": 4.969979986657772, "loss": 0.48579132556915283, "step": 14900 }, { "ce_loss": 0.09691136330366135, "epoch": 4.969979986657772, "step": 14900 }, { "distill_loss": 0.12712033092975616, "epoch": 4.969979986657772, "step": 14900 }, { "epoch": 4.969979986657772, "ref_ce_loss": 0.08161374181509018, "step": 14900 }, { "epoch": 4.969979986657772, "loss": 0.32713285088539124, "step": 14900 }, { "ce_loss": 0.10395216941833496, "epoch": 4.969979986657772, "step": 14900 }, { "distill_loss": 0.11791080236434937, "epoch": 4.969979986657772, "step": 14900 }, { "epoch": 4.969979986657772, "ref_ce_loss": 0.07737979292869568, "step": 14900 }, { "epoch": 4.973315543695797, "loss": 0.3677, "step": 14910 }, { "epoch": 4.973315543695797, "grad_norm": 4.023821830749512, "step": 14910 }, { "epoch": 4.973315543695797, "learning_rate": 9.92091334237224e-05, "step": 14910 }, { "epoch": 4.973315543695797, "loss": 0.17526739835739136, "step": 14910 }, { "ce_loss": 0.0291630607098341, "epoch": 4.973315543695797, "step": 14910 }, { "distill_loss": 0.08895475417375565, "epoch": 4.973315543695797, "step": 14910 }, { "epoch": 4.973315543695797, "ref_ce_loss": 0.056943051517009735, "step": 14910 }, { "epoch": 4.973315543695797, "loss": 0.5921517610549927, "step": 14910 }, { "ce_loss": 0.14450669288635254, "epoch": 4.973315543695797, "step": 14910 }, { "distill_loss": 0.15296611189842224, "epoch": 4.973315543695797, "step": 14910 }, { "epoch": 4.973315543695797, "ref_ce_loss": 0.11224636435508728, "step": 14910 }, { "epoch": 4.973315543695797, "loss": 0.2923111915588379, "step": 14910 }, { "ce_loss": 0.09236502647399902, "epoch": 4.973315543695797, "step": 14910 }, { "distill_loss": 0.11198394745588303, "epoch": 4.973315543695797, "step": 14910 }, { "epoch": 4.973315543695797, "ref_ce_loss": 0.08783869445323944, "step": 14910 }, { "epoch": 4.973315543695797, "loss": 0.45573416352272034, "step": 14910 }, { "ce_loss": 0.18862444162368774, "epoch": 4.973315543695797, "step": 14910 }, { "distill_loss": 0.14855432510375977, "epoch": 4.973315543695797, "step": 14910 }, { "epoch": 4.973315543695797, "ref_ce_loss": 0.11837711930274963, "step": 14910 }, { "epoch": 4.9766511007338226, "loss": 0.3831, "step": 14920 }, { "epoch": 4.9766511007338226, "grad_norm": 1.6999053955078125, "step": 14920 }, { "epoch": 4.9766511007338226, "learning_rate": 9.901858400940496e-05, "step": 14920 }, { "epoch": 4.9766511007338226, "loss": 0.22062714397907257, "step": 14920 }, { "ce_loss": 0.041478704661130905, "epoch": 4.9766511007338226, "step": 14920 }, { "distill_loss": 0.08729581534862518, "epoch": 4.9766511007338226, "step": 14920 }, { "epoch": 4.9766511007338226, "ref_ce_loss": 0.04799078777432442, "step": 14920 }, { "epoch": 4.9766511007338226, "loss": 0.5395979881286621, "step": 14920 }, { "ce_loss": 0.18839100003242493, "epoch": 4.9766511007338226, "step": 14920 }, { "distill_loss": 0.17057166993618011, "epoch": 4.9766511007338226, "step": 14920 }, { "epoch": 4.9766511007338226, "ref_ce_loss": 0.09586435556411743, "step": 14920 }, { "epoch": 4.9766511007338226, "loss": 0.7154274582862854, "step": 14920 }, { "ce_loss": 0.17241571843624115, "epoch": 4.9766511007338226, "step": 14920 }, { "distill_loss": 0.13662411272525787, "epoch": 4.9766511007338226, "step": 14920 }, { "epoch": 4.9766511007338226, "ref_ce_loss": 0.1351361721754074, "step": 14920 }, { "epoch": 4.9766511007338226, "loss": 0.5005620121955872, "step": 14920 }, { "ce_loss": 0.15116122364997864, "epoch": 4.9766511007338226, "step": 14920 }, { "distill_loss": 0.12960541248321533, "epoch": 4.9766511007338226, "step": 14920 }, { "epoch": 4.9766511007338226, "ref_ce_loss": 0.11357004195451736, "step": 14920 }, { "epoch": 4.979986657771848, "loss": 0.4151, "step": 14930 }, { "epoch": 4.979986657771848, "grad_norm": 2.9988279342651367, "step": 14930 }, { "epoch": 4.979986657771848, "learning_rate": 9.88281275650797e-05, "step": 14930 }, { "epoch": 4.979986657771848, "loss": 0.3005850613117218, "step": 14930 }, { "ce_loss": 0.11170001327991486, "epoch": 4.979986657771848, "step": 14930 }, { "distill_loss": 0.12441539764404297, "epoch": 4.979986657771848, "step": 14930 }, { "epoch": 4.979986657771848, "ref_ce_loss": 0.06426378339529037, "step": 14930 }, { "epoch": 4.979986657771848, "loss": 0.4188843071460724, "step": 14930 }, { "ce_loss": 0.10497825592756271, "epoch": 4.979986657771848, "step": 14930 }, { "distill_loss": 0.10721724480390549, "epoch": 4.979986657771848, "step": 14930 }, { "epoch": 4.979986657771848, "ref_ce_loss": 0.07746762037277222, "step": 14930 }, { "epoch": 4.979986657771848, "loss": 0.3971782922744751, "step": 14930 }, { "ce_loss": 0.03395448625087738, "epoch": 4.979986657771848, "step": 14930 }, { "distill_loss": 0.10532855242490768, "epoch": 4.979986657771848, "step": 14930 }, { "epoch": 4.979986657771848, "ref_ce_loss": 0.07476559281349182, "step": 14930 }, { "epoch": 4.979986657771848, "loss": 0.16209197044372559, "step": 14930 }, { "ce_loss": 0.028277037665247917, "epoch": 4.979986657771848, "step": 14930 }, { "distill_loss": 0.06412121653556824, "epoch": 4.979986657771848, "step": 14930 }, { "epoch": 4.979986657771848, "ref_ce_loss": 0.03951544687151909, "step": 14930 }, { "epoch": 4.983322214809873, "loss": 0.3849, "step": 14940 }, { "epoch": 4.983322214809873, "grad_norm": 2.4632937908172607, "step": 14940 }, { "epoch": 4.983322214809873, "learning_rate": 9.863776443806414e-05, "step": 14940 }, { "epoch": 4.983322214809873, "loss": 0.3252128064632416, "step": 14940 }, { "ce_loss": 0.08578341454267502, "epoch": 4.983322214809873, "step": 14940 }, { "distill_loss": 0.10882560163736343, "epoch": 4.983322214809873, "step": 14940 }, { "epoch": 4.983322214809873, "ref_ce_loss": 0.0696091502904892, "step": 14940 }, { "epoch": 4.983322214809873, "loss": 0.4829108417034149, "step": 14940 }, { "ce_loss": 0.17730078101158142, "epoch": 4.983322214809873, "step": 14940 }, { "distill_loss": 0.125204935669899, "epoch": 4.983322214809873, "step": 14940 }, { "epoch": 4.983322214809873, "ref_ce_loss": 0.10830342024564743, "step": 14940 }, { "epoch": 4.983322214809873, "loss": 0.2506943643093109, "step": 14940 }, { "ce_loss": 0.015404731966555119, "epoch": 4.983322214809873, "step": 14940 }, { "distill_loss": 0.11019233614206314, "epoch": 4.983322214809873, "step": 14940 }, { "epoch": 4.983322214809873, "ref_ce_loss": 0.07074945420026779, "step": 14940 }, { "epoch": 4.983322214809873, "loss": 0.3966948986053467, "step": 14940 }, { "ce_loss": 0.08755076676607132, "epoch": 4.983322214809873, "step": 14940 }, { "distill_loss": 0.10180173814296722, "epoch": 4.983322214809873, "step": 14940 }, { "epoch": 4.983322214809873, "ref_ce_loss": 0.07565634697675705, "step": 14940 }, { "epoch": 4.986657771847899, "loss": 0.4103, "step": 14950 }, { "epoch": 4.986657771847899, "grad_norm": 3.3751020431518555, "step": 14950 }, { "epoch": 4.986657771847899, "learning_rate": 9.844749497550549e-05, "step": 14950 }, { "epoch": 4.986657771847899, "loss": 0.38711386919021606, "step": 14950 }, { "ce_loss": 0.12462891638278961, "epoch": 4.986657771847899, "step": 14950 }, { "distill_loss": 0.12855587899684906, "epoch": 4.986657771847899, "step": 14950 }, { "epoch": 4.986657771847899, "ref_ce_loss": 0.10362184047698975, "step": 14950 }, { "epoch": 4.986657771847899, "loss": 0.36301493644714355, "step": 14950 }, { "ce_loss": 0.14747299253940582, "epoch": 4.986657771847899, "step": 14950 }, { "distill_loss": 0.09996884316205978, "epoch": 4.986657771847899, "step": 14950 }, { "epoch": 4.986657771847899, "ref_ce_loss": 0.11533646285533905, "step": 14950 }, { "epoch": 4.986657771847899, "loss": 0.39188283681869507, "step": 14950 }, { "ce_loss": 0.058999788016080856, "epoch": 4.986657771847899, "step": 14950 }, { "distill_loss": 0.12072397768497467, "epoch": 4.986657771847899, "step": 14950 }, { "epoch": 4.986657771847899, "ref_ce_loss": 0.11123237013816833, "step": 14950 }, { "epoch": 4.986657771847899, "loss": 0.285819947719574, "step": 14950 }, { "ce_loss": 0.08039289712905884, "epoch": 4.986657771847899, "step": 14950 }, { "distill_loss": 0.09627498686313629, "epoch": 4.986657771847899, "step": 14950 }, { "epoch": 4.986657771847899, "ref_ce_loss": 0.07560764998197556, "step": 14950 }, { "epoch": 4.989993328885924, "loss": 0.3899, "step": 14960 }, { "epoch": 4.989993328885924, "grad_norm": 3.0399069786071777, "step": 14960 }, { "epoch": 4.989993328885924, "learning_rate": 9.825731952438019e-05, "step": 14960 }, { "epoch": 4.989993328885924, "loss": 0.2565562427043915, "step": 14960 }, { "ce_loss": 0.031080681830644608, "epoch": 4.989993328885924, "step": 14960 }, { "distill_loss": 0.12158703804016113, "epoch": 4.989993328885924, "step": 14960 }, { "epoch": 4.989993328885924, "ref_ce_loss": 0.0760444924235344, "step": 14960 }, { "epoch": 4.989993328885924, "loss": 0.32719331979751587, "step": 14960 }, { "ce_loss": 0.12624545395374298, "epoch": 4.989993328885924, "step": 14960 }, { "distill_loss": 0.08923916518688202, "epoch": 4.989993328885924, "step": 14960 }, { "epoch": 4.989993328885924, "ref_ce_loss": 0.07610367238521576, "step": 14960 }, { "epoch": 4.989993328885924, "loss": 0.3031754493713379, "step": 14960 }, { "ce_loss": 0.102663055062294, "epoch": 4.989993328885924, "step": 14960 }, { "distill_loss": 0.12523791193962097, "epoch": 4.989993328885924, "step": 14960 }, { "epoch": 4.989993328885924, "ref_ce_loss": 0.06037063151597977, "step": 14960 }, { "epoch": 4.989993328885924, "loss": 0.1924704611301422, "step": 14960 }, { "ce_loss": 0.052973054349422455, "epoch": 4.989993328885924, "step": 14960 }, { "distill_loss": 0.08791524171829224, "epoch": 4.989993328885924, "step": 14960 }, { "epoch": 4.989993328885924, "ref_ce_loss": 0.03767343983054161, "step": 14960 }, { "epoch": 4.993328885923949, "loss": 0.4152, "step": 14970 }, { "epoch": 4.993328885923949, "grad_norm": 2.185476064682007, "step": 14970 }, { "epoch": 4.993328885923949, "learning_rate": 9.806723843149328e-05, "step": 14970 }, { "epoch": 4.993328885923949, "loss": 0.5093054175376892, "step": 14970 }, { "ce_loss": 0.2362217903137207, "epoch": 4.993328885923949, "step": 14970 }, { "distill_loss": 0.13822926580905914, "epoch": 4.993328885923949, "step": 14970 }, { "epoch": 4.993328885923949, "ref_ce_loss": 0.1347385048866272, "step": 14970 }, { "epoch": 4.993328885923949, "loss": 0.4155021905899048, "step": 14970 }, { "ce_loss": 0.0930529236793518, "epoch": 4.993328885923949, "step": 14970 }, { "distill_loss": 0.12571507692337036, "epoch": 4.993328885923949, "step": 14970 }, { "epoch": 4.993328885923949, "ref_ce_loss": 0.1019885390996933, "step": 14970 }, { "epoch": 4.993328885923949, "loss": 0.5457403659820557, "step": 14970 }, { "ce_loss": 0.2082226276397705, "epoch": 4.993328885923949, "step": 14970 }, { "distill_loss": 0.14089271426200867, "epoch": 4.993328885923949, "step": 14970 }, { "epoch": 4.993328885923949, "ref_ce_loss": 0.11160488426685333, "step": 14970 }, { "epoch": 4.993328885923949, "loss": 0.2308986932039261, "step": 14970 }, { "ce_loss": 0.05669495090842247, "epoch": 4.993328885923949, "step": 14970 }, { "distill_loss": 0.10594461858272552, "epoch": 4.993328885923949, "step": 14970 }, { "epoch": 4.993328885923949, "ref_ce_loss": 0.046271566301584244, "step": 14970 }, { "epoch": 4.996664442961975, "loss": 0.3631, "step": 14980 }, { "epoch": 4.996664442961975, "grad_norm": 2.700335741043091, "step": 14980 }, { "epoch": 4.996664442961975, "learning_rate": 9.787725204347764e-05, "step": 14980 }, { "epoch": 4.996664442961975, "loss": 0.42493465542793274, "step": 14980 }, { "ce_loss": 0.14343814551830292, "epoch": 4.996664442961975, "step": 14980 }, { "distill_loss": 0.11766191571950912, "epoch": 4.996664442961975, "step": 14980 }, { "epoch": 4.996664442961975, "ref_ce_loss": 0.08248550444841385, "step": 14980 }, { "epoch": 4.996664442961975, "loss": 0.3019244074821472, "step": 14980 }, { "ce_loss": 0.06459664553403854, "epoch": 4.996664442961975, "step": 14980 }, { "distill_loss": 0.07496616989374161, "epoch": 4.996664442961975, "step": 14980 }, { "epoch": 4.996664442961975, "ref_ce_loss": 0.06792156398296356, "step": 14980 }, { "epoch": 4.996664442961975, "loss": 0.2959771156311035, "step": 14980 }, { "ce_loss": 0.050680242478847504, "epoch": 4.996664442961975, "step": 14980 }, { "distill_loss": 0.10879239439964294, "epoch": 4.996664442961975, "step": 14980 }, { "epoch": 4.996664442961975, "ref_ce_loss": 0.08710350841283798, "step": 14980 }, { "epoch": 4.996664442961975, "loss": 0.32662272453308105, "step": 14980 }, { "ce_loss": 0.11703333258628845, "epoch": 4.996664442961975, "step": 14980 }, { "distill_loss": 0.10472206771373749, "epoch": 4.996664442961975, "step": 14980 }, { "epoch": 4.996664442961975, "ref_ce_loss": 0.06959205120801926, "step": 14980 }, { "epoch": 5.0, "loss": 0.3927, "step": 14990 }, { "epoch": 5.0, "grad_norm": 3.773092031478882, "step": 14990 }, { "epoch": 5.0, "learning_rate": 9.768736070679355e-05, "step": 14990 }, { "epoch": 5.0, "loss": 0.24565860629081726, "step": 14990 }, { "ce_loss": 0.04676143452525139, "epoch": 5.0, "step": 14990 }, { "distill_loss": 0.10307568311691284, "epoch": 5.0, "step": 14990 }, { "epoch": 5.0, "ref_ce_loss": 0.09563593566417694, "step": 14990 }, { "epoch": 5.0, "loss": 0.2170976996421814, "step": 14990 }, { "ce_loss": 0.04341982305049896, "epoch": 5.0, "step": 14990 }, { "distill_loss": 0.09917079657316208, "epoch": 5.0, "step": 14990 }, { "epoch": 5.0, "ref_ce_loss": 0.06050785630941391, "step": 14990 }, { "epoch": 5.0, "loss": 0.27760088443756104, "step": 14990 }, { "ce_loss": 0.02374977245926857, "epoch": 5.0, "step": 14990 }, { "distill_loss": 0.0888613685965538, "epoch": 5.0, "step": 14990 }, { "epoch": 5.0, "ref_ce_loss": 0.07210943847894669, "step": 14990 }, { "epoch": 5.0, "loss": 0.3771913945674896, "step": 14990 }, { "ce_loss": 0.13417477905750275, "epoch": 5.0, "step": 14990 }, { "distill_loss": 0.1130586713552475, "epoch": 5.0, "step": 14990 }, { "epoch": 5.0, "ref_ce_loss": 0.08856955170631409, "step": 14990 }, { "epoch": 5.003335557038025, "loss": 0.2995, "step": 15000 }, { "epoch": 5.003335557038025, "grad_norm": 1.9504388570785522, "step": 15000 }, { "epoch": 5.003335557038025, "learning_rate": 9.749756476772786e-05, "step": 15000 }, { "epoch": 5.003335557038025, "loss": 0.2518184185028076, "step": 15000 }, { "ce_loss": 0.027764635160565376, "epoch": 5.003335557038025, "step": 15000 }, { "distill_loss": 0.11909808963537216, "epoch": 5.003335557038025, "step": 15000 }, { "epoch": 5.003335557038025, "ref_ce_loss": 0.06962376087903976, "step": 15000 }, { "epoch": 5.003335557038025, "loss": 0.3267233073711395, "step": 15000 }, { "ce_loss": 0.05888934060931206, "epoch": 5.003335557038025, "step": 15000 }, { "distill_loss": 0.10403020679950714, "epoch": 5.003335557038025, "step": 15000 }, { "epoch": 5.003335557038025, "ref_ce_loss": 0.04899250343441963, "step": 15000 }, { "epoch": 5.003335557038025, "loss": 0.2693495452404022, "step": 15000 }, { "ce_loss": 0.09646335989236832, "epoch": 5.003335557038025, "step": 15000 }, { "distill_loss": 0.09357049316167831, "epoch": 5.003335557038025, "step": 15000 }, { "epoch": 5.003335557038025, "ref_ce_loss": 0.07913413643836975, "step": 15000 }, { "epoch": 5.003335557038025, "loss": 0.2770167589187622, "step": 15000 }, { "ce_loss": 0.03341980651021004, "epoch": 5.003335557038025, "step": 15000 }, { "distill_loss": 0.10253458470106125, "epoch": 5.003335557038025, "step": 15000 }, { "epoch": 5.003335557038025, "ref_ce_loss": 0.0455530546605587, "step": 15000 }, { "epoch": 5.006671114076051, "loss": 0.3276, "step": 15010 }, { "epoch": 5.006671114076051, "grad_norm": 3.6639962196350098, "step": 15010 }, { "epoch": 5.006671114076051, "learning_rate": 9.73078645723935e-05, "step": 15010 }, { "epoch": 5.006671114076051, "loss": 0.3098304271697998, "step": 15010 }, { "ce_loss": 0.06160301715135574, "epoch": 5.006671114076051, "step": 15010 }, { "distill_loss": 0.11260523647069931, "epoch": 5.006671114076051, "step": 15010 }, { "epoch": 5.006671114076051, "ref_ce_loss": 0.13522294163703918, "step": 15010 }, { "epoch": 5.006671114076051, "loss": 0.20364679396152496, "step": 15010 }, { "ce_loss": 0.0429227389395237, "epoch": 5.006671114076051, "step": 15010 }, { "distill_loss": 0.07736965268850327, "epoch": 5.006671114076051, "step": 15010 }, { "epoch": 5.006671114076051, "ref_ce_loss": 0.08260839432477951, "step": 15010 }, { "epoch": 5.006671114076051, "loss": 0.22606542706489563, "step": 15010 }, { "ce_loss": 0.05941081792116165, "epoch": 5.006671114076051, "step": 15010 }, { "distill_loss": 0.11994045972824097, "epoch": 5.006671114076051, "step": 15010 }, { "epoch": 5.006671114076051, "ref_ce_loss": 0.03545362129807472, "step": 15010 }, { "epoch": 5.006671114076051, "loss": 0.28638121485710144, "step": 15010 }, { "ce_loss": 0.09630145877599716, "epoch": 5.006671114076051, "step": 15010 }, { "distill_loss": 0.10460197925567627, "epoch": 5.006671114076051, "step": 15010 }, { "epoch": 5.006671114076051, "ref_ce_loss": 0.06608293205499649, "step": 15010 }, { "epoch": 5.010006671114076, "loss": 0.3134, "step": 15020 }, { "epoch": 5.010006671114076, "grad_norm": 1.8041044473648071, "step": 15020 }, { "epoch": 5.010006671114076, "learning_rate": 9.711826046672886e-05, "step": 15020 }, { "epoch": 5.010006671114076, "loss": 0.5290608406066895, "step": 15020 }, { "ce_loss": 0.08037453144788742, "epoch": 5.010006671114076, "step": 15020 }, { "distill_loss": 0.11784839630126953, "epoch": 5.010006671114076, "step": 15020 }, { "epoch": 5.010006671114076, "ref_ce_loss": 0.07413246482610703, "step": 15020 }, { "epoch": 5.010006671114076, "loss": 0.31020018458366394, "step": 15020 }, { "ce_loss": 0.06627956032752991, "epoch": 5.010006671114076, "step": 15020 }, { "distill_loss": 0.09461265802383423, "epoch": 5.010006671114076, "step": 15020 }, { "epoch": 5.010006671114076, "ref_ce_loss": 0.07771609723567963, "step": 15020 }, { "epoch": 5.010006671114076, "loss": 0.2674647271633148, "step": 15020 }, { "ce_loss": 0.08439767360687256, "epoch": 5.010006671114076, "step": 15020 }, { "distill_loss": 0.09544394165277481, "epoch": 5.010006671114076, "step": 15020 }, { "epoch": 5.010006671114076, "ref_ce_loss": 0.06403058767318726, "step": 15020 }, { "epoch": 5.010006671114076, "loss": 0.23964346945285797, "step": 15020 }, { "ce_loss": 0.051768235862255096, "epoch": 5.010006671114076, "step": 15020 }, { "distill_loss": 0.09334868937730789, "epoch": 5.010006671114076, "step": 15020 }, { "epoch": 5.010006671114076, "ref_ce_loss": 0.07185187935829163, "step": 15020 }, { "epoch": 5.013342228152101, "loss": 0.3388, "step": 15030 }, { "epoch": 5.013342228152101, "grad_norm": 5.189137935638428, "step": 15030 }, { "epoch": 5.013342228152101, "learning_rate": 9.692875279649694e-05, "step": 15030 }, { "epoch": 5.013342228152101, "loss": 0.34460267424583435, "step": 15030 }, { "ce_loss": 0.1255204826593399, "epoch": 5.013342228152101, "step": 15030 }, { "distill_loss": 0.11821456253528595, "epoch": 5.013342228152101, "step": 15030 }, { "epoch": 5.013342228152101, "ref_ce_loss": 0.10058291256427765, "step": 15030 }, { "epoch": 5.013342228152101, "loss": 0.3356470465660095, "step": 15030 }, { "ce_loss": 0.032778576016426086, "epoch": 5.013342228152101, "step": 15030 }, { "distill_loss": 0.10354934632778168, "epoch": 5.013342228152101, "step": 15030 }, { "epoch": 5.013342228152101, "ref_ce_loss": 0.08033135533332825, "step": 15030 }, { "epoch": 5.013342228152101, "loss": 0.41687262058258057, "step": 15030 }, { "ce_loss": 0.07428724318742752, "epoch": 5.013342228152101, "step": 15030 }, { "distill_loss": 0.11658826470375061, "epoch": 5.013342228152101, "step": 15030 }, { "epoch": 5.013342228152101, "ref_ce_loss": 0.0638238936662674, "step": 15030 }, { "epoch": 5.013342228152101, "loss": 0.3669113218784332, "step": 15030 }, { "ce_loss": 0.11058314144611359, "epoch": 5.013342228152101, "step": 15030 }, { "distill_loss": 0.1107931137084961, "epoch": 5.013342228152101, "step": 15030 }, { "epoch": 5.013342228152101, "ref_ce_loss": 0.09497855603694916, "step": 15030 }, { "epoch": 5.016677785190127, "loss": 0.3591, "step": 15040 }, { "epoch": 5.016677785190127, "grad_norm": 2.142228603363037, "step": 15040 }, { "epoch": 5.016677785190127, "learning_rate": 9.6739341907285e-05, "step": 15040 }, { "epoch": 5.016677785190127, "loss": 0.3057650625705719, "step": 15040 }, { "ce_loss": 0.08420991897583008, "epoch": 5.016677785190127, "step": 15040 }, { "distill_loss": 0.11546208709478378, "epoch": 5.016677785190127, "step": 15040 }, { "epoch": 5.016677785190127, "ref_ce_loss": 0.0664733350276947, "step": 15040 }, { "epoch": 5.016677785190127, "loss": 0.37464606761932373, "step": 15040 }, { "ce_loss": 0.0816805511713028, "epoch": 5.016677785190127, "step": 15040 }, { "distill_loss": 0.13084836304187775, "epoch": 5.016677785190127, "step": 15040 }, { "epoch": 5.016677785190127, "ref_ce_loss": 0.07238293439149857, "step": 15040 }, { "epoch": 5.016677785190127, "loss": 0.22750362753868103, "step": 15040 }, { "ce_loss": 0.027732811868190765, "epoch": 5.016677785190127, "step": 15040 }, { "distill_loss": 0.06788545101881027, "epoch": 5.016677785190127, "step": 15040 }, { "epoch": 5.016677785190127, "ref_ce_loss": 0.05549793690443039, "step": 15040 }, { "epoch": 5.016677785190127, "loss": 0.18696361780166626, "step": 15040 }, { "ce_loss": 0.04056626930832863, "epoch": 5.016677785190127, "step": 15040 }, { "distill_loss": 0.09064240753650665, "epoch": 5.016677785190127, "step": 15040 }, { "epoch": 5.016677785190127, "ref_ce_loss": 0.05517708882689476, "step": 15040 }, { "epoch": 5.020013342228152, "loss": 0.3265, "step": 15050 }, { "epoch": 5.020013342228152, "grad_norm": 2.525740385055542, "step": 15050 }, { "epoch": 5.020013342228152, "learning_rate": 9.655002814450387e-05, "step": 15050 }, { "epoch": 5.020013342228152, "loss": 0.29186737537384033, "step": 15050 }, { "ce_loss": 0.054460786283016205, "epoch": 5.020013342228152, "step": 15050 }, { "distill_loss": 0.08650080114603043, "epoch": 5.020013342228152, "step": 15050 }, { "epoch": 5.020013342228152, "ref_ce_loss": 0.052335672080516815, "step": 15050 }, { "epoch": 5.020013342228152, "loss": 0.22459924221038818, "step": 15050 }, { "ce_loss": 0.0686323419213295, "epoch": 5.020013342228152, "step": 15050 }, { "distill_loss": 0.0952027291059494, "epoch": 5.020013342228152, "step": 15050 }, { "epoch": 5.020013342228152, "ref_ce_loss": 0.060247596353292465, "step": 15050 }, { "epoch": 5.020013342228152, "loss": 0.4892418384552002, "step": 15050 }, { "ce_loss": 0.06871223449707031, "epoch": 5.020013342228152, "step": 15050 }, { "distill_loss": 0.11085522174835205, "epoch": 5.020013342228152, "step": 15050 }, { "epoch": 5.020013342228152, "ref_ce_loss": 0.071814626455307, "step": 15050 }, { "epoch": 5.020013342228152, "loss": 0.3401147127151489, "step": 15050 }, { "ce_loss": 0.15101785957813263, "epoch": 5.020013342228152, "step": 15050 }, { "distill_loss": 0.12028548866510391, "epoch": 5.020013342228152, "step": 15050 }, { "epoch": 5.020013342228152, "ref_ce_loss": 0.04461260139942169, "step": 15050 }, { "epoch": 5.0233488992661774, "loss": 0.3523, "step": 15060 }, { "epoch": 5.0233488992661774, "grad_norm": 2.574275016784668, "step": 15060 }, { "epoch": 5.0233488992661774, "learning_rate": 9.636081185338707e-05, "step": 15060 }, { "epoch": 5.0233488992661774, "loss": 0.19955489039421082, "step": 15060 }, { "ce_loss": 0.021272744983434677, "epoch": 5.0233488992661774, "step": 15060 }, { "distill_loss": 0.0945524051785469, "epoch": 5.0233488992661774, "step": 15060 }, { "epoch": 5.0233488992661774, "ref_ce_loss": 0.05813675373792648, "step": 15060 }, { "epoch": 5.0233488992661774, "loss": 0.5390908122062683, "step": 15060 }, { "ce_loss": 0.18993806838989258, "epoch": 5.0233488992661774, "step": 15060 }, { "distill_loss": 0.12777474522590637, "epoch": 5.0233488992661774, "step": 15060 }, { "epoch": 5.0233488992661774, "ref_ce_loss": 0.10478173196315765, "step": 15060 }, { "epoch": 5.0233488992661774, "loss": 0.5204997062683105, "step": 15060 }, { "ce_loss": 0.06704582273960114, "epoch": 5.0233488992661774, "step": 15060 }, { "distill_loss": 0.15518198907375336, "epoch": 5.0233488992661774, "step": 15060 }, { "epoch": 5.0233488992661774, "ref_ce_loss": 0.07022137194871902, "step": 15060 }, { "epoch": 5.0233488992661774, "loss": 0.2526065707206726, "step": 15060 }, { "ce_loss": 0.037358131259679794, "epoch": 5.0233488992661774, "step": 15060 }, { "distill_loss": 0.09248964488506317, "epoch": 5.0233488992661774, "step": 15060 }, { "epoch": 5.0233488992661774, "ref_ce_loss": 0.08157315850257874, "step": 15060 }, { "epoch": 5.026684456304203, "loss": 0.3519, "step": 15070 }, { "epoch": 5.026684456304203, "grad_norm": 3.2034337520599365, "step": 15070 }, { "epoch": 5.026684456304203, "learning_rate": 9.617169337899059e-05, "step": 15070 }, { "epoch": 5.026684456304203, "loss": 0.40385645627975464, "step": 15070 }, { "ce_loss": 0.12731719017028809, "epoch": 5.026684456304203, "step": 15070 }, { "distill_loss": 0.1379529982805252, "epoch": 5.026684456304203, "step": 15070 }, { "epoch": 5.026684456304203, "ref_ce_loss": 0.07186760008335114, "step": 15070 }, { "epoch": 5.026684456304203, "loss": 0.3676081895828247, "step": 15070 }, { "ce_loss": 0.06205548718571663, "epoch": 5.026684456304203, "step": 15070 }, { "distill_loss": 0.13433827459812164, "epoch": 5.026684456304203, "step": 15070 }, { "epoch": 5.026684456304203, "ref_ce_loss": 0.08207282423973083, "step": 15070 }, { "epoch": 5.026684456304203, "loss": 0.4785653352737427, "step": 15070 }, { "ce_loss": 0.172767773270607, "epoch": 5.026684456304203, "step": 15070 }, { "distill_loss": 0.18138282001018524, "epoch": 5.026684456304203, "step": 15070 }, { "epoch": 5.026684456304203, "ref_ce_loss": 0.06890442222356796, "step": 15070 }, { "epoch": 5.026684456304203, "loss": 0.20031343400478363, "step": 15070 }, { "ce_loss": 0.03773951530456543, "epoch": 5.026684456304203, "step": 15070 }, { "distill_loss": 0.09012091904878616, "epoch": 5.026684456304203, "step": 15070 }, { "epoch": 5.026684456304203, "ref_ce_loss": 0.07225586473941803, "step": 15070 }, { "epoch": 5.030020013342228, "loss": 0.4076, "step": 15080 }, { "epoch": 5.030020013342228, "grad_norm": 2.200860023498535, "step": 15080 }, { "epoch": 5.030020013342228, "learning_rate": 9.598267306619173e-05, "step": 15080 }, { "epoch": 5.030020013342228, "loss": 0.3781607449054718, "step": 15080 }, { "ce_loss": 0.11247001588344574, "epoch": 5.030020013342228, "step": 15080 }, { "distill_loss": 0.15823885798454285, "epoch": 5.030020013342228, "step": 15080 }, { "epoch": 5.030020013342228, "ref_ce_loss": 0.08177276700735092, "step": 15080 }, { "epoch": 5.030020013342228, "loss": 0.4920407831668854, "step": 15080 }, { "ce_loss": 0.14705343544483185, "epoch": 5.030020013342228, "step": 15080 }, { "distill_loss": 0.19593298435211182, "epoch": 5.030020013342228, "step": 15080 }, { "epoch": 5.030020013342228, "ref_ce_loss": 0.11216946691274643, "step": 15080 }, { "epoch": 5.030020013342228, "loss": 0.218006432056427, "step": 15080 }, { "ce_loss": 0.03527948632836342, "epoch": 5.030020013342228, "step": 15080 }, { "distill_loss": 0.0989982932806015, "epoch": 5.030020013342228, "step": 15080 }, { "epoch": 5.030020013342228, "ref_ce_loss": 0.08341675996780396, "step": 15080 }, { "epoch": 5.030020013342228, "loss": 0.24107953906059265, "step": 15080 }, { "ce_loss": 0.043232500553131104, "epoch": 5.030020013342228, "step": 15080 }, { "distill_loss": 0.12532421946525574, "epoch": 5.030020013342228, "step": 15080 }, { "epoch": 5.030020013342228, "ref_ce_loss": 0.0574118047952652, "step": 15080 }, { "epoch": 5.0333555703802535, "loss": 0.3412, "step": 15090 }, { "epoch": 5.0333555703802535, "grad_norm": 2.083111047744751, "step": 15090 }, { "epoch": 5.0333555703802535, "learning_rate": 9.579375125968917e-05, "step": 15090 }, { "epoch": 5.0333555703802535, "loss": 0.44608885049819946, "step": 15090 }, { "ce_loss": 0.1435285210609436, "epoch": 5.0333555703802535, "step": 15090 }, { "distill_loss": 0.13029739260673523, "epoch": 5.0333555703802535, "step": 15090 }, { "epoch": 5.0333555703802535, "ref_ce_loss": 0.12861613929271698, "step": 15090 }, { "epoch": 5.0333555703802535, "loss": 0.28388845920562744, "step": 15090 }, { "ce_loss": 0.04726126790046692, "epoch": 5.0333555703802535, "step": 15090 }, { "distill_loss": 0.10819245129823685, "epoch": 5.0333555703802535, "step": 15090 }, { "epoch": 5.0333555703802535, "ref_ce_loss": 0.06490429490804672, "step": 15090 }, { "epoch": 5.0333555703802535, "loss": 0.34822142124176025, "step": 15090 }, { "ce_loss": 0.08054547011852264, "epoch": 5.0333555703802535, "step": 15090 }, { "distill_loss": 0.14222362637519836, "epoch": 5.0333555703802535, "step": 15090 }, { "epoch": 5.0333555703802535, "ref_ce_loss": 0.08700346946716309, "step": 15090 }, { "epoch": 5.0333555703802535, "loss": 0.1939743012189865, "step": 15090 }, { "ce_loss": 0.035219430923461914, "epoch": 5.0333555703802535, "step": 15090 }, { "distill_loss": 0.08615106344223022, "epoch": 5.0333555703802535, "step": 15090 }, { "epoch": 5.0333555703802535, "ref_ce_loss": 0.07249585539102554, "step": 15090 }, { "epoch": 5.036691127418279, "loss": 0.329, "step": 15100 }, { "epoch": 5.036691127418279, "grad_norm": 2.6159894466400146, "step": 15100 }, { "epoch": 5.036691127418279, "learning_rate": 9.560492830400172e-05, "step": 15100 }, { "epoch": 5.036691127418279, "loss": 0.19131746888160706, "step": 15100 }, { "ce_loss": 0.039891522377729416, "epoch": 5.036691127418279, "step": 15100 }, { "distill_loss": 0.0966179370880127, "epoch": 5.036691127418279, "step": 15100 }, { "epoch": 5.036691127418279, "ref_ce_loss": 0.05468432232737541, "step": 15100 }, { "epoch": 5.036691127418279, "loss": 0.38014402985572815, "step": 15100 }, { "ce_loss": 0.06777114421129227, "epoch": 5.036691127418279, "step": 15100 }, { "distill_loss": 0.1002044826745987, "epoch": 5.036691127418279, "step": 15100 }, { "epoch": 5.036691127418279, "ref_ce_loss": 0.056828632950782776, "step": 15100 }, { "epoch": 5.036691127418279, "loss": 0.3731182813644409, "step": 15100 }, { "ce_loss": 0.11887678503990173, "epoch": 5.036691127418279, "step": 15100 }, { "distill_loss": 0.11022542417049408, "epoch": 5.036691127418279, "step": 15100 }, { "epoch": 5.036691127418279, "ref_ce_loss": 0.11073588579893112, "step": 15100 }, { "epoch": 5.036691127418279, "loss": 0.317482590675354, "step": 15100 }, { "ce_loss": 0.05223778262734413, "epoch": 5.036691127418279, "step": 15100 }, { "distill_loss": 0.10656572878360748, "epoch": 5.036691127418279, "step": 15100 }, { "epoch": 5.036691127418279, "ref_ce_loss": 0.06900766491889954, "step": 15100 }, { "epoch": 5.040026684456304, "loss": 0.3281, "step": 15110 }, { "epoch": 5.040026684456304, "grad_norm": 2.9384427070617676, "step": 15110 }, { "epoch": 5.040026684456304, "learning_rate": 9.541620454346787e-05, "step": 15110 }, { "epoch": 5.040026684456304, "loss": 0.31656724214553833, "step": 15110 }, { "ce_loss": 0.09671363234519958, "epoch": 5.040026684456304, "step": 15110 }, { "distill_loss": 0.11364702135324478, "epoch": 5.040026684456304, "step": 15110 }, { "epoch": 5.040026684456304, "ref_ce_loss": 0.106138676404953, "step": 15110 }, { "epoch": 5.040026684456304, "loss": 0.31608346104621887, "step": 15110 }, { "ce_loss": 0.11589302867650986, "epoch": 5.040026684456304, "step": 15110 }, { "distill_loss": 0.10061141848564148, "epoch": 5.040026684456304, "step": 15110 }, { "epoch": 5.040026684456304, "ref_ce_loss": 0.09926266223192215, "step": 15110 }, { "epoch": 5.040026684456304, "loss": 0.5507553815841675, "step": 15110 }, { "ce_loss": 0.19675405323505402, "epoch": 5.040026684456304, "step": 15110 }, { "distill_loss": 0.09251846373081207, "epoch": 5.040026684456304, "step": 15110 }, { "epoch": 5.040026684456304, "ref_ce_loss": 0.12421887367963791, "step": 15110 }, { "epoch": 5.040026684456304, "loss": 0.28572094440460205, "step": 15110 }, { "ce_loss": 0.046184901148080826, "epoch": 5.040026684456304, "step": 15110 }, { "distill_loss": 0.1092517301440239, "epoch": 5.040026684456304, "step": 15110 }, { "epoch": 5.040026684456304, "ref_ce_loss": 0.084798164665699, "step": 15110 }, { "epoch": 5.0433622414943295, "loss": 0.3422, "step": 15120 }, { "epoch": 5.0433622414943295, "grad_norm": 4.081774711608887, "step": 15120 }, { "epoch": 5.0433622414943295, "learning_rate": 9.522758032224545e-05, "step": 15120 }, { "epoch": 5.0433622414943295, "loss": 0.493918776512146, "step": 15120 }, { "ce_loss": 0.1023336723446846, "epoch": 5.0433622414943295, "step": 15120 }, { "distill_loss": 0.12242228537797928, "epoch": 5.0433622414943295, "step": 15120 }, { "epoch": 5.0433622414943295, "ref_ce_loss": 0.053322214633226395, "step": 15120 }, { "epoch": 5.0433622414943295, "loss": 0.29232460260391235, "step": 15120 }, { "ce_loss": 0.05161577835679054, "epoch": 5.0433622414943295, "step": 15120 }, { "distill_loss": 0.10842837393283844, "epoch": 5.0433622414943295, "step": 15120 }, { "epoch": 5.0433622414943295, "ref_ce_loss": 0.05500758811831474, "step": 15120 }, { "epoch": 5.0433622414943295, "loss": 0.26376548409461975, "step": 15120 }, { "ce_loss": 0.07995926588773727, "epoch": 5.0433622414943295, "step": 15120 }, { "distill_loss": 0.09874370694160461, "epoch": 5.0433622414943295, "step": 15120 }, { "epoch": 5.0433622414943295, "ref_ce_loss": 0.08495025336742401, "step": 15120 }, { "epoch": 5.0433622414943295, "loss": 0.2894538938999176, "step": 15120 }, { "ce_loss": 0.04029352590441704, "epoch": 5.0433622414943295, "step": 15120 }, { "distill_loss": 0.13173624873161316, "epoch": 5.0433622414943295, "step": 15120 }, { "epoch": 5.0433622414943295, "ref_ce_loss": 0.0760183110833168, "step": 15120 }, { "epoch": 5.046697798532355, "loss": 0.3141, "step": 15130 }, { "epoch": 5.046697798532355, "grad_norm": 2.2110354900360107, "step": 15130 }, { "epoch": 5.046697798532355, "learning_rate": 9.503905598431053e-05, "step": 15130 }, { "epoch": 5.046697798532355, "loss": 0.30037644505500793, "step": 15130 }, { "ce_loss": 0.10891405493021011, "epoch": 5.046697798532355, "step": 15130 }, { "distill_loss": 0.11043551564216614, "epoch": 5.046697798532355, "step": 15130 }, { "epoch": 5.046697798532355, "ref_ce_loss": 0.06152831390500069, "step": 15130 }, { "epoch": 5.046697798532355, "loss": 0.2595139443874359, "step": 15130 }, { "ce_loss": 0.08509143441915512, "epoch": 5.046697798532355, "step": 15130 }, { "distill_loss": 0.1089167594909668, "epoch": 5.046697798532355, "step": 15130 }, { "epoch": 5.046697798532355, "ref_ce_loss": 0.04903317987918854, "step": 15130 }, { "epoch": 5.046697798532355, "loss": 0.24324160814285278, "step": 15130 }, { "ce_loss": 0.03422941640019417, "epoch": 5.046697798532355, "step": 15130 }, { "distill_loss": 0.12488461285829544, "epoch": 5.046697798532355, "step": 15130 }, { "epoch": 5.046697798532355, "ref_ce_loss": 0.0839512050151825, "step": 15130 }, { "epoch": 5.046697798532355, "loss": 0.9537922739982605, "step": 15130 }, { "ce_loss": 0.10920923948287964, "epoch": 5.046697798532355, "step": 15130 }, { "distill_loss": 0.11429134011268616, "epoch": 5.046697798532355, "step": 15130 }, { "epoch": 5.046697798532355, "ref_ce_loss": 0.07421170175075531, "step": 15130 }, { "epoch": 5.05003335557038, "loss": 0.3921, "step": 15140 }, { "epoch": 5.05003335557038, "grad_norm": 2.547940969467163, "step": 15140 }, { "epoch": 5.05003335557038, "learning_rate": 9.485063187345712e-05, "step": 15140 }, { "epoch": 5.05003335557038, "loss": 0.6976451873779297, "step": 15140 }, { "ce_loss": 0.09503049403429031, "epoch": 5.05003335557038, "step": 15140 }, { "distill_loss": 0.14060279726982117, "epoch": 5.05003335557038, "step": 15140 }, { "epoch": 5.05003335557038, "ref_ce_loss": 0.0637066587805748, "step": 15140 }, { "epoch": 5.05003335557038, "loss": 0.301154226064682, "step": 15140 }, { "ce_loss": 0.03091120533645153, "epoch": 5.05003335557038, "step": 15140 }, { "distill_loss": 0.11131655424833298, "epoch": 5.05003335557038, "step": 15140 }, { "epoch": 5.05003335557038, "ref_ce_loss": 0.055801503360271454, "step": 15140 }, { "epoch": 5.05003335557038, "loss": 0.3734534680843353, "step": 15140 }, { "ce_loss": 0.07543478161096573, "epoch": 5.05003335557038, "step": 15140 }, { "distill_loss": 0.13863107562065125, "epoch": 5.05003335557038, "step": 15140 }, { "epoch": 5.05003335557038, "ref_ce_loss": 0.10265032202005386, "step": 15140 }, { "epoch": 5.05003335557038, "loss": 0.20870301127433777, "step": 15140 }, { "ce_loss": 0.012648079544305801, "epoch": 5.05003335557038, "step": 15140 }, { "distill_loss": 0.091103196144104, "epoch": 5.05003335557038, "step": 15140 }, { "epoch": 5.05003335557038, "ref_ce_loss": 0.04741125553846359, "step": 15140 }, { "epoch": 5.053368912608406, "loss": 0.3344, "step": 15150 }, { "epoch": 5.053368912608406, "grad_norm": 3.0254764556884766, "step": 15150 }, { "epoch": 5.053368912608406, "learning_rate": 9.466230833329663e-05, "step": 15150 }, { "epoch": 5.053368912608406, "loss": 0.33127936720848083, "step": 15150 }, { "ce_loss": 0.045439403504133224, "epoch": 5.053368912608406, "step": 15150 }, { "distill_loss": 0.11033318191766739, "epoch": 5.053368912608406, "step": 15150 }, { "epoch": 5.053368912608406, "ref_ce_loss": 0.09881974011659622, "step": 15150 }, { "epoch": 5.053368912608406, "loss": 0.24848978221416473, "step": 15150 }, { "ce_loss": 0.08748488873243332, "epoch": 5.053368912608406, "step": 15150 }, { "distill_loss": 0.10021417587995529, "epoch": 5.053368912608406, "step": 15150 }, { "epoch": 5.053368912608406, "ref_ce_loss": 0.05940258502960205, "step": 15150 }, { "epoch": 5.053368912608406, "loss": 0.2611362934112549, "step": 15150 }, { "ce_loss": 0.06973495334386826, "epoch": 5.053368912608406, "step": 15150 }, { "distill_loss": 0.11396859586238861, "epoch": 5.053368912608406, "step": 15150 }, { "epoch": 5.053368912608406, "ref_ce_loss": 0.07708052545785904, "step": 15150 }, { "epoch": 5.053368912608406, "loss": 0.510417640209198, "step": 15150 }, { "ce_loss": 0.14811956882476807, "epoch": 5.053368912608406, "step": 15150 }, { "distill_loss": 0.13365697860717773, "epoch": 5.053368912608406, "step": 15150 }, { "epoch": 5.053368912608406, "ref_ce_loss": 0.0729028657078743, "step": 15150 }, { "epoch": 5.056704469646431, "loss": 0.3312, "step": 15160 }, { "epoch": 5.056704469646431, "grad_norm": 2.0235612392425537, "step": 15160 }, { "epoch": 5.056704469646431, "learning_rate": 9.447408570725673e-05, "step": 15160 }, { "epoch": 5.056704469646431, "loss": 0.2727489173412323, "step": 15160 }, { "ce_loss": 0.0502191036939621, "epoch": 5.056704469646431, "step": 15160 }, { "distill_loss": 0.11405930668115616, "epoch": 5.056704469646431, "step": 15160 }, { "epoch": 5.056704469646431, "ref_ce_loss": 0.07889159768819809, "step": 15160 }, { "epoch": 5.056704469646431, "loss": 0.25519660115242004, "step": 15160 }, { "ce_loss": 0.09428223967552185, "epoch": 5.056704469646431, "step": 15160 }, { "distill_loss": 0.10339818894863129, "epoch": 5.056704469646431, "step": 15160 }, { "epoch": 5.056704469646431, "ref_ce_loss": 0.05715091899037361, "step": 15160 }, { "epoch": 5.056704469646431, "loss": 0.2483268678188324, "step": 15160 }, { "ce_loss": 0.054164640605449677, "epoch": 5.056704469646431, "step": 15160 }, { "distill_loss": 0.08527297526597977, "epoch": 5.056704469646431, "step": 15160 }, { "epoch": 5.056704469646431, "ref_ce_loss": 0.0554942861199379, "step": 15160 }, { "epoch": 5.056704469646431, "loss": 0.35373765230178833, "step": 15160 }, { "ce_loss": 0.05942567065358162, "epoch": 5.056704469646431, "step": 15160 }, { "distill_loss": 0.1203283965587616, "epoch": 5.056704469646431, "step": 15160 }, { "epoch": 5.056704469646431, "ref_ce_loss": 0.08127188682556152, "step": 15160 }, { "epoch": 5.060040026684456, "loss": 0.3311, "step": 15170 }, { "epoch": 5.060040026684456, "grad_norm": 2.827293872833252, "step": 15170 }, { "epoch": 5.060040026684456, "learning_rate": 9.428596433858136e-05, "step": 15170 }, { "epoch": 5.060040026684456, "loss": 0.2379085123538971, "step": 15170 }, { "ce_loss": 0.041343193501234055, "epoch": 5.060040026684456, "step": 15170 }, { "distill_loss": 0.13744580745697021, "epoch": 5.060040026684456, "step": 15170 }, { "epoch": 5.060040026684456, "ref_ce_loss": 0.03911873325705528, "step": 15170 }, { "epoch": 5.060040026684456, "loss": 0.5597814917564392, "step": 15170 }, { "ce_loss": 0.04659683257341385, "epoch": 5.060040026684456, "step": 15170 }, { "distill_loss": 0.07799013704061508, "epoch": 5.060040026684456, "step": 15170 }, { "epoch": 5.060040026684456, "ref_ce_loss": 0.08074043691158295, "step": 15170 }, { "epoch": 5.060040026684456, "loss": 0.3475240170955658, "step": 15170 }, { "ce_loss": 0.0347454808652401, "epoch": 5.060040026684456, "step": 15170 }, { "distill_loss": 0.10918953269720078, "epoch": 5.060040026684456, "step": 15170 }, { "epoch": 5.060040026684456, "ref_ce_loss": 0.1333533227443695, "step": 15170 }, { "epoch": 5.060040026684456, "loss": 0.33609458804130554, "step": 15170 }, { "ce_loss": 0.1237572431564331, "epoch": 5.060040026684456, "step": 15170 }, { "distill_loss": 0.12210750579833984, "epoch": 5.060040026684456, "step": 15170 }, { "epoch": 5.060040026684456, "ref_ce_loss": 0.08911029249429703, "step": 15170 }, { "epoch": 5.063375583722482, "loss": 0.3352, "step": 15180 }, { "epoch": 5.063375583722482, "grad_norm": 1.6944329738616943, "step": 15180 }, { "epoch": 5.063375583722482, "learning_rate": 9.409794457032959e-05, "step": 15180 }, { "epoch": 5.063375583722482, "loss": 0.3432515859603882, "step": 15180 }, { "ce_loss": 0.0322825089097023, "epoch": 5.063375583722482, "step": 15180 }, { "distill_loss": 0.11505106091499329, "epoch": 5.063375583722482, "step": 15180 }, { "epoch": 5.063375583722482, "ref_ce_loss": 0.06054326146841049, "step": 15180 }, { "epoch": 5.063375583722482, "loss": 0.2868589460849762, "step": 15180 }, { "ce_loss": 0.05462491139769554, "epoch": 5.063375583722482, "step": 15180 }, { "distill_loss": 0.10324221104383469, "epoch": 5.063375583722482, "step": 15180 }, { "epoch": 5.063375583722482, "ref_ce_loss": 0.07764450460672379, "step": 15180 }, { "epoch": 5.063375583722482, "loss": 0.41091442108154297, "step": 15180 }, { "ce_loss": 0.14110834896564484, "epoch": 5.063375583722482, "step": 15180 }, { "distill_loss": 0.13619858026504517, "epoch": 5.063375583722482, "step": 15180 }, { "epoch": 5.063375583722482, "ref_ce_loss": 0.08693759888410568, "step": 15180 }, { "epoch": 5.063375583722482, "loss": 0.37911006808280945, "step": 15180 }, { "ce_loss": 0.09819250553846359, "epoch": 5.063375583722482, "step": 15180 }, { "distill_loss": 0.13968545198440552, "epoch": 5.063375583722482, "step": 15180 }, { "epoch": 5.063375583722482, "ref_ce_loss": 0.06106346845626831, "step": 15180 }, { "epoch": 5.066711140760507, "loss": 0.3193, "step": 15190 }, { "epoch": 5.066711140760507, "grad_norm": 2.3509528636932373, "step": 15190 }, { "epoch": 5.066711140760507, "learning_rate": 9.391002674537538e-05, "step": 15190 }, { "epoch": 5.066711140760507, "loss": 0.4000142812728882, "step": 15190 }, { "ce_loss": 0.07615770399570465, "epoch": 5.066711140760507, "step": 15190 }, { "distill_loss": 0.15297187864780426, "epoch": 5.066711140760507, "step": 15190 }, { "epoch": 5.066711140760507, "ref_ce_loss": 0.09272449463605881, "step": 15190 }, { "epoch": 5.066711140760507, "loss": 0.49558955430984497, "step": 15190 }, { "ce_loss": 0.12531791627407074, "epoch": 5.066711140760507, "step": 15190 }, { "distill_loss": 0.12746287882328033, "epoch": 5.066711140760507, "step": 15190 }, { "epoch": 5.066711140760507, "ref_ce_loss": 0.09971451759338379, "step": 15190 }, { "epoch": 5.066711140760507, "loss": 0.17504757642745972, "step": 15190 }, { "ce_loss": 0.02030886709690094, "epoch": 5.066711140760507, "step": 15190 }, { "distill_loss": 0.0881674736738205, "epoch": 5.066711140760507, "step": 15190 }, { "epoch": 5.066711140760507, "ref_ce_loss": 0.06651587039232254, "step": 15190 }, { "epoch": 5.066711140760507, "loss": 0.2239861637353897, "step": 15190 }, { "ce_loss": 0.023175470530986786, "epoch": 5.066711140760507, "step": 15190 }, { "distill_loss": 0.09253476560115814, "epoch": 5.066711140760507, "step": 15190 }, { "epoch": 5.066711140760507, "ref_ce_loss": 0.06468259543180466, "step": 15190 }, { "epoch": 5.070046697798532, "loss": 0.2986, "step": 15200 }, { "epoch": 5.070046697798532, "grad_norm": 3.5859014987945557, "step": 15200 }, { "epoch": 5.070046697798532, "learning_rate": 9.37222112064067e-05, "step": 15200 }, { "epoch": 5.070046697798532, "loss": 0.431865930557251, "step": 15200 }, { "ce_loss": 0.03801083192229271, "epoch": 5.070046697798532, "step": 15200 }, { "distill_loss": 0.09702988713979721, "epoch": 5.070046697798532, "step": 15200 }, { "epoch": 5.070046697798532, "ref_ce_loss": 0.08860328048467636, "step": 15200 }, { "epoch": 5.070046697798532, "loss": 0.3617652654647827, "step": 15200 }, { "ce_loss": 0.09015195816755295, "epoch": 5.070046697798532, "step": 15200 }, { "distill_loss": 0.10294674336910248, "epoch": 5.070046697798532, "step": 15200 }, { "epoch": 5.070046697798532, "ref_ce_loss": 0.06347037851810455, "step": 15200 }, { "epoch": 5.070046697798532, "loss": 0.19948525726795197, "step": 15200 }, { "ce_loss": 0.04417693242430687, "epoch": 5.070046697798532, "step": 15200 }, { "distill_loss": 0.09427918493747711, "epoch": 5.070046697798532, "step": 15200 }, { "epoch": 5.070046697798532, "ref_ce_loss": 0.05085763707756996, "step": 15200 }, { "epoch": 5.070046697798532, "loss": 0.24006901681423187, "step": 15200 }, { "ce_loss": 0.049805957823991776, "epoch": 5.070046697798532, "step": 15200 }, { "distill_loss": 0.06830768287181854, "epoch": 5.070046697798532, "step": 15200 }, { "epoch": 5.070046697798532, "ref_ce_loss": 0.0619424544274807, "step": 15200 }, { "epoch": 5.073382254836558, "loss": 0.3709, "step": 15210 }, { "epoch": 5.073382254836558, "grad_norm": 4.331812858581543, "step": 15210 }, { "epoch": 5.073382254836558, "learning_rate": 9.353449829592502e-05, "step": 15210 }, { "epoch": 5.073382254836558, "loss": 0.3576752841472626, "step": 15210 }, { "ce_loss": 0.09517745673656464, "epoch": 5.073382254836558, "step": 15210 }, { "distill_loss": 0.09828440099954605, "epoch": 5.073382254836558, "step": 15210 }, { "epoch": 5.073382254836558, "ref_ce_loss": 0.10009924322366714, "step": 15210 }, { "epoch": 5.073382254836558, "loss": 0.21468718349933624, "step": 15210 }, { "ce_loss": 0.044741351157426834, "epoch": 5.073382254836558, "step": 15210 }, { "distill_loss": 0.07588415592908859, "epoch": 5.073382254836558, "step": 15210 }, { "epoch": 5.073382254836558, "ref_ce_loss": 0.06994284689426422, "step": 15210 }, { "epoch": 5.073382254836558, "loss": 0.5331860184669495, "step": 15210 }, { "ce_loss": 0.06379786878824234, "epoch": 5.073382254836558, "step": 15210 }, { "distill_loss": 0.0999433696269989, "epoch": 5.073382254836558, "step": 15210 }, { "epoch": 5.073382254836558, "ref_ce_loss": 0.05732704699039459, "step": 15210 }, { "epoch": 5.073382254836558, "loss": 0.2616701126098633, "step": 15210 }, { "ce_loss": 0.03689726069569588, "epoch": 5.073382254836558, "step": 15210 }, { "distill_loss": 0.09216289967298508, "epoch": 5.073382254836558, "step": 15210 }, { "epoch": 5.073382254836558, "ref_ce_loss": 0.09889374673366547, "step": 15210 }, { "epoch": 5.076717811874583, "loss": 0.3476, "step": 15220 }, { "epoch": 5.076717811874583, "grad_norm": 2.3121414184570312, "step": 15220 }, { "epoch": 5.076717811874583, "learning_rate": 9.334688835624459e-05, "step": 15220 }, { "epoch": 5.076717811874583, "loss": 0.472523033618927, "step": 15220 }, { "ce_loss": 0.11912497878074646, "epoch": 5.076717811874583, "step": 15220 }, { "distill_loss": 0.1291174292564392, "epoch": 5.076717811874583, "step": 15220 }, { "epoch": 5.076717811874583, "ref_ce_loss": 0.11859400570392609, "step": 15220 }, { "epoch": 5.076717811874583, "loss": 0.30351346731185913, "step": 15220 }, { "ce_loss": 0.010588656179606915, "epoch": 5.076717811874583, "step": 15220 }, { "distill_loss": 0.07482418417930603, "epoch": 5.076717811874583, "step": 15220 }, { "epoch": 5.076717811874583, "ref_ce_loss": 0.07996892929077148, "step": 15220 }, { "epoch": 5.076717811874583, "loss": 0.458459734916687, "step": 15220 }, { "ce_loss": 0.06901762634515762, "epoch": 5.076717811874583, "step": 15220 }, { "distill_loss": 0.10283029824495316, "epoch": 5.076717811874583, "step": 15220 }, { "epoch": 5.076717811874583, "ref_ce_loss": 0.09888451546430588, "step": 15220 }, { "epoch": 5.076717811874583, "loss": 0.31203603744506836, "step": 15220 }, { "ce_loss": 0.09299197793006897, "epoch": 5.076717811874583, "step": 15220 }, { "distill_loss": 0.12244352698326111, "epoch": 5.076717811874583, "step": 15220 }, { "epoch": 5.076717811874583, "ref_ce_loss": 0.07369520515203476, "step": 15220 }, { "epoch": 5.080053368912608, "loss": 0.3461, "step": 15230 }, { "epoch": 5.080053368912608, "grad_norm": 3.2603442668914795, "step": 15230 }, { "epoch": 5.080053368912608, "learning_rate": 9.3159381729492e-05, "step": 15230 }, { "epoch": 5.080053368912608, "loss": 0.28526246547698975, "step": 15230 }, { "ce_loss": 0.09769675880670547, "epoch": 5.080053368912608, "step": 15230 }, { "distill_loss": 0.10557578504085541, "epoch": 5.080053368912608, "step": 15230 }, { "epoch": 5.080053368912608, "ref_ce_loss": 0.05751334875822067, "step": 15230 }, { "epoch": 5.080053368912608, "loss": 0.1877545714378357, "step": 15230 }, { "ce_loss": 0.014897209592163563, "epoch": 5.080053368912608, "step": 15230 }, { "distill_loss": 0.10393606126308441, "epoch": 5.080053368912608, "step": 15230 }, { "epoch": 5.080053368912608, "ref_ce_loss": 0.05093152076005936, "step": 15230 }, { "epoch": 5.080053368912608, "loss": 0.19574418663978577, "step": 15230 }, { "ce_loss": 0.01883758045732975, "epoch": 5.080053368912608, "step": 15230 }, { "distill_loss": 0.08023209124803543, "epoch": 5.080053368912608, "step": 15230 }, { "epoch": 5.080053368912608, "ref_ce_loss": 0.07024283707141876, "step": 15230 }, { "epoch": 5.080053368912608, "loss": 0.38844019174575806, "step": 15230 }, { "ce_loss": 0.12029389292001724, "epoch": 5.080053368912608, "step": 15230 }, { "distill_loss": 0.12513545155525208, "epoch": 5.080053368912608, "step": 15230 }, { "epoch": 5.080053368912608, "ref_ce_loss": 0.10578837245702744, "step": 15230 }, { "epoch": 5.083388925950634, "loss": 0.2986, "step": 15240 }, { "epoch": 5.083388925950634, "grad_norm": 3.108018159866333, "step": 15240 }, { "epoch": 5.083388925950634, "learning_rate": 9.297197875760533e-05, "step": 15240 }, { "epoch": 5.083388925950634, "loss": 0.6390986442565918, "step": 15240 }, { "ce_loss": 0.13565464317798615, "epoch": 5.083388925950634, "step": 15240 }, { "distill_loss": 0.12606781721115112, "epoch": 5.083388925950634, "step": 15240 }, { "epoch": 5.083388925950634, "ref_ce_loss": 0.08777014166116714, "step": 15240 }, { "epoch": 5.083388925950634, "loss": 0.27036014199256897, "step": 15240 }, { "ce_loss": 0.06920570880174637, "epoch": 5.083388925950634, "step": 15240 }, { "distill_loss": 0.09370496869087219, "epoch": 5.083388925950634, "step": 15240 }, { "epoch": 5.083388925950634, "ref_ce_loss": 0.06318096071481705, "step": 15240 }, { "epoch": 5.083388925950634, "loss": 0.3419868052005768, "step": 15240 }, { "ce_loss": 0.042620949447155, "epoch": 5.083388925950634, "step": 15240 }, { "distill_loss": 0.08936305344104767, "epoch": 5.083388925950634, "step": 15240 }, { "epoch": 5.083388925950634, "ref_ce_loss": 0.07430487871170044, "step": 15240 }, { "epoch": 5.083388925950634, "loss": 0.3731623888015747, "step": 15240 }, { "ce_loss": 0.07372905313968658, "epoch": 5.083388925950634, "step": 15240 }, { "distill_loss": 0.09781654924154282, "epoch": 5.083388925950634, "step": 15240 }, { "epoch": 5.083388925950634, "ref_ce_loss": 0.0866960659623146, "step": 15240 }, { "epoch": 5.086724482988659, "loss": 0.3528, "step": 15250 }, { "epoch": 5.086724482988659, "grad_norm": 2.033977508544922, "step": 15250 }, { "epoch": 5.086724482988659, "learning_rate": 9.278467978233372e-05, "step": 15250 }, { "epoch": 5.086724482988659, "loss": 0.3369258642196655, "step": 15250 }, { "ce_loss": 0.1479497104883194, "epoch": 5.086724482988659, "step": 15250 }, { "distill_loss": 0.1182151734828949, "epoch": 5.086724482988659, "step": 15250 }, { "epoch": 5.086724482988659, "ref_ce_loss": 0.06997857987880707, "step": 15250 }, { "epoch": 5.086724482988659, "loss": 0.26905104517936707, "step": 15250 }, { "ce_loss": 0.04487191140651703, "epoch": 5.086724482988659, "step": 15250 }, { "distill_loss": 0.08934783935546875, "epoch": 5.086724482988659, "step": 15250 }, { "epoch": 5.086724482988659, "ref_ce_loss": 0.06253762543201447, "step": 15250 }, { "epoch": 5.086724482988659, "loss": 0.39201846718788147, "step": 15250 }, { "ce_loss": 0.12044522911310196, "epoch": 5.086724482988659, "step": 15250 }, { "distill_loss": 0.11188692599534988, "epoch": 5.086724482988659, "step": 15250 }, { "epoch": 5.086724482988659, "ref_ce_loss": 0.0723666250705719, "step": 15250 }, { "epoch": 5.086724482988659, "loss": 0.17451854050159454, "step": 15250 }, { "ce_loss": 0.04381131753325462, "epoch": 5.086724482988659, "step": 15250 }, { "distill_loss": 0.08575734496116638, "epoch": 5.086724482988659, "step": 15250 }, { "epoch": 5.086724482988659, "ref_ce_loss": 0.0445830300450325, "step": 15250 }, { "epoch": 5.090060040026684, "loss": 0.2921, "step": 15260 }, { "epoch": 5.090060040026684, "grad_norm": 2.043253183364868, "step": 15260 }, { "epoch": 5.090060040026684, "learning_rate": 9.259748514523653e-05, "step": 15260 }, { "epoch": 5.090060040026684, "loss": 0.29348182678222656, "step": 15260 }, { "ce_loss": 0.07424913346767426, "epoch": 5.090060040026684, "step": 15260 }, { "distill_loss": 0.09006142616271973, "epoch": 5.090060040026684, "step": 15260 }, { "epoch": 5.090060040026684, "ref_ce_loss": 0.06587568670511246, "step": 15260 }, { "epoch": 5.090060040026684, "loss": 0.3772607147693634, "step": 15260 }, { "ce_loss": 0.08147673308849335, "epoch": 5.090060040026684, "step": 15260 }, { "distill_loss": 0.09111148118972778, "epoch": 5.090060040026684, "step": 15260 }, { "epoch": 5.090060040026684, "ref_ce_loss": 0.12253421545028687, "step": 15260 }, { "epoch": 5.090060040026684, "loss": 0.4282752275466919, "step": 15260 }, { "ce_loss": 0.10974805057048798, "epoch": 5.090060040026684, "step": 15260 }, { "distill_loss": 0.10467756539583206, "epoch": 5.090060040026684, "step": 15260 }, { "epoch": 5.090060040026684, "ref_ce_loss": 0.11229056119918823, "step": 15260 }, { "epoch": 5.090060040026684, "loss": 0.26593005657196045, "step": 15260 }, { "ce_loss": 0.05944935232400894, "epoch": 5.090060040026684, "step": 15260 }, { "distill_loss": 0.1361900418996811, "epoch": 5.090060040026684, "step": 15260 }, { "epoch": 5.090060040026684, "ref_ce_loss": 0.07016180455684662, "step": 15260 }, { "epoch": 5.09339559706471, "loss": 0.3176, "step": 15270 }, { "epoch": 5.09339559706471, "grad_norm": 4.857101917266846, "step": 15270 }, { "epoch": 5.09339559706471, "learning_rate": 9.241039518768301e-05, "step": 15270 }, { "epoch": 5.09339559706471, "loss": 0.19465944170951843, "step": 15270 }, { "ce_loss": 0.01948079839348793, "epoch": 5.09339559706471, "step": 15270 }, { "distill_loss": 0.08734491467475891, "epoch": 5.09339559706471, "step": 15270 }, { "epoch": 5.09339559706471, "ref_ce_loss": 0.04057784005999565, "step": 15270 }, { "epoch": 5.09339559706471, "loss": 0.4380457401275635, "step": 15270 }, { "ce_loss": 0.09683702141046524, "epoch": 5.09339559706471, "step": 15270 }, { "distill_loss": 0.13187667727470398, "epoch": 5.09339559706471, "step": 15270 }, { "epoch": 5.09339559706471, "ref_ce_loss": 0.08302433043718338, "step": 15270 }, { "epoch": 5.09339559706471, "loss": 0.42582035064697266, "step": 15270 }, { "ce_loss": 0.084503673017025, "epoch": 5.09339559706471, "step": 15270 }, { "distill_loss": 0.10415349155664444, "epoch": 5.09339559706471, "step": 15270 }, { "epoch": 5.09339559706471, "ref_ce_loss": 0.03986379876732826, "step": 15270 }, { "epoch": 5.09339559706471, "loss": 0.2158432900905609, "step": 15270 }, { "ce_loss": 0.058277517557144165, "epoch": 5.09339559706471, "step": 15270 }, { "distill_loss": 0.09706753492355347, "epoch": 5.09339559706471, "step": 15270 }, { "epoch": 5.09339559706471, "ref_ce_loss": 0.06036210060119629, "step": 15270 }, { "epoch": 5.096731154102735, "loss": 0.3193, "step": 15280 }, { "epoch": 5.096731154102735, "grad_norm": 1.6487956047058105, "step": 15280 }, { "epoch": 5.096731154102735, "learning_rate": 9.222341025085144e-05, "step": 15280 }, { "epoch": 5.096731154102735, "loss": 0.4731646776199341, "step": 15280 }, { "ce_loss": 0.16261830925941467, "epoch": 5.096731154102735, "step": 15280 }, { "distill_loss": 0.12539474666118622, "epoch": 5.096731154102735, "step": 15280 }, { "epoch": 5.096731154102735, "ref_ce_loss": 0.09363444149494171, "step": 15280 }, { "epoch": 5.096731154102735, "loss": 0.4620886445045471, "step": 15280 }, { "ce_loss": 0.06607316434383392, "epoch": 5.096731154102735, "step": 15280 }, { "distill_loss": 0.12559787929058075, "epoch": 5.096731154102735, "step": 15280 }, { "epoch": 5.096731154102735, "ref_ce_loss": 0.08794140070676804, "step": 15280 }, { "epoch": 5.096731154102735, "loss": 0.21943168342113495, "step": 15280 }, { "ce_loss": 0.03757490962743759, "epoch": 5.096731154102735, "step": 15280 }, { "distill_loss": 0.09392014890909195, "epoch": 5.096731154102735, "step": 15280 }, { "epoch": 5.096731154102735, "ref_ce_loss": 0.08771252632141113, "step": 15280 }, { "epoch": 5.096731154102735, "loss": 0.2953905761241913, "step": 15280 }, { "ce_loss": 0.05430855229496956, "epoch": 5.096731154102735, "step": 15280 }, { "distill_loss": 0.10314220190048218, "epoch": 5.096731154102735, "step": 15280 }, { "epoch": 5.096731154102735, "ref_ce_loss": 0.04704827815294266, "step": 15280 }, { "epoch": 5.1000667111407605, "loss": 0.3289, "step": 15290 }, { "epoch": 5.1000667111407605, "grad_norm": 2.2417662143707275, "step": 15290 }, { "epoch": 5.1000667111407605, "learning_rate": 9.203653067572855e-05, "step": 15290 }, { "epoch": 5.1000667111407605, "loss": 0.25665655732154846, "step": 15290 }, { "ce_loss": 0.03016096167266369, "epoch": 5.1000667111407605, "step": 15290 }, { "distill_loss": 0.09567056596279144, "epoch": 5.1000667111407605, "step": 15290 }, { "epoch": 5.1000667111407605, "ref_ce_loss": 0.04990023002028465, "step": 15290 }, { "epoch": 5.1000667111407605, "loss": 0.30939507484436035, "step": 15290 }, { "ce_loss": 0.09582208096981049, "epoch": 5.1000667111407605, "step": 15290 }, { "distill_loss": 0.11035038530826569, "epoch": 5.1000667111407605, "step": 15290 }, { "epoch": 5.1000667111407605, "ref_ce_loss": 0.06970993429422379, "step": 15290 }, { "epoch": 5.1000667111407605, "loss": 0.1940576434135437, "step": 15290 }, { "ce_loss": 0.04005913808941841, "epoch": 5.1000667111407605, "step": 15290 }, { "distill_loss": 0.09311024844646454, "epoch": 5.1000667111407605, "step": 15290 }, { "epoch": 5.1000667111407605, "ref_ce_loss": 0.06080329045653343, "step": 15290 }, { "epoch": 5.1000667111407605, "loss": 0.4482858180999756, "step": 15290 }, { "ce_loss": 0.12186356633901596, "epoch": 5.1000667111407605, "step": 15290 }, { "distill_loss": 0.14439421892166138, "epoch": 5.1000667111407605, "step": 15290 }, { "epoch": 5.1000667111407605, "ref_ce_loss": 0.13036789000034332, "step": 15290 }, { "epoch": 5.103402268178786, "loss": 0.3659, "step": 15300 }, { "epoch": 5.103402268178786, "grad_norm": 3.9782018661499023, "step": 15300 }, { "epoch": 5.103402268178786, "learning_rate": 9.184975680310901e-05, "step": 15300 }, { "epoch": 5.103402268178786, "loss": 0.2465575784444809, "step": 15300 }, { "ce_loss": 0.06427565962076187, "epoch": 5.103402268178786, "step": 15300 }, { "distill_loss": 0.08616838604211807, "epoch": 5.103402268178786, "step": 15300 }, { "epoch": 5.103402268178786, "ref_ce_loss": 0.06796655058860779, "step": 15300 }, { "epoch": 5.103402268178786, "loss": 0.25205734372138977, "step": 15300 }, { "ce_loss": 0.05255574360489845, "epoch": 5.103402268178786, "step": 15300 }, { "distill_loss": 0.0944940596818924, "epoch": 5.103402268178786, "step": 15300 }, { "epoch": 5.103402268178786, "ref_ce_loss": 0.0681745782494545, "step": 15300 }, { "epoch": 5.103402268178786, "loss": 0.36056971549987793, "step": 15300 }, { "ce_loss": 0.1121678277850151, "epoch": 5.103402268178786, "step": 15300 }, { "distill_loss": 0.13534539937973022, "epoch": 5.103402268178786, "step": 15300 }, { "epoch": 5.103402268178786, "ref_ce_loss": 0.07255236059427261, "step": 15300 }, { "epoch": 5.103402268178786, "loss": 0.7847564220428467, "step": 15300 }, { "ce_loss": 0.06709320098161697, "epoch": 5.103402268178786, "step": 15300 }, { "distill_loss": 0.13826727867126465, "epoch": 5.103402268178786, "step": 15300 }, { "epoch": 5.103402268178786, "ref_ce_loss": 0.049133818596601486, "step": 15300 }, { "epoch": 5.106737825216811, "loss": 0.3244, "step": 15310 }, { "epoch": 5.106737825216811, "grad_norm": 4.447413921356201, "step": 15310 }, { "epoch": 5.106737825216811, "learning_rate": 9.166308897359464e-05, "step": 15310 }, { "epoch": 5.106737825216811, "loss": 0.35078248381614685, "step": 15310 }, { "ce_loss": 0.12107393890619278, "epoch": 5.106737825216811, "step": 15310 }, { "distill_loss": 0.13566362857818604, "epoch": 5.106737825216811, "step": 15310 }, { "epoch": 5.106737825216811, "ref_ce_loss": 0.05730048939585686, "step": 15310 }, { "epoch": 5.106737825216811, "loss": 0.3367649018764496, "step": 15310 }, { "ce_loss": 0.06390248984098434, "epoch": 5.106737825216811, "step": 15310 }, { "distill_loss": 0.1456541270017624, "epoch": 5.106737825216811, "step": 15310 }, { "epoch": 5.106737825216811, "ref_ce_loss": 0.10176167637109756, "step": 15310 }, { "epoch": 5.106737825216811, "loss": 0.23953138291835785, "step": 15310 }, { "ce_loss": 0.07575532048940659, "epoch": 5.106737825216811, "step": 15310 }, { "distill_loss": 0.07758189737796783, "epoch": 5.106737825216811, "step": 15310 }, { "epoch": 5.106737825216811, "ref_ce_loss": 0.04956245794892311, "step": 15310 }, { "epoch": 5.106737825216811, "loss": 0.3669511079788208, "step": 15310 }, { "ce_loss": 0.058586835861206055, "epoch": 5.106737825216811, "step": 15310 }, { "distill_loss": 0.10620009154081345, "epoch": 5.106737825216811, "step": 15310 }, { "epoch": 5.106737825216811, "ref_ce_loss": 0.14764153957366943, "step": 15310 }, { "epoch": 5.1100733822548365, "loss": 0.3812, "step": 15320 }, { "epoch": 5.1100733822548365, "grad_norm": 2.4810564517974854, "step": 15320 }, { "epoch": 5.1100733822548365, "learning_rate": 9.147652752759394e-05, "step": 15320 }, { "epoch": 5.1100733822548365, "loss": 0.4458327889442444, "step": 15320 }, { "ce_loss": 0.09967893362045288, "epoch": 5.1100733822548365, "step": 15320 }, { "distill_loss": 0.13925136625766754, "epoch": 5.1100733822548365, "step": 15320 }, { "epoch": 5.1100733822548365, "ref_ce_loss": 0.07300533354282379, "step": 15320 }, { "epoch": 5.1100733822548365, "loss": 0.47262224555015564, "step": 15320 }, { "ce_loss": 0.18473461270332336, "epoch": 5.1100733822548365, "step": 15320 }, { "distill_loss": 0.1558404266834259, "epoch": 5.1100733822548365, "step": 15320 }, { "epoch": 5.1100733822548365, "ref_ce_loss": 0.11091091483831406, "step": 15320 }, { "epoch": 5.1100733822548365, "loss": 0.3459745943546295, "step": 15320 }, { "ce_loss": 0.07840926945209503, "epoch": 5.1100733822548365, "step": 15320 }, { "distill_loss": 0.09304723888635635, "epoch": 5.1100733822548365, "step": 15320 }, { "epoch": 5.1100733822548365, "ref_ce_loss": 0.08125700801610947, "step": 15320 }, { "epoch": 5.1100733822548365, "loss": 0.18763215839862823, "step": 15320 }, { "ce_loss": 0.034408580511808395, "epoch": 5.1100733822548365, "step": 15320 }, { "distill_loss": 0.0835486575961113, "epoch": 5.1100733822548365, "step": 15320 }, { "epoch": 5.1100733822548365, "ref_ce_loss": 0.045321013778448105, "step": 15320 }, { "epoch": 5.113408939292862, "loss": 0.3723, "step": 15330 }, { "epoch": 5.113408939292862, "grad_norm": 2.5682082176208496, "step": 15330 }, { "epoch": 5.113408939292862, "learning_rate": 9.129007280532144e-05, "step": 15330 }, { "epoch": 5.113408939292862, "loss": 0.41082343459129333, "step": 15330 }, { "ce_loss": 0.12252845615148544, "epoch": 5.113408939292862, "step": 15330 }, { "distill_loss": 0.1458989977836609, "epoch": 5.113408939292862, "step": 15330 }, { "epoch": 5.113408939292862, "ref_ce_loss": 0.10372576117515564, "step": 15330 }, { "epoch": 5.113408939292862, "loss": 0.26919203996658325, "step": 15330 }, { "ce_loss": 0.033415187150239944, "epoch": 5.113408939292862, "step": 15330 }, { "distill_loss": 0.09982331097126007, "epoch": 5.113408939292862, "step": 15330 }, { "epoch": 5.113408939292862, "ref_ce_loss": 0.0936480164527893, "step": 15330 }, { "epoch": 5.113408939292862, "loss": 0.3246932625770569, "step": 15330 }, { "ce_loss": 0.09546466171741486, "epoch": 5.113408939292862, "step": 15330 }, { "distill_loss": 0.10523758083581924, "epoch": 5.113408939292862, "step": 15330 }, { "epoch": 5.113408939292862, "ref_ce_loss": 0.07602840662002563, "step": 15330 }, { "epoch": 5.113408939292862, "loss": 0.17769919335842133, "step": 15330 }, { "ce_loss": 0.01439160481095314, "epoch": 5.113408939292862, "step": 15330 }, { "distill_loss": 0.08686622977256775, "epoch": 5.113408939292862, "step": 15330 }, { "epoch": 5.113408939292862, "ref_ce_loss": 0.05428411811590195, "step": 15330 }, { "epoch": 5.116744496330887, "loss": 0.3794, "step": 15340 }, { "epoch": 5.116744496330887, "grad_norm": 3.887604236602783, "step": 15340 }, { "epoch": 5.116744496330887, "learning_rate": 9.110372514679691e-05, "step": 15340 }, { "epoch": 5.116744496330887, "loss": 0.39693522453308105, "step": 15340 }, { "ce_loss": 0.11991281062364578, "epoch": 5.116744496330887, "step": 15340 }, { "distill_loss": 0.12842141091823578, "epoch": 5.116744496330887, "step": 15340 }, { "epoch": 5.116744496330887, "ref_ce_loss": 0.07633031904697418, "step": 15340 }, { "epoch": 5.116744496330887, "loss": 0.5600422620773315, "step": 15340 }, { "ce_loss": 0.10626500844955444, "epoch": 5.116744496330887, "step": 15340 }, { "distill_loss": 0.08879688382148743, "epoch": 5.116744496330887, "step": 15340 }, { "epoch": 5.116744496330887, "ref_ce_loss": 0.08295416831970215, "step": 15340 }, { "epoch": 5.116744496330887, "loss": 0.24426814913749695, "step": 15340 }, { "ce_loss": 0.0578928105533123, "epoch": 5.116744496330887, "step": 15340 }, { "distill_loss": 0.12534251809120178, "epoch": 5.116744496330887, "step": 15340 }, { "epoch": 5.116744496330887, "ref_ce_loss": 0.06088608503341675, "step": 15340 }, { "epoch": 5.116744496330887, "loss": 0.2940409481525421, "step": 15340 }, { "ce_loss": 0.033138569444417953, "epoch": 5.116744496330887, "step": 15340 }, { "distill_loss": 0.11227000504732132, "epoch": 5.116744496330887, "step": 15340 }, { "epoch": 5.116744496330887, "ref_ce_loss": 0.06146222725510597, "step": 15340 }, { "epoch": 5.120080053368913, "loss": 0.3713, "step": 15350 }, { "epoch": 5.120080053368913, "grad_norm": 1.7324687242507935, "step": 15350 }, { "epoch": 5.120080053368913, "learning_rate": 9.091748489184506e-05, "step": 15350 }, { "epoch": 5.120080053368913, "loss": 0.32019171118736267, "step": 15350 }, { "ce_loss": 0.09052632749080658, "epoch": 5.120080053368913, "step": 15350 }, { "distill_loss": 0.09617231041193008, "epoch": 5.120080053368913, "step": 15350 }, { "epoch": 5.120080053368913, "ref_ce_loss": 0.09810184687376022, "step": 15350 }, { "epoch": 5.120080053368913, "loss": 0.21579107642173767, "step": 15350 }, { "ce_loss": 0.02102779597043991, "epoch": 5.120080053368913, "step": 15350 }, { "distill_loss": 0.10144367069005966, "epoch": 5.120080053368913, "step": 15350 }, { "epoch": 5.120080053368913, "ref_ce_loss": 0.09315359592437744, "step": 15350 }, { "epoch": 5.120080053368913, "loss": 0.24842038750648499, "step": 15350 }, { "ce_loss": 0.09609237313270569, "epoch": 5.120080053368913, "step": 15350 }, { "distill_loss": 0.08486215770244598, "epoch": 5.120080053368913, "step": 15350 }, { "epoch": 5.120080053368913, "ref_ce_loss": 0.06740619242191315, "step": 15350 }, { "epoch": 5.120080053368913, "loss": 0.2568802237510681, "step": 15350 }, { "ce_loss": 0.10689549893140793, "epoch": 5.120080053368913, "step": 15350 }, { "distill_loss": 0.09282051026821136, "epoch": 5.120080053368913, "step": 15350 }, { "epoch": 5.120080053368913, "ref_ce_loss": 0.0404185950756073, "step": 15350 }, { "epoch": 5.123415610406938, "loss": 0.2897, "step": 15360 }, { "epoch": 5.123415610406938, "grad_norm": 2.086287260055542, "step": 15360 }, { "epoch": 5.123415610406938, "learning_rate": 9.073135238009464e-05, "step": 15360 }, { "epoch": 5.123415610406938, "loss": 0.24856829643249512, "step": 15360 }, { "ce_loss": 0.04894977807998657, "epoch": 5.123415610406938, "step": 15360 }, { "distill_loss": 0.09595238417387009, "epoch": 5.123415610406938, "step": 15360 }, { "epoch": 5.123415610406938, "ref_ce_loss": 0.04782169312238693, "step": 15360 }, { "epoch": 5.123415610406938, "loss": 0.2760925590991974, "step": 15360 }, { "ce_loss": 0.06440954655408859, "epoch": 5.123415610406938, "step": 15360 }, { "distill_loss": 0.12960557639598846, "epoch": 5.123415610406938, "step": 15360 }, { "epoch": 5.123415610406938, "ref_ce_loss": 0.05290549620985985, "step": 15360 }, { "epoch": 5.123415610406938, "loss": 0.2818790078163147, "step": 15360 }, { "ce_loss": 0.05980372801423073, "epoch": 5.123415610406938, "step": 15360 }, { "distill_loss": 0.14864784479141235, "epoch": 5.123415610406938, "step": 15360 }, { "epoch": 5.123415610406938, "ref_ce_loss": 0.05248422548174858, "step": 15360 }, { "epoch": 5.123415610406938, "loss": 0.21428939700126648, "step": 15360 }, { "ce_loss": 0.02077779360115528, "epoch": 5.123415610406938, "step": 15360 }, { "distill_loss": 0.11778127402067184, "epoch": 5.123415610406938, "step": 15360 }, { "epoch": 5.123415610406938, "ref_ce_loss": 0.07553339004516602, "step": 15360 }, { "epoch": 5.126751167444963, "loss": 0.3455, "step": 15370 }, { "epoch": 5.126751167444963, "grad_norm": 3.2243845462799072, "step": 15370 }, { "epoch": 5.126751167444963, "learning_rate": 9.054532795097787e-05, "step": 15370 }, { "epoch": 5.126751167444963, "loss": 0.39891764521598816, "step": 15370 }, { "ce_loss": 0.028408152982592583, "epoch": 5.126751167444963, "step": 15370 }, { "distill_loss": 0.07895895093679428, "epoch": 5.126751167444963, "step": 15370 }, { "epoch": 5.126751167444963, "ref_ce_loss": 0.06767138093709946, "step": 15370 }, { "epoch": 5.126751167444963, "loss": 0.3717252016067505, "step": 15370 }, { "ce_loss": 0.056756921112537384, "epoch": 5.126751167444963, "step": 15370 }, { "distill_loss": 0.1180783361196518, "epoch": 5.126751167444963, "step": 15370 }, { "epoch": 5.126751167444963, "ref_ce_loss": 0.04642847180366516, "step": 15370 }, { "epoch": 5.126751167444963, "loss": 0.3460141122341156, "step": 15370 }, { "ce_loss": 0.07608392834663391, "epoch": 5.126751167444963, "step": 15370 }, { "distill_loss": 0.12452583014965057, "epoch": 5.126751167444963, "step": 15370 }, { "epoch": 5.126751167444963, "ref_ce_loss": 0.07184207439422607, "step": 15370 }, { "epoch": 5.126751167444963, "loss": 0.33525413274765015, "step": 15370 }, { "ce_loss": 0.05221555754542351, "epoch": 5.126751167444963, "step": 15370 }, { "distill_loss": 0.10671718418598175, "epoch": 5.126751167444963, "step": 15370 }, { "epoch": 5.126751167444963, "ref_ce_loss": 0.06864657998085022, "step": 15370 }, { "epoch": 5.130086724482989, "loss": 0.3404, "step": 15380 }, { "epoch": 5.130086724482989, "grad_norm": 1.6246341466903687, "step": 15380 }, { "epoch": 5.130086724482989, "learning_rate": 9.035941194373002e-05, "step": 15380 }, { "epoch": 5.130086724482989, "loss": 0.29942047595977783, "step": 15380 }, { "ce_loss": 0.07252945005893707, "epoch": 5.130086724482989, "step": 15380 }, { "distill_loss": 0.11366333812475204, "epoch": 5.130086724482989, "step": 15380 }, { "epoch": 5.130086724482989, "ref_ce_loss": 0.08319073170423508, "step": 15380 }, { "epoch": 5.130086724482989, "loss": 0.9193277359008789, "step": 15380 }, { "ce_loss": 0.10665949434041977, "epoch": 5.130086724482989, "step": 15380 }, { "distill_loss": 0.11006686836481094, "epoch": 5.130086724482989, "step": 15380 }, { "epoch": 5.130086724482989, "ref_ce_loss": 0.052423495799303055, "step": 15380 }, { "epoch": 5.130086724482989, "loss": 0.24829605221748352, "step": 15380 }, { "ce_loss": 0.043987125158309937, "epoch": 5.130086724482989, "step": 15380 }, { "distill_loss": 0.10426180064678192, "epoch": 5.130086724482989, "step": 15380 }, { "epoch": 5.130086724482989, "ref_ce_loss": 0.07357150316238403, "step": 15380 }, { "epoch": 5.130086724482989, "loss": 0.3645699918270111, "step": 15380 }, { "ce_loss": 0.10562653839588165, "epoch": 5.130086724482989, "step": 15380 }, { "distill_loss": 0.11489978432655334, "epoch": 5.130086724482989, "step": 15380 }, { "epoch": 5.130086724482989, "ref_ce_loss": 0.07542836666107178, "step": 15380 }, { "epoch": 5.133422281521014, "loss": 0.3531, "step": 15390 }, { "epoch": 5.133422281521014, "grad_norm": 2.2442030906677246, "step": 15390 }, { "epoch": 5.133422281521014, "learning_rate": 9.01736046973884e-05, "step": 15390 }, { "epoch": 5.133422281521014, "loss": 0.4058710038661957, "step": 15390 }, { "ce_loss": 0.08470144122838974, "epoch": 5.133422281521014, "step": 15390 }, { "distill_loss": 0.14811912178993225, "epoch": 5.133422281521014, "step": 15390 }, { "epoch": 5.133422281521014, "ref_ce_loss": 0.10741455852985382, "step": 15390 }, { "epoch": 5.133422281521014, "loss": 0.23130950331687927, "step": 15390 }, { "ce_loss": 0.06238336116075516, "epoch": 5.133422281521014, "step": 15390 }, { "distill_loss": 0.08404608815908432, "epoch": 5.133422281521014, "step": 15390 }, { "epoch": 5.133422281521014, "ref_ce_loss": 0.06146818771958351, "step": 15390 }, { "epoch": 5.133422281521014, "loss": 0.3462778925895691, "step": 15390 }, { "ce_loss": 0.08793909847736359, "epoch": 5.133422281521014, "step": 15390 }, { "distill_loss": 0.11264742165803909, "epoch": 5.133422281521014, "step": 15390 }, { "epoch": 5.133422281521014, "ref_ce_loss": 0.04813896492123604, "step": 15390 }, { "epoch": 5.133422281521014, "loss": 0.25730276107788086, "step": 15390 }, { "ce_loss": 0.05243609845638275, "epoch": 5.133422281521014, "step": 15390 }, { "distill_loss": 0.10522018373012543, "epoch": 5.133422281521014, "step": 15390 }, { "epoch": 5.133422281521014, "ref_ce_loss": 0.07645328342914581, "step": 15390 }, { "epoch": 5.136757838559039, "loss": 0.3227, "step": 15400 }, { "epoch": 5.136757838559039, "grad_norm": 3.527695655822754, "step": 15400 }, { "epoch": 5.136757838559039, "learning_rate": 8.998790655079227e-05, "step": 15400 }, { "epoch": 5.136757838559039, "loss": 0.519390344619751, "step": 15400 }, { "ce_loss": 0.08348696678876877, "epoch": 5.136757838559039, "step": 15400 }, { "distill_loss": 0.1378469467163086, "epoch": 5.136757838559039, "step": 15400 }, { "epoch": 5.136757838559039, "ref_ce_loss": 0.10278098285198212, "step": 15400 }, { "epoch": 5.136757838559039, "loss": 0.32452601194381714, "step": 15400 }, { "ce_loss": 0.04311065003275871, "epoch": 5.136757838559039, "step": 15400 }, { "distill_loss": 0.08557398617267609, "epoch": 5.136757838559039, "step": 15400 }, { "epoch": 5.136757838559039, "ref_ce_loss": 0.05109192430973053, "step": 15400 }, { "epoch": 5.136757838559039, "loss": 0.6966140866279602, "step": 15400 }, { "ce_loss": 0.04068000242114067, "epoch": 5.136757838559039, "step": 15400 }, { "distill_loss": 0.10309980064630508, "epoch": 5.136757838559039, "step": 15400 }, { "epoch": 5.136757838559039, "ref_ce_loss": 0.08326347917318344, "step": 15400 }, { "epoch": 5.136757838559039, "loss": 0.3585582375526428, "step": 15400 }, { "ce_loss": 0.10225559026002884, "epoch": 5.136757838559039, "step": 15400 }, { "distill_loss": 0.08397220820188522, "epoch": 5.136757838559039, "step": 15400 }, { "epoch": 5.136757838559039, "ref_ce_loss": 0.09892605245113373, "step": 15400 }, { "epoch": 5.140093395597065, "loss": 0.3735, "step": 15410 }, { "epoch": 5.140093395597065, "grad_norm": 4.83349609375, "step": 15410 }, { "epoch": 5.140093395597065, "learning_rate": 8.980231784258181e-05, "step": 15410 }, { "epoch": 5.140093395597065, "loss": 0.277986615896225, "step": 15410 }, { "ce_loss": 0.05845331773161888, "epoch": 5.140093395597065, "step": 15410 }, { "distill_loss": 0.0897456556558609, "epoch": 5.140093395597065, "step": 15410 }, { "epoch": 5.140093395597065, "ref_ce_loss": 0.09144359081983566, "step": 15410 }, { "epoch": 5.140093395597065, "loss": 0.4590834081172943, "step": 15410 }, { "ce_loss": 0.14646229147911072, "epoch": 5.140093395597065, "step": 15410 }, { "distill_loss": 0.159382164478302, "epoch": 5.140093395597065, "step": 15410 }, { "epoch": 5.140093395597065, "ref_ce_loss": 0.11667078733444214, "step": 15410 }, { "epoch": 5.140093395597065, "loss": 0.2067529857158661, "step": 15410 }, { "ce_loss": 0.044581297785043716, "epoch": 5.140093395597065, "step": 15410 }, { "distill_loss": 0.08390741050243378, "epoch": 5.140093395597065, "step": 15410 }, { "epoch": 5.140093395597065, "ref_ce_loss": 0.05592619255185127, "step": 15410 }, { "epoch": 5.140093395597065, "loss": 0.20260076224803925, "step": 15410 }, { "ce_loss": 0.02507142350077629, "epoch": 5.140093395597065, "step": 15410 }, { "distill_loss": 0.07849801331758499, "epoch": 5.140093395597065, "step": 15410 }, { "epoch": 5.140093395597065, "ref_ce_loss": 0.052643340080976486, "step": 15410 }, { "epoch": 5.14342895263509, "loss": 0.3161, "step": 15420 }, { "epoch": 5.14342895263509, "grad_norm": 1.7739144563674927, "step": 15420 }, { "epoch": 5.14342895263509, "learning_rate": 8.961683891119746e-05, "step": 15420 }, { "epoch": 5.14342895263509, "loss": 0.18003131449222565, "step": 15420 }, { "ce_loss": 0.04009293392300606, "epoch": 5.14342895263509, "step": 15420 }, { "distill_loss": 0.0766187459230423, "epoch": 5.14342895263509, "step": 15420 }, { "epoch": 5.14342895263509, "ref_ce_loss": 0.046599678695201874, "step": 15420 }, { "epoch": 5.14342895263509, "loss": 0.16576485335826874, "step": 15420 }, { "ce_loss": 0.0029233212117105722, "epoch": 5.14342895263509, "step": 15420 }, { "distill_loss": 0.08446105569601059, "epoch": 5.14342895263509, "step": 15420 }, { "epoch": 5.14342895263509, "ref_ce_loss": 0.0579720139503479, "step": 15420 }, { "epoch": 5.14342895263509, "loss": 0.5865795612335205, "step": 15420 }, { "ce_loss": 0.1778968721628189, "epoch": 5.14342895263509, "step": 15420 }, { "distill_loss": 0.15113386511802673, "epoch": 5.14342895263509, "step": 15420 }, { "epoch": 5.14342895263509, "ref_ce_loss": 0.08872532844543457, "step": 15420 }, { "epoch": 5.14342895263509, "loss": 0.3178934156894684, "step": 15420 }, { "ce_loss": 0.11432060599327087, "epoch": 5.14342895263509, "step": 15420 }, { "distill_loss": 0.11662434786558151, "epoch": 5.14342895263509, "step": 15420 }, { "epoch": 5.14342895263509, "ref_ce_loss": 0.08683821558952332, "step": 15420 }, { "epoch": 5.146764509673115, "loss": 0.3325, "step": 15430 }, { "epoch": 5.146764509673115, "grad_norm": 2.481506824493408, "step": 15430 }, { "epoch": 5.146764509673115, "learning_rate": 8.943147009487982e-05, "step": 15430 }, { "epoch": 5.146764509673115, "loss": 0.24885477125644684, "step": 15430 }, { "ce_loss": 0.04392295703291893, "epoch": 5.146764509673115, "step": 15430 }, { "distill_loss": 0.07781894505023956, "epoch": 5.146764509673115, "step": 15430 }, { "epoch": 5.146764509673115, "ref_ce_loss": 0.050707168877124786, "step": 15430 }, { "epoch": 5.146764509673115, "loss": 0.38043659925460815, "step": 15430 }, { "ce_loss": 0.11024061590433121, "epoch": 5.146764509673115, "step": 15430 }, { "distill_loss": 0.1460980325937271, "epoch": 5.146764509673115, "step": 15430 }, { "epoch": 5.146764509673115, "ref_ce_loss": 0.0721927061676979, "step": 15430 }, { "epoch": 5.146764509673115, "loss": 0.2317284792661667, "step": 15430 }, { "ce_loss": 0.051855847239494324, "epoch": 5.146764509673115, "step": 15430 }, { "distill_loss": 0.09237996488809586, "epoch": 5.146764509673115, "step": 15430 }, { "epoch": 5.146764509673115, "ref_ce_loss": 0.0872517004609108, "step": 15430 }, { "epoch": 5.146764509673115, "loss": 0.2589787542819977, "step": 15430 }, { "ce_loss": 0.03987698629498482, "epoch": 5.146764509673115, "step": 15430 }, { "distill_loss": 0.08046362549066544, "epoch": 5.146764509673115, "step": 15430 }, { "epoch": 5.146764509673115, "ref_ce_loss": 0.034916963428258896, "step": 15430 }, { "epoch": 5.150100066711141, "loss": 0.3243, "step": 15440 }, { "epoch": 5.150100066711141, "grad_norm": 3.888387680053711, "step": 15440 }, { "epoch": 5.150100066711141, "learning_rate": 8.924621173166832e-05, "step": 15440 }, { "epoch": 5.150100066711141, "loss": 0.3526419997215271, "step": 15440 }, { "ce_loss": 0.06011297181248665, "epoch": 5.150100066711141, "step": 15440 }, { "distill_loss": 0.11874806135892868, "epoch": 5.150100066711141, "step": 15440 }, { "epoch": 5.150100066711141, "ref_ce_loss": 0.09072629362344742, "step": 15440 }, { "epoch": 5.150100066711141, "loss": 0.38294023275375366, "step": 15440 }, { "ce_loss": 0.1409599930047989, "epoch": 5.150100066711141, "step": 15440 }, { "distill_loss": 0.14294949173927307, "epoch": 5.150100066711141, "step": 15440 }, { "epoch": 5.150100066711141, "ref_ce_loss": 0.09855452924966812, "step": 15440 }, { "epoch": 5.150100066711141, "loss": 0.22868633270263672, "step": 15440 }, { "ce_loss": 0.06168137490749359, "epoch": 5.150100066711141, "step": 15440 }, { "distill_loss": 0.09113404154777527, "epoch": 5.150100066711141, "step": 15440 }, { "epoch": 5.150100066711141, "ref_ce_loss": 0.07526343315839767, "step": 15440 }, { "epoch": 5.150100066711141, "loss": 0.39461877942085266, "step": 15440 }, { "ce_loss": 0.12180479615926743, "epoch": 5.150100066711141, "step": 15440 }, { "distill_loss": 0.1197972521185875, "epoch": 5.150100066711141, "step": 15440 }, { "epoch": 5.150100066711141, "ref_ce_loss": 0.08709913492202759, "step": 15440 }, { "epoch": 5.153435623749166, "loss": 0.3175, "step": 15450 }, { "epoch": 5.153435623749166, "grad_norm": 1.7098679542541504, "step": 15450 }, { "epoch": 5.153435623749166, "learning_rate": 8.906106415940117e-05, "step": 15450 }, { "epoch": 5.153435623749166, "loss": 0.269246369600296, "step": 15450 }, { "ce_loss": 0.08349336683750153, "epoch": 5.153435623749166, "step": 15450 }, { "distill_loss": 0.08976483345031738, "epoch": 5.153435623749166, "step": 15450 }, { "epoch": 5.153435623749166, "ref_ce_loss": 0.07661431282758713, "step": 15450 }, { "epoch": 5.153435623749166, "loss": 0.887234091758728, "step": 15450 }, { "ce_loss": 0.07439880073070526, "epoch": 5.153435623749166, "step": 15450 }, { "distill_loss": 0.1128494068980217, "epoch": 5.153435623749166, "step": 15450 }, { "epoch": 5.153435623749166, "ref_ce_loss": 0.09533000737428665, "step": 15450 }, { "epoch": 5.153435623749166, "loss": 0.4059937596321106, "step": 15450 }, { "ce_loss": 0.06530610471963882, "epoch": 5.153435623749166, "step": 15450 }, { "distill_loss": 0.09470158070325851, "epoch": 5.153435623749166, "step": 15450 }, { "epoch": 5.153435623749166, "ref_ce_loss": 0.06246132403612137, "step": 15450 }, { "epoch": 5.153435623749166, "loss": 0.25273844599723816, "step": 15450 }, { "ce_loss": 0.08435259014368057, "epoch": 5.153435623749166, "step": 15450 }, { "distill_loss": 0.07865085452795029, "epoch": 5.153435623749166, "step": 15450 }, { "epoch": 5.153435623749166, "ref_ce_loss": 0.0672929659485817, "step": 15450 }, { "epoch": 5.156771180787191, "loss": 0.3104, "step": 15460 }, { "epoch": 5.156771180787191, "grad_norm": 2.205923080444336, "step": 15460 }, { "epoch": 5.156771180787191, "learning_rate": 8.887602771571466e-05, "step": 15460 }, { "epoch": 5.156771180787191, "loss": 0.19020073115825653, "step": 15460 }, { "ce_loss": 0.033548977226018906, "epoch": 5.156771180787191, "step": 15460 }, { "distill_loss": 0.0989307090640068, "epoch": 5.156771180787191, "step": 15460 }, { "epoch": 5.156771180787191, "ref_ce_loss": 0.057435162365436554, "step": 15460 }, { "epoch": 5.156771180787191, "loss": 0.2394929677248001, "step": 15460 }, { "ce_loss": 0.07328430563211441, "epoch": 5.156771180787191, "step": 15460 }, { "distill_loss": 0.08450906723737717, "epoch": 5.156771180787191, "step": 15460 }, { "epoch": 5.156771180787191, "ref_ce_loss": 0.056285351514816284, "step": 15460 }, { "epoch": 5.156771180787191, "loss": 0.20999081432819366, "step": 15460 }, { "ce_loss": 0.029533574357628822, "epoch": 5.156771180787191, "step": 15460 }, { "distill_loss": 0.07489515841007233, "epoch": 5.156771180787191, "step": 15460 }, { "epoch": 5.156771180787191, "ref_ce_loss": 0.07470680773258209, "step": 15460 }, { "epoch": 5.156771180787191, "loss": 0.22600901126861572, "step": 15460 }, { "ce_loss": 0.06494138389825821, "epoch": 5.156771180787191, "step": 15460 }, { "distill_loss": 0.10133645683526993, "epoch": 5.156771180787191, "step": 15460 }, { "epoch": 5.156771180787191, "ref_ce_loss": 0.05953230336308479, "step": 15460 }, { "epoch": 5.160106737825217, "loss": 0.3429, "step": 15470 }, { "epoch": 5.160106737825217, "grad_norm": 2.1586124897003174, "step": 15470 }, { "epoch": 5.160106737825217, "learning_rate": 8.86911027380421e-05, "step": 15470 }, { "epoch": 5.160106737825217, "loss": 0.24646462500095367, "step": 15470 }, { "ce_loss": 0.07469391077756882, "epoch": 5.160106737825217, "step": 15470 }, { "distill_loss": 0.09838026016950607, "epoch": 5.160106737825217, "step": 15470 }, { "epoch": 5.160106737825217, "ref_ce_loss": 0.04848563298583031, "step": 15470 }, { "epoch": 5.160106737825217, "loss": 0.3478372097015381, "step": 15470 }, { "ce_loss": 0.08327619731426239, "epoch": 5.160106737825217, "step": 15470 }, { "distill_loss": 0.13971808552742004, "epoch": 5.160106737825217, "step": 15470 }, { "epoch": 5.160106737825217, "ref_ce_loss": 0.10225691646337509, "step": 15470 }, { "epoch": 5.160106737825217, "loss": 0.20459289848804474, "step": 15470 }, { "ce_loss": 0.04662002623081207, "epoch": 5.160106737825217, "step": 15470 }, { "distill_loss": 0.07863349467515945, "epoch": 5.160106737825217, "step": 15470 }, { "epoch": 5.160106737825217, "ref_ce_loss": 0.05946270748972893, "step": 15470 }, { "epoch": 5.160106737825217, "loss": 0.29593658447265625, "step": 15470 }, { "ce_loss": 0.07237986475229263, "epoch": 5.160106737825217, "step": 15470 }, { "distill_loss": 0.11817031353712082, "epoch": 5.160106737825217, "step": 15470 }, { "epoch": 5.160106737825217, "ref_ce_loss": 0.04803453013300896, "step": 15470 }, { "epoch": 5.163442294863242, "loss": 0.3237, "step": 15480 }, { "epoch": 5.163442294863242, "grad_norm": 2.479261636734009, "step": 15480 }, { "epoch": 5.163442294863242, "learning_rate": 8.850628956361376e-05, "step": 15480 }, { "epoch": 5.163442294863242, "loss": 0.6319223642349243, "step": 15480 }, { "ce_loss": 0.10117001831531525, "epoch": 5.163442294863242, "step": 15480 }, { "distill_loss": 0.12282003462314606, "epoch": 5.163442294863242, "step": 15480 }, { "epoch": 5.163442294863242, "ref_ce_loss": 0.0641319677233696, "step": 15480 }, { "epoch": 5.163442294863242, "loss": 0.27091315388679504, "step": 15480 }, { "ce_loss": 0.08988615870475769, "epoch": 5.163442294863242, "step": 15480 }, { "distill_loss": 0.1101350411772728, "epoch": 5.163442294863242, "step": 15480 }, { "epoch": 5.163442294863242, "ref_ce_loss": 0.0706997960805893, "step": 15480 }, { "epoch": 5.163442294863242, "loss": 0.3440433144569397, "step": 15480 }, { "ce_loss": 0.0868951752781868, "epoch": 5.163442294863242, "step": 15480 }, { "distill_loss": 0.13729919493198395, "epoch": 5.163442294863242, "step": 15480 }, { "epoch": 5.163442294863242, "ref_ce_loss": 0.08966067433357239, "step": 15480 }, { "epoch": 5.163442294863242, "loss": 0.5818476676940918, "step": 15480 }, { "ce_loss": 0.11106500774621964, "epoch": 5.163442294863242, "step": 15480 }, { "distill_loss": 0.08393864333629608, "epoch": 5.163442294863242, "step": 15480 }, { "epoch": 5.163442294863242, "ref_ce_loss": 0.07005763053894043, "step": 15480 }, { "epoch": 5.1667778519012675, "loss": 0.3475, "step": 15490 }, { "epoch": 5.1667778519012675, "grad_norm": 3.3304479122161865, "step": 15490 }, { "epoch": 5.1667778519012675, "learning_rate": 8.832158852945596e-05, "step": 15490 }, { "epoch": 5.1667778519012675, "loss": 0.1807374358177185, "step": 15490 }, { "ce_loss": 0.01700657792389393, "epoch": 5.1667778519012675, "step": 15490 }, { "distill_loss": 0.06784248352050781, "epoch": 5.1667778519012675, "step": 15490 }, { "epoch": 5.1667778519012675, "ref_ce_loss": 0.05106615647673607, "step": 15490 }, { "epoch": 5.1667778519012675, "loss": 0.27543309330940247, "step": 15490 }, { "ce_loss": 0.08672452718019485, "epoch": 5.1667778519012675, "step": 15490 }, { "distill_loss": 0.08889733999967575, "epoch": 5.1667778519012675, "step": 15490 }, { "epoch": 5.1667778519012675, "ref_ce_loss": 0.09958288818597794, "step": 15490 }, { "epoch": 5.1667778519012675, "loss": 0.2450074553489685, "step": 15490 }, { "ce_loss": 0.0512431301176548, "epoch": 5.1667778519012675, "step": 15490 }, { "distill_loss": 0.10720741748809814, "epoch": 5.1667778519012675, "step": 15490 }, { "epoch": 5.1667778519012675, "ref_ce_loss": 0.0644150972366333, "step": 15490 }, { "epoch": 5.1667778519012675, "loss": 0.14427991211414337, "step": 15490 }, { "ce_loss": 0.02231277897953987, "epoch": 5.1667778519012675, "step": 15490 }, { "distill_loss": 0.06964313983917236, "epoch": 5.1667778519012675, "step": 15490 }, { "epoch": 5.1667778519012675, "ref_ce_loss": 0.05178777500987053, "step": 15490 }, { "epoch": 5.170113408939293, "loss": 0.301, "step": 15500 }, { "epoch": 5.170113408939293, "grad_norm": 3.6208198070526123, "step": 15500 }, { "epoch": 5.170113408939293, "learning_rate": 8.813699997239051e-05, "step": 15500 }, { "epoch": 5.170113408939293, "loss": 0.4033278226852417, "step": 15500 }, { "ce_loss": 0.14259237051010132, "epoch": 5.170113408939293, "step": 15500 }, { "distill_loss": 0.10619431734085083, "epoch": 5.170113408939293, "step": 15500 }, { "epoch": 5.170113408939293, "ref_ce_loss": 0.10307341068983078, "step": 15500 }, { "epoch": 5.170113408939293, "loss": 0.3729739487171173, "step": 15500 }, { "ce_loss": 0.1008945181965828, "epoch": 5.170113408939293, "step": 15500 }, { "distill_loss": 0.13121455907821655, "epoch": 5.170113408939293, "step": 15500 }, { "epoch": 5.170113408939293, "ref_ce_loss": 0.07845688611268997, "step": 15500 }, { "epoch": 5.170113408939293, "loss": 0.2734251618385315, "step": 15500 }, { "ce_loss": 0.06486482173204422, "epoch": 5.170113408939293, "step": 15500 }, { "distill_loss": 0.1164950504899025, "epoch": 5.170113408939293, "step": 15500 }, { "epoch": 5.170113408939293, "ref_ce_loss": 0.09184136241674423, "step": 15500 }, { "epoch": 5.170113408939293, "loss": 0.2883954644203186, "step": 15500 }, { "ce_loss": 0.03929543495178223, "epoch": 5.170113408939293, "step": 15500 }, { "distill_loss": 0.10786114633083344, "epoch": 5.170113408939293, "step": 15500 }, { "epoch": 5.170113408939293, "ref_ce_loss": 0.05867098271846771, "step": 15500 }, { "epoch": 5.173448965977318, "loss": 0.3157, "step": 15510 }, { "epoch": 5.173448965977318, "grad_norm": 1.871299386024475, "step": 15510 }, { "epoch": 5.173448965977318, "learning_rate": 8.795252422903419e-05, "step": 15510 }, { "epoch": 5.173448965977318, "loss": 0.32258161902427673, "step": 15510 }, { "ce_loss": 0.08582668751478195, "epoch": 5.173448965977318, "step": 15510 }, { "distill_loss": 0.11141744256019592, "epoch": 5.173448965977318, "step": 15510 }, { "epoch": 5.173448965977318, "ref_ce_loss": 0.08088202774524689, "step": 15510 }, { "epoch": 5.173448965977318, "loss": 0.7449796795845032, "step": 15510 }, { "ce_loss": 0.07312709093093872, "epoch": 5.173448965977318, "step": 15510 }, { "distill_loss": 0.10794167220592499, "epoch": 5.173448965977318, "step": 15510 }, { "epoch": 5.173448965977318, "ref_ce_loss": 0.057008083909749985, "step": 15510 }, { "epoch": 5.173448965977318, "loss": 0.2967376708984375, "step": 15510 }, { "ce_loss": 0.1069604903459549, "epoch": 5.173448965977318, "step": 15510 }, { "distill_loss": 0.08719157427549362, "epoch": 5.173448965977318, "step": 15510 }, { "epoch": 5.173448965977318, "ref_ce_loss": 0.07663427293300629, "step": 15510 }, { "epoch": 5.173448965977318, "loss": 0.23122243583202362, "step": 15510 }, { "ce_loss": 0.04379252344369888, "epoch": 5.173448965977318, "step": 15510 }, { "distill_loss": 0.06141211465001106, "epoch": 5.173448965977318, "step": 15510 }, { "epoch": 5.173448965977318, "ref_ce_loss": 0.06730645149946213, "step": 15510 }, { "epoch": 5.1767845230153435, "loss": 0.3259, "step": 15520 }, { "epoch": 5.1767845230153435, "grad_norm": 3.4619665145874023, "step": 15520 }, { "epoch": 5.1767845230153435, "learning_rate": 8.776816163579793e-05, "step": 15520 }, { "epoch": 5.1767845230153435, "loss": 0.3403814733028412, "step": 15520 }, { "ce_loss": 0.0703282505273819, "epoch": 5.1767845230153435, "step": 15520 }, { "distill_loss": 0.10586895048618317, "epoch": 5.1767845230153435, "step": 15520 }, { "epoch": 5.1767845230153435, "ref_ce_loss": 0.07988087087869644, "step": 15520 }, { "epoch": 5.1767845230153435, "loss": 0.32925379276275635, "step": 15520 }, { "ce_loss": 0.07478674501180649, "epoch": 5.1767845230153435, "step": 15520 }, { "distill_loss": 0.11532959342002869, "epoch": 5.1767845230153435, "step": 15520 }, { "epoch": 5.1767845230153435, "ref_ce_loss": 0.07584071159362793, "step": 15520 }, { "epoch": 5.1767845230153435, "loss": 0.3302445709705353, "step": 15520 }, { "ce_loss": 0.07153227180242538, "epoch": 5.1767845230153435, "step": 15520 }, { "distill_loss": 0.1040826067328453, "epoch": 5.1767845230153435, "step": 15520 }, { "epoch": 5.1767845230153435, "ref_ce_loss": 0.10868798196315765, "step": 15520 }, { "epoch": 5.1767845230153435, "loss": 0.3438566327095032, "step": 15520 }, { "ce_loss": 0.1533452570438385, "epoch": 5.1767845230153435, "step": 15520 }, { "distill_loss": 0.09927986562252045, "epoch": 5.1767845230153435, "step": 15520 }, { "epoch": 5.1767845230153435, "ref_ce_loss": 0.09111157059669495, "step": 15520 }, { "epoch": 5.180120080053369, "loss": 0.3633, "step": 15530 }, { "epoch": 5.180120080053369, "grad_norm": 2.976386070251465, "step": 15530 }, { "epoch": 5.180120080053369, "learning_rate": 8.758391252888638e-05, "step": 15530 }, { "epoch": 5.180120080053369, "loss": 0.27730706334114075, "step": 15530 }, { "ce_loss": 0.04665299504995346, "epoch": 5.180120080053369, "step": 15530 }, { "distill_loss": 0.11506421864032745, "epoch": 5.180120080053369, "step": 15530 }, { "epoch": 5.180120080053369, "ref_ce_loss": 0.05354463309049606, "step": 15530 }, { "epoch": 5.180120080053369, "loss": 0.2123173624277115, "step": 15530 }, { "ce_loss": 0.0355028361082077, "epoch": 5.180120080053369, "step": 15530 }, { "distill_loss": 0.10253003984689713, "epoch": 5.180120080053369, "step": 15530 }, { "epoch": 5.180120080053369, "ref_ce_loss": 0.07338284701108932, "step": 15530 }, { "epoch": 5.180120080053369, "loss": 0.2025415003299713, "step": 15530 }, { "ce_loss": 0.06089678779244423, "epoch": 5.180120080053369, "step": 15530 }, { "distill_loss": 0.07716229557991028, "epoch": 5.180120080053369, "step": 15530 }, { "epoch": 5.180120080053369, "ref_ce_loss": 0.045202966779470444, "step": 15530 }, { "epoch": 5.180120080053369, "loss": 0.2668789029121399, "step": 15530 }, { "ce_loss": 0.052694350481033325, "epoch": 5.180120080053369, "step": 15530 }, { "distill_loss": 0.1183210164308548, "epoch": 5.180120080053369, "step": 15530 }, { "epoch": 5.180120080053369, "ref_ce_loss": 0.062336359173059464, "step": 15530 }, { "epoch": 5.183455637091394, "loss": 0.3231, "step": 15540 }, { "epoch": 5.183455637091394, "grad_norm": 2.042680501937866, "step": 15540 }, { "epoch": 5.183455637091394, "learning_rate": 8.739977724429728e-05, "step": 15540 }, { "epoch": 5.183455637091394, "loss": 0.24325260519981384, "step": 15540 }, { "ce_loss": 0.08089492470026016, "epoch": 5.183455637091394, "step": 15540 }, { "distill_loss": 0.10611046105623245, "epoch": 5.183455637091394, "step": 15540 }, { "epoch": 5.183455637091394, "ref_ce_loss": 0.05608109384775162, "step": 15540 }, { "epoch": 5.183455637091394, "loss": 0.33710145950317383, "step": 15540 }, { "ce_loss": 0.05630598962306976, "epoch": 5.183455637091394, "step": 15540 }, { "distill_loss": 0.09411294013261795, "epoch": 5.183455637091394, "step": 15540 }, { "epoch": 5.183455637091394, "ref_ce_loss": 0.06278441846370697, "step": 15540 }, { "epoch": 5.183455637091394, "loss": 0.3103136420249939, "step": 15540 }, { "ce_loss": 0.0829385444521904, "epoch": 5.183455637091394, "step": 15540 }, { "distill_loss": 0.09629233926534653, "epoch": 5.183455637091394, "step": 15540 }, { "epoch": 5.183455637091394, "ref_ce_loss": 0.07343555986881256, "step": 15540 }, { "epoch": 5.183455637091394, "loss": 0.32832831144332886, "step": 15540 }, { "ce_loss": 0.06753069162368774, "epoch": 5.183455637091394, "step": 15540 }, { "distill_loss": 0.10950666666030884, "epoch": 5.183455637091394, "step": 15540 }, { "epoch": 5.183455637091394, "ref_ce_loss": 0.07036434859037399, "step": 15540 }, { "epoch": 5.18679119412942, "loss": 0.2975, "step": 15550 }, { "epoch": 5.18679119412942, "grad_norm": 2.76190447807312, "step": 15550 }, { "epoch": 5.18679119412942, "learning_rate": 8.721575611782067e-05, "step": 15550 }, { "epoch": 5.18679119412942, "loss": 0.5175027847290039, "step": 15550 }, { "ce_loss": 0.051099736243486404, "epoch": 5.18679119412942, "step": 15550 }, { "distill_loss": 0.08739770948886871, "epoch": 5.18679119412942, "step": 15550 }, { "epoch": 5.18679119412942, "ref_ce_loss": 0.07908324152231216, "step": 15550 }, { "epoch": 5.18679119412942, "loss": 0.3586389124393463, "step": 15550 }, { "ce_loss": 0.05912512168288231, "epoch": 5.18679119412942, "step": 15550 }, { "distill_loss": 0.10872924327850342, "epoch": 5.18679119412942, "step": 15550 }, { "epoch": 5.18679119412942, "ref_ce_loss": 0.08318324387073517, "step": 15550 }, { "epoch": 5.18679119412942, "loss": 0.25400081276893616, "step": 15550 }, { "ce_loss": 0.041680727154016495, "epoch": 5.18679119412942, "step": 15550 }, { "distill_loss": 0.1011752262711525, "epoch": 5.18679119412942, "step": 15550 }, { "epoch": 5.18679119412942, "ref_ce_loss": 0.037580229341983795, "step": 15550 }, { "epoch": 5.18679119412942, "loss": 0.28214314579963684, "step": 15550 }, { "ce_loss": 0.03214351460337639, "epoch": 5.18679119412942, "step": 15550 }, { "distill_loss": 0.08211422711610794, "epoch": 5.18679119412942, "step": 15550 }, { "epoch": 5.18679119412942, "ref_ce_loss": 0.07695797830820084, "step": 15550 }, { "epoch": 5.190126751167445, "loss": 0.3197, "step": 15560 }, { "epoch": 5.190126751167445, "grad_norm": 1.7665784358978271, "step": 15560 }, { "epoch": 5.190126751167445, "learning_rate": 8.703184948503859e-05, "step": 15560 }, { "epoch": 5.190126751167445, "loss": 0.46674734354019165, "step": 15560 }, { "ce_loss": 0.06610564887523651, "epoch": 5.190126751167445, "step": 15560 }, { "distill_loss": 0.1039234921336174, "epoch": 5.190126751167445, "step": 15560 }, { "epoch": 5.190126751167445, "ref_ce_loss": 0.06362764537334442, "step": 15560 }, { "epoch": 5.190126751167445, "loss": 0.29293379187583923, "step": 15560 }, { "ce_loss": 0.1026669591665268, "epoch": 5.190126751167445, "step": 15560 }, { "distill_loss": 0.08947969228029251, "epoch": 5.190126751167445, "step": 15560 }, { "epoch": 5.190126751167445, "ref_ce_loss": 0.07734373211860657, "step": 15560 }, { "epoch": 5.190126751167445, "loss": 0.36403539776802063, "step": 15560 }, { "ce_loss": 0.10855984687805176, "epoch": 5.190126751167445, "step": 15560 }, { "distill_loss": 0.10646361112594604, "epoch": 5.190126751167445, "step": 15560 }, { "epoch": 5.190126751167445, "ref_ce_loss": 0.07490496337413788, "step": 15560 }, { "epoch": 5.190126751167445, "loss": 0.16791392862796783, "step": 15560 }, { "ce_loss": 0.03769616410136223, "epoch": 5.190126751167445, "step": 15560 }, { "distill_loss": 0.06060273200273514, "epoch": 5.190126751167445, "step": 15560 }, { "epoch": 5.190126751167445, "ref_ce_loss": 0.05328072980046272, "step": 15560 }, { "epoch": 5.19346230820547, "loss": 0.3015, "step": 15570 }, { "epoch": 5.19346230820547, "grad_norm": 2.2745091915130615, "step": 15570 }, { "epoch": 5.19346230820547, "learning_rate": 8.684805768132409e-05, "step": 15570 }, { "epoch": 5.19346230820547, "loss": 0.28142502903938293, "step": 15570 }, { "ce_loss": 0.05141424387693405, "epoch": 5.19346230820547, "step": 15570 }, { "distill_loss": 0.07907916605472565, "epoch": 5.19346230820547, "step": 15570 }, { "epoch": 5.19346230820547, "ref_ce_loss": 0.06478884071111679, "step": 15570 }, { "epoch": 5.19346230820547, "loss": 0.25589194893836975, "step": 15570 }, { "ce_loss": 0.04737861454486847, "epoch": 5.19346230820547, "step": 15570 }, { "distill_loss": 0.0656491070985794, "epoch": 5.19346230820547, "step": 15570 }, { "epoch": 5.19346230820547, "ref_ce_loss": 0.09997665882110596, "step": 15570 }, { "epoch": 5.19346230820547, "loss": 0.29582786560058594, "step": 15570 }, { "ce_loss": 0.0774170458316803, "epoch": 5.19346230820547, "step": 15570 }, { "distill_loss": 0.08943985402584076, "epoch": 5.19346230820547, "step": 15570 }, { "epoch": 5.19346230820547, "ref_ce_loss": 0.05729973688721657, "step": 15570 }, { "epoch": 5.19346230820547, "loss": 0.38386616110801697, "step": 15570 }, { "ce_loss": 0.05994442105293274, "epoch": 5.19346230820547, "step": 15570 }, { "distill_loss": 0.11905072629451752, "epoch": 5.19346230820547, "step": 15570 }, { "epoch": 5.19346230820547, "ref_ce_loss": 0.07005809992551804, "step": 15570 }, { "epoch": 5.196797865243496, "loss": 0.3038, "step": 15580 }, { "epoch": 5.196797865243496, "grad_norm": 1.919990062713623, "step": 15580 }, { "epoch": 5.196797865243496, "learning_rate": 8.666438104184091e-05, "step": 15580 }, { "epoch": 5.196797865243496, "loss": 0.29969045519828796, "step": 15580 }, { "ce_loss": 0.07858096808195114, "epoch": 5.196797865243496, "step": 15580 }, { "distill_loss": 0.10086540132761002, "epoch": 5.196797865243496, "step": 15580 }, { "epoch": 5.196797865243496, "ref_ce_loss": 0.052758295089006424, "step": 15580 }, { "epoch": 5.196797865243496, "loss": 0.3363777697086334, "step": 15580 }, { "ce_loss": 0.05518624559044838, "epoch": 5.196797865243496, "step": 15580 }, { "distill_loss": 0.11150319129228592, "epoch": 5.196797865243496, "step": 15580 }, { "epoch": 5.196797865243496, "ref_ce_loss": 0.07867610454559326, "step": 15580 }, { "epoch": 5.196797865243496, "loss": 0.24121202528476715, "step": 15580 }, { "ce_loss": 0.06729081273078918, "epoch": 5.196797865243496, "step": 15580 }, { "distill_loss": 0.08743242919445038, "epoch": 5.196797865243496, "step": 15580 }, { "epoch": 5.196797865243496, "ref_ce_loss": 0.032432373613119125, "step": 15580 }, { "epoch": 5.196797865243496, "loss": 0.23406265676021576, "step": 15580 }, { "ce_loss": 0.057209137827157974, "epoch": 5.196797865243496, "step": 15580 }, { "distill_loss": 0.07753507792949677, "epoch": 5.196797865243496, "step": 15580 }, { "epoch": 5.196797865243496, "ref_ce_loss": 0.07158780097961426, "step": 15580 }, { "epoch": 5.200133422281521, "loss": 0.2973, "step": 15590 }, { "epoch": 5.200133422281521, "grad_norm": 2.575624942779541, "step": 15590 }, { "epoch": 5.200133422281521, "learning_rate": 8.648081990154298e-05, "step": 15590 }, { "epoch": 5.200133422281521, "loss": 0.6412194967269897, "step": 15590 }, { "ce_loss": 0.105444997549057, "epoch": 5.200133422281521, "step": 15590 }, { "distill_loss": 0.11191614717245102, "epoch": 5.200133422281521, "step": 15590 }, { "epoch": 5.200133422281521, "ref_ce_loss": 0.10877172648906708, "step": 15590 }, { "epoch": 5.200133422281521, "loss": 0.2196560502052307, "step": 15590 }, { "ce_loss": 0.038214799016714096, "epoch": 5.200133422281521, "step": 15590 }, { "distill_loss": 0.1001727506518364, "epoch": 5.200133422281521, "step": 15590 }, { "epoch": 5.200133422281521, "ref_ce_loss": 0.06055865436792374, "step": 15590 }, { "epoch": 5.200133422281521, "loss": 0.22853578627109528, "step": 15590 }, { "ce_loss": 0.06929147988557816, "epoch": 5.200133422281521, "step": 15590 }, { "distill_loss": 0.0751098096370697, "epoch": 5.200133422281521, "step": 15590 }, { "epoch": 5.200133422281521, "ref_ce_loss": 0.08401620388031006, "step": 15590 }, { "epoch": 5.200133422281521, "loss": 0.2704339921474457, "step": 15590 }, { "ce_loss": 0.06760897487401962, "epoch": 5.200133422281521, "step": 15590 }, { "distill_loss": 0.12975336611270905, "epoch": 5.200133422281521, "step": 15590 }, { "epoch": 5.200133422281521, "ref_ce_loss": 0.07298162579536438, "step": 15590 }, { "epoch": 5.203468979319546, "loss": 0.3892, "step": 15600 }, { "epoch": 5.203468979319546, "grad_norm": 3.99639630317688, "step": 15600 }, { "epoch": 5.203468979319546, "learning_rate": 8.62973745951732e-05, "step": 15600 }, { "epoch": 5.203468979319546, "loss": 0.5640752911567688, "step": 15600 }, { "ce_loss": 0.06329736858606339, "epoch": 5.203468979319546, "step": 15600 }, { "distill_loss": 0.10339559614658356, "epoch": 5.203468979319546, "step": 15600 }, { "epoch": 5.203468979319546, "ref_ce_loss": 0.07652582228183746, "step": 15600 }, { "epoch": 5.203468979319546, "loss": 0.2741006016731262, "step": 15600 }, { "ce_loss": 0.039778295904397964, "epoch": 5.203468979319546, "step": 15600 }, { "distill_loss": 0.08627346158027649, "epoch": 5.203468979319546, "step": 15600 }, { "epoch": 5.203468979319546, "ref_ce_loss": 0.08227671682834625, "step": 15600 }, { "epoch": 5.203468979319546, "loss": 0.5446295142173767, "step": 15600 }, { "ce_loss": 0.06431914120912552, "epoch": 5.203468979319546, "step": 15600 }, { "distill_loss": 0.09512020647525787, "epoch": 5.203468979319546, "step": 15600 }, { "epoch": 5.203468979319546, "ref_ce_loss": 0.07713976502418518, "step": 15600 }, { "epoch": 5.203468979319546, "loss": 0.2804473340511322, "step": 15600 }, { "ce_loss": 0.022517312318086624, "epoch": 5.203468979319546, "step": 15600 }, { "distill_loss": 0.0781346783041954, "epoch": 5.203468979319546, "step": 15600 }, { "epoch": 5.203468979319546, "ref_ce_loss": 0.06672561168670654, "step": 15600 }, { "epoch": 5.206804536357572, "loss": 0.3379, "step": 15610 }, { "epoch": 5.206804536357572, "grad_norm": 3.4212775230407715, "step": 15610 }, { "epoch": 5.206804536357572, "learning_rate": 8.61140454572636e-05, "step": 15610 }, { "epoch": 5.206804536357572, "loss": 0.33479341864585876, "step": 15610 }, { "ce_loss": 0.036282870918512344, "epoch": 5.206804536357572, "step": 15610 }, { "distill_loss": 0.08696107566356659, "epoch": 5.206804536357572, "step": 15610 }, { "epoch": 5.206804536357572, "ref_ce_loss": 0.03990389034152031, "step": 15610 }, { "epoch": 5.206804536357572, "loss": 0.3305779695510864, "step": 15610 }, { "ce_loss": 0.0724162608385086, "epoch": 5.206804536357572, "step": 15610 }, { "distill_loss": 0.11297457665205002, "epoch": 5.206804536357572, "step": 15610 }, { "epoch": 5.206804536357572, "ref_ce_loss": 0.06067904457449913, "step": 15610 }, { "epoch": 5.206804536357572, "loss": 0.22460030019283295, "step": 15610 }, { "ce_loss": 0.023353662341833115, "epoch": 5.206804536357572, "step": 15610 }, { "distill_loss": 0.0965401828289032, "epoch": 5.206804536357572, "step": 15610 }, { "epoch": 5.206804536357572, "ref_ce_loss": 0.06605301052331924, "step": 15610 }, { "epoch": 5.206804536357572, "loss": 0.2593958377838135, "step": 15610 }, { "ce_loss": 0.03210054337978363, "epoch": 5.206804536357572, "step": 15610 }, { "distill_loss": 0.08811073005199432, "epoch": 5.206804536357572, "step": 15610 }, { "epoch": 5.206804536357572, "ref_ce_loss": 0.0475555881857872, "step": 15610 }, { "epoch": 5.210140093395597, "loss": 0.3564, "step": 15620 }, { "epoch": 5.210140093395597, "grad_norm": 2.2904813289642334, "step": 15620 }, { "epoch": 5.210140093395597, "learning_rate": 8.593083282213406e-05, "step": 15620 }, { "epoch": 5.210140093395597, "loss": 0.23321533203125, "step": 15620 }, { "ce_loss": 0.07071798294782639, "epoch": 5.210140093395597, "step": 15620 }, { "distill_loss": 0.09500987827777863, "epoch": 5.210140093395597, "step": 15620 }, { "epoch": 5.210140093395597, "ref_ce_loss": 0.0674041360616684, "step": 15620 }, { "epoch": 5.210140093395597, "loss": 0.30617424845695496, "step": 15620 }, { "ce_loss": 0.06755051016807556, "epoch": 5.210140093395597, "step": 15620 }, { "distill_loss": 0.10673744976520538, "epoch": 5.210140093395597, "step": 15620 }, { "epoch": 5.210140093395597, "ref_ce_loss": 0.09195166826248169, "step": 15620 }, { "epoch": 5.210140093395597, "loss": 0.4118172824382782, "step": 15620 }, { "ce_loss": 0.15275129675865173, "epoch": 5.210140093395597, "step": 15620 }, { "distill_loss": 0.13609610497951508, "epoch": 5.210140093395597, "step": 15620 }, { "epoch": 5.210140093395597, "ref_ce_loss": 0.0815357118844986, "step": 15620 }, { "epoch": 5.210140093395597, "loss": 0.44408926367759705, "step": 15620 }, { "ce_loss": 0.10387061536312103, "epoch": 5.210140093395597, "step": 15620 }, { "distill_loss": 0.08926932513713837, "epoch": 5.210140093395597, "step": 15620 }, { "epoch": 5.210140093395597, "ref_ce_loss": 0.052122075110673904, "step": 15620 }, { "epoch": 5.213475650433622, "loss": 0.3449, "step": 15630 }, { "epoch": 5.213475650433622, "grad_norm": 4.765140533447266, "step": 15630 }, { "epoch": 5.213475650433622, "learning_rate": 8.574773702389224e-05, "step": 15630 }, { "epoch": 5.213475650433622, "loss": 0.38619959354400635, "step": 15630 }, { "ce_loss": 0.07783541828393936, "epoch": 5.213475650433622, "step": 15630 }, { "distill_loss": 0.09005855023860931, "epoch": 5.213475650433622, "step": 15630 }, { "epoch": 5.213475650433622, "ref_ce_loss": 0.08827793598175049, "step": 15630 }, { "epoch": 5.213475650433622, "loss": 0.3454456925392151, "step": 15630 }, { "ce_loss": 0.08330843597650528, "epoch": 5.213475650433622, "step": 15630 }, { "distill_loss": 0.11718885600566864, "epoch": 5.213475650433622, "step": 15630 }, { "epoch": 5.213475650433622, "ref_ce_loss": 0.06938067078590393, "step": 15630 }, { "epoch": 5.213475650433622, "loss": 0.2988470792770386, "step": 15630 }, { "ce_loss": 0.06816891580820084, "epoch": 5.213475650433622, "step": 15630 }, { "distill_loss": 0.09233548492193222, "epoch": 5.213475650433622, "step": 15630 }, { "epoch": 5.213475650433622, "ref_ce_loss": 0.09514269977807999, "step": 15630 }, { "epoch": 5.213475650433622, "loss": 0.28821972012519836, "step": 15630 }, { "ce_loss": 0.06957666575908661, "epoch": 5.213475650433622, "step": 15630 }, { "distill_loss": 0.09795127809047699, "epoch": 5.213475650433622, "step": 15630 }, { "epoch": 5.213475650433622, "ref_ce_loss": 0.07120463997125626, "step": 15630 }, { "epoch": 5.216811207471648, "loss": 0.3606, "step": 15640 }, { "epoch": 5.216811207471648, "grad_norm": 4.76863956451416, "step": 15640 }, { "epoch": 5.216811207471648, "learning_rate": 8.556475839643263e-05, "step": 15640 }, { "epoch": 5.216811207471648, "loss": 0.43406257033348083, "step": 15640 }, { "ce_loss": 0.10813289135694504, "epoch": 5.216811207471648, "step": 15640 }, { "distill_loss": 0.13066282868385315, "epoch": 5.216811207471648, "step": 15640 }, { "epoch": 5.216811207471648, "ref_ce_loss": 0.09365534037351608, "step": 15640 }, { "epoch": 5.216811207471648, "loss": 0.34955552220344543, "step": 15640 }, { "ce_loss": 0.1082388311624527, "epoch": 5.216811207471648, "step": 15640 }, { "distill_loss": 0.11656052619218826, "epoch": 5.216811207471648, "step": 15640 }, { "epoch": 5.216811207471648, "ref_ce_loss": 0.09189897775650024, "step": 15640 }, { "epoch": 5.216811207471648, "loss": 0.2614520192146301, "step": 15640 }, { "ce_loss": 0.02691487781703472, "epoch": 5.216811207471648, "step": 15640 }, { "distill_loss": 0.11289530992507935, "epoch": 5.216811207471648, "step": 15640 }, { "epoch": 5.216811207471648, "ref_ce_loss": 0.0644138976931572, "step": 15640 }, { "epoch": 5.216811207471648, "loss": 0.6140693426132202, "step": 15640 }, { "ce_loss": 0.04645974934101105, "epoch": 5.216811207471648, "step": 15640 }, { "distill_loss": 0.11816476285457611, "epoch": 5.216811207471648, "step": 15640 }, { "epoch": 5.216811207471648, "ref_ce_loss": 0.07548999786376953, "step": 15640 }, { "epoch": 5.220146764509673, "loss": 0.3625, "step": 15650 }, { "epoch": 5.220146764509673, "grad_norm": 2.3993215560913086, "step": 15650 }, { "epoch": 5.220146764509673, "learning_rate": 8.538189727343607e-05, "step": 15650 }, { "epoch": 5.220146764509673, "loss": 0.2743743360042572, "step": 15650 }, { "ce_loss": 0.06477046012878418, "epoch": 5.220146764509673, "step": 15650 }, { "distill_loss": 0.09259688854217529, "epoch": 5.220146764509673, "step": 15650 }, { "epoch": 5.220146764509673, "ref_ce_loss": 0.05351939797401428, "step": 15650 }, { "epoch": 5.220146764509673, "loss": 0.21701106429100037, "step": 15650 }, { "ce_loss": 0.03109646402299404, "epoch": 5.220146764509673, "step": 15650 }, { "distill_loss": 0.0818566232919693, "epoch": 5.220146764509673, "step": 15650 }, { "epoch": 5.220146764509673, "ref_ce_loss": 0.06140496954321861, "step": 15650 }, { "epoch": 5.220146764509673, "loss": 0.29552048444747925, "step": 15650 }, { "ce_loss": 0.09033332020044327, "epoch": 5.220146764509673, "step": 15650 }, { "distill_loss": 0.09672441333532333, "epoch": 5.220146764509673, "step": 15650 }, { "epoch": 5.220146764509673, "ref_ce_loss": 0.08516726642847061, "step": 15650 }, { "epoch": 5.220146764509673, "loss": 0.21063679456710815, "step": 15650 }, { "ce_loss": 0.03232778608798981, "epoch": 5.220146764509673, "step": 15650 }, { "distill_loss": 0.09249082952737808, "epoch": 5.220146764509673, "step": 15650 }, { "epoch": 5.220146764509673, "ref_ce_loss": 0.06132323667407036, "step": 15650 }, { "epoch": 5.223482321547698, "loss": 0.3319, "step": 15660 }, { "epoch": 5.223482321547698, "grad_norm": 2.9076249599456787, "step": 15660 }, { "epoch": 5.223482321547698, "learning_rate": 8.519915398836912e-05, "step": 15660 }, { "epoch": 5.223482321547698, "loss": 0.24805200099945068, "step": 15660 }, { "ce_loss": 0.08938482403755188, "epoch": 5.223482321547698, "step": 15660 }, { "distill_loss": 0.0876886323094368, "epoch": 5.223482321547698, "step": 15660 }, { "epoch": 5.223482321547698, "ref_ce_loss": 0.07093243300914764, "step": 15660 }, { "epoch": 5.223482321547698, "loss": 0.2605586349964142, "step": 15660 }, { "ce_loss": 0.05598803982138634, "epoch": 5.223482321547698, "step": 15660 }, { "distill_loss": 0.10434643924236298, "epoch": 5.223482321547698, "step": 15660 }, { "epoch": 5.223482321547698, "ref_ce_loss": 0.07283356040716171, "step": 15660 }, { "epoch": 5.223482321547698, "loss": 0.4666854739189148, "step": 15660 }, { "ce_loss": 0.10625138133764267, "epoch": 5.223482321547698, "step": 15660 }, { "distill_loss": 0.12450964003801346, "epoch": 5.223482321547698, "step": 15660 }, { "epoch": 5.223482321547698, "ref_ce_loss": 0.06658724695444107, "step": 15660 }, { "epoch": 5.223482321547698, "loss": 0.20904622972011566, "step": 15660 }, { "ce_loss": 0.03591933846473694, "epoch": 5.223482321547698, "step": 15660 }, { "distill_loss": 0.0913221463561058, "epoch": 5.223482321547698, "step": 15660 }, { "epoch": 5.223482321547698, "ref_ce_loss": 0.08163376152515411, "step": 15660 }, { "epoch": 5.226817878585724, "loss": 0.3339, "step": 15670 }, { "epoch": 5.226817878585724, "grad_norm": 3.4926912784576416, "step": 15670 }, { "epoch": 5.226817878585724, "learning_rate": 8.501652887448354e-05, "step": 15670 }, { "epoch": 5.226817878585724, "loss": 0.3464776277542114, "step": 15670 }, { "ce_loss": 0.053136661648750305, "epoch": 5.226817878585724, "step": 15670 }, { "distill_loss": 0.13123872876167297, "epoch": 5.226817878585724, "step": 15670 }, { "epoch": 5.226817878585724, "ref_ce_loss": 0.09237124770879745, "step": 15670 }, { "epoch": 5.226817878585724, "loss": 0.4693766236305237, "step": 15670 }, { "ce_loss": 0.09558730572462082, "epoch": 5.226817878585724, "step": 15670 }, { "distill_loss": 0.10919008404016495, "epoch": 5.226817878585724, "step": 15670 }, { "epoch": 5.226817878585724, "ref_ce_loss": 0.08798684179782867, "step": 15670 }, { "epoch": 5.226817878585724, "loss": 0.8525584936141968, "step": 15670 }, { "ce_loss": 0.13166871666908264, "epoch": 5.226817878585724, "step": 15670 }, { "distill_loss": 0.12148617953062057, "epoch": 5.226817878585724, "step": 15670 }, { "epoch": 5.226817878585724, "ref_ce_loss": 0.08422347158193588, "step": 15670 }, { "epoch": 5.226817878585724, "loss": 0.5308492183685303, "step": 15670 }, { "ce_loss": 0.12234484404325485, "epoch": 5.226817878585724, "step": 15670 }, { "distill_loss": 0.1193431094288826, "epoch": 5.226817878585724, "step": 15670 }, { "epoch": 5.226817878585724, "ref_ce_loss": 0.0752011314034462, "step": 15670 }, { "epoch": 5.230153435623749, "loss": 0.3824, "step": 15680 }, { "epoch": 5.230153435623749, "grad_norm": 1.839232087135315, "step": 15680 }, { "epoch": 5.230153435623749, "learning_rate": 8.483402226481531e-05, "step": 15680 }, { "epoch": 5.230153435623749, "loss": 0.9070611000061035, "step": 15680 }, { "ce_loss": 0.10845021158456802, "epoch": 5.230153435623749, "step": 15680 }, { "distill_loss": 0.15904279053211212, "epoch": 5.230153435623749, "step": 15680 }, { "epoch": 5.230153435623749, "ref_ce_loss": 0.10060586035251617, "step": 15680 }, { "epoch": 5.230153435623749, "loss": 0.20827336609363556, "step": 15680 }, { "ce_loss": 0.0360378660261631, "epoch": 5.230153435623749, "step": 15680 }, { "distill_loss": 0.08658089488744736, "epoch": 5.230153435623749, "step": 15680 }, { "epoch": 5.230153435623749, "ref_ce_loss": 0.0515529103577137, "step": 15680 }, { "epoch": 5.230153435623749, "loss": 0.3215510845184326, "step": 15680 }, { "ce_loss": 0.0539592020213604, "epoch": 5.230153435623749, "step": 15680 }, { "distill_loss": 0.13022923469543457, "epoch": 5.230153435623749, "step": 15680 }, { "epoch": 5.230153435623749, "ref_ce_loss": 0.0769435316324234, "step": 15680 }, { "epoch": 5.230153435623749, "loss": 0.17728765308856964, "step": 15680 }, { "ce_loss": 0.020045332610607147, "epoch": 5.230153435623749, "step": 15680 }, { "distill_loss": 0.072093665599823, "epoch": 5.230153435623749, "step": 15680 }, { "epoch": 5.230153435623749, "ref_ce_loss": 0.06019889935851097, "step": 15680 }, { "epoch": 5.2334889926617745, "loss": 0.3081, "step": 15690 }, { "epoch": 5.2334889926617745, "grad_norm": 3.0346570014953613, "step": 15690 }, { "epoch": 5.2334889926617745, "learning_rate": 8.46516344921846e-05, "step": 15690 }, { "epoch": 5.2334889926617745, "loss": 0.27006831765174866, "step": 15690 }, { "ce_loss": 0.0540575347840786, "epoch": 5.2334889926617745, "step": 15690 }, { "distill_loss": 0.11163632571697235, "epoch": 5.2334889926617745, "step": 15690 }, { "epoch": 5.2334889926617745, "ref_ce_loss": 0.07398149371147156, "step": 15690 }, { "epoch": 5.2334889926617745, "loss": 0.25738635659217834, "step": 15690 }, { "ce_loss": 0.09728316217660904, "epoch": 5.2334889926617745, "step": 15690 }, { "distill_loss": 0.10439582914113998, "epoch": 5.2334889926617745, "step": 15690 }, { "epoch": 5.2334889926617745, "ref_ce_loss": 0.05562935024499893, "step": 15690 }, { "epoch": 5.2334889926617745, "loss": 0.32098954916000366, "step": 15690 }, { "ce_loss": 0.04642213135957718, "epoch": 5.2334889926617745, "step": 15690 }, { "distill_loss": 0.07827766239643097, "epoch": 5.2334889926617745, "step": 15690 }, { "epoch": 5.2334889926617745, "ref_ce_loss": 0.07306087762117386, "step": 15690 }, { "epoch": 5.2334889926617745, "loss": 0.34884780645370483, "step": 15690 }, { "ce_loss": 0.04866236820816994, "epoch": 5.2334889926617745, "step": 15690 }, { "distill_loss": 0.11488591134548187, "epoch": 5.2334889926617745, "step": 15690 }, { "epoch": 5.2334889926617745, "ref_ce_loss": 0.06572486460208893, "step": 15690 }, { "epoch": 5.2368245496998, "loss": 0.3341, "step": 15700 }, { "epoch": 5.2368245496998, "grad_norm": 1.6081671714782715, "step": 15700 }, { "epoch": 5.2368245496998, "learning_rate": 8.446936588919475e-05, "step": 15700 }, { "epoch": 5.2368245496998, "loss": 0.3010448217391968, "step": 15700 }, { "ce_loss": 0.08089329302310944, "epoch": 5.2368245496998, "step": 15700 }, { "distill_loss": 0.09048581123352051, "epoch": 5.2368245496998, "step": 15700 }, { "epoch": 5.2368245496998, "ref_ce_loss": 0.07929427921772003, "step": 15700 }, { "epoch": 5.2368245496998, "loss": 0.35104861855506897, "step": 15700 }, { "ce_loss": 0.11544732749462128, "epoch": 5.2368245496998, "step": 15700 }, { "distill_loss": 0.13719846308231354, "epoch": 5.2368245496998, "step": 15700 }, { "epoch": 5.2368245496998, "ref_ce_loss": 0.08232993632555008, "step": 15700 }, { "epoch": 5.2368245496998, "loss": 0.2056066393852234, "step": 15700 }, { "ce_loss": 0.04238485172390938, "epoch": 5.2368245496998, "step": 15700 }, { "distill_loss": 0.08624627441167831, "epoch": 5.2368245496998, "step": 15700 }, { "epoch": 5.2368245496998, "ref_ce_loss": 0.04173652455210686, "step": 15700 }, { "epoch": 5.2368245496998, "loss": 0.24264144897460938, "step": 15700 }, { "ce_loss": 0.03431288152933121, "epoch": 5.2368245496998, "step": 15700 }, { "distill_loss": 0.10987095534801483, "epoch": 5.2368245496998, "step": 15700 }, { "epoch": 5.2368245496998, "ref_ce_loss": 0.0590670071542263, "step": 15700 }, { "epoch": 5.240160106737825, "loss": 0.305, "step": 15710 }, { "epoch": 5.240160106737825, "grad_norm": 1.7107326984405518, "step": 15710 }, { "epoch": 5.240160106737825, "learning_rate": 8.428721678823178e-05, "step": 15710 }, { "epoch": 5.240160106737825, "loss": 0.22281035780906677, "step": 15710 }, { "ce_loss": 0.01576351746916771, "epoch": 5.240160106737825, "step": 15710 }, { "distill_loss": 0.08575361967086792, "epoch": 5.240160106737825, "step": 15710 }, { "epoch": 5.240160106737825, "ref_ce_loss": 0.06620697677135468, "step": 15710 }, { "epoch": 5.240160106737825, "loss": 0.2195720225572586, "step": 15710 }, { "ce_loss": 0.032049402594566345, "epoch": 5.240160106737825, "step": 15710 }, { "distill_loss": 0.12723985314369202, "epoch": 5.240160106737825, "step": 15710 }, { "epoch": 5.240160106737825, "ref_ce_loss": 0.06024665758013725, "step": 15710 }, { "epoch": 5.240160106737825, "loss": 0.2883130609989166, "step": 15710 }, { "ce_loss": 0.06870332360267639, "epoch": 5.240160106737825, "step": 15710 }, { "distill_loss": 0.11934572458267212, "epoch": 5.240160106737825, "step": 15710 }, { "epoch": 5.240160106737825, "ref_ce_loss": 0.10023310780525208, "step": 15710 }, { "epoch": 5.240160106737825, "loss": 0.28461188077926636, "step": 15710 }, { "ce_loss": 0.08343104273080826, "epoch": 5.240160106737825, "step": 15710 }, { "distill_loss": 0.11157909035682678, "epoch": 5.240160106737825, "step": 15710 }, { "epoch": 5.240160106737825, "ref_ce_loss": 0.08946184813976288, "step": 15710 }, { "epoch": 5.2434956637758505, "loss": 0.3315, "step": 15720 }, { "epoch": 5.2434956637758505, "grad_norm": 3.451080322265625, "step": 15720 }, { "epoch": 5.2434956637758505, "learning_rate": 8.41051875214639e-05, "step": 15720 }, { "epoch": 5.2434956637758505, "loss": 0.361646831035614, "step": 15720 }, { "ce_loss": 0.10292549431324005, "epoch": 5.2434956637758505, "step": 15720 }, { "distill_loss": 0.11092507839202881, "epoch": 5.2434956637758505, "step": 15720 }, { "epoch": 5.2434956637758505, "ref_ce_loss": 0.07696814090013504, "step": 15720 }, { "epoch": 5.2434956637758505, "loss": 0.22763587534427643, "step": 15720 }, { "ce_loss": 0.044729799032211304, "epoch": 5.2434956637758505, "step": 15720 }, { "distill_loss": 0.08984865248203278, "epoch": 5.2434956637758505, "step": 15720 }, { "epoch": 5.2434956637758505, "ref_ce_loss": 0.05932828038930893, "step": 15720 }, { "epoch": 5.2434956637758505, "loss": 0.21750256419181824, "step": 15720 }, { "ce_loss": 0.0500781424343586, "epoch": 5.2434956637758505, "step": 15720 }, { "distill_loss": 0.08333169668912888, "epoch": 5.2434956637758505, "step": 15720 }, { "epoch": 5.2434956637758505, "ref_ce_loss": 0.07430606335401535, "step": 15720 }, { "epoch": 5.2434956637758505, "loss": 0.3150693476200104, "step": 15720 }, { "ce_loss": 0.06628770381212234, "epoch": 5.2434956637758505, "step": 15720 }, { "distill_loss": 0.09029305726289749, "epoch": 5.2434956637758505, "step": 15720 }, { "epoch": 5.2434956637758505, "ref_ce_loss": 0.08698195964097977, "step": 15720 }, { "epoch": 5.246831220813876, "loss": 0.3561, "step": 15730 }, { "epoch": 5.246831220813876, "grad_norm": 4.539619445800781, "step": 15730 }, { "epoch": 5.246831220813876, "learning_rate": 8.392327842084052e-05, "step": 15730 }, { "epoch": 5.246831220813876, "loss": 0.31229639053344727, "step": 15730 }, { "ce_loss": 0.0605168417096138, "epoch": 5.246831220813876, "step": 15730 }, { "distill_loss": 0.12786982953548431, "epoch": 5.246831220813876, "step": 15730 }, { "epoch": 5.246831220813876, "ref_ce_loss": 0.1003168374300003, "step": 15730 }, { "epoch": 5.246831220813876, "loss": 0.3192044794559479, "step": 15730 }, { "ce_loss": 0.10276396572589874, "epoch": 5.246831220813876, "step": 15730 }, { "distill_loss": 0.12135171890258789, "epoch": 5.246831220813876, "step": 15730 }, { "epoch": 5.246831220813876, "ref_ce_loss": 0.06497390568256378, "step": 15730 }, { "epoch": 5.246831220813876, "loss": 0.37759923934936523, "step": 15730 }, { "ce_loss": 0.07716540992259979, "epoch": 5.246831220813876, "step": 15730 }, { "distill_loss": 0.0948992669582367, "epoch": 5.246831220813876, "step": 15730 }, { "epoch": 5.246831220813876, "ref_ce_loss": 0.056300580501556396, "step": 15730 }, { "epoch": 5.246831220813876, "loss": 0.22098159790039062, "step": 15730 }, { "ce_loss": 0.04677729308605194, "epoch": 5.246831220813876, "step": 15730 }, { "distill_loss": 0.11466146260499954, "epoch": 5.246831220813876, "step": 15730 }, { "epoch": 5.246831220813876, "ref_ce_loss": 0.05951519310474396, "step": 15730 }, { "epoch": 5.250166777851901, "loss": 0.3294, "step": 15740 }, { "epoch": 5.250166777851901, "grad_norm": 4.578763008117676, "step": 15740 }, { "epoch": 5.250166777851901, "learning_rate": 8.374148981809216e-05, "step": 15740 }, { "epoch": 5.250166777851901, "loss": 0.2728346884250641, "step": 15740 }, { "ce_loss": 0.06905033439397812, "epoch": 5.250166777851901, "step": 15740 }, { "distill_loss": 0.09745746850967407, "epoch": 5.250166777851901, "step": 15740 }, { "epoch": 5.250166777851901, "ref_ce_loss": 0.08488545566797256, "step": 15740 }, { "epoch": 5.250166777851901, "loss": 0.4803808927536011, "step": 15740 }, { "ce_loss": 0.022304605692625046, "epoch": 5.250166777851901, "step": 15740 }, { "distill_loss": 0.10700996965169907, "epoch": 5.250166777851901, "step": 15740 }, { "epoch": 5.250166777851901, "ref_ce_loss": 0.04373454675078392, "step": 15740 }, { "epoch": 5.250166777851901, "loss": 0.25218603014945984, "step": 15740 }, { "ce_loss": 0.10156643390655518, "epoch": 5.250166777851901, "step": 15740 }, { "distill_loss": 0.09463807940483093, "epoch": 5.250166777851901, "step": 15740 }, { "epoch": 5.250166777851901, "ref_ce_loss": 0.040142521262168884, "step": 15740 }, { "epoch": 5.250166777851901, "loss": 0.309573769569397, "step": 15740 }, { "ce_loss": 0.08238134533166885, "epoch": 5.250166777851901, "step": 15740 }, { "distill_loss": 0.10327878594398499, "epoch": 5.250166777851901, "step": 15740 }, { "epoch": 5.250166777851901, "ref_ce_loss": 0.09019621461629868, "step": 15740 }, { "epoch": 5.253502334889927, "loss": 0.3562, "step": 15750 }, { "epoch": 5.253502334889927, "grad_norm": 2.2583131790161133, "step": 15750 }, { "epoch": 5.253502334889927, "learning_rate": 8.355982204472953e-05, "step": 15750 }, { "epoch": 5.253502334889927, "loss": 0.4027312695980072, "step": 15750 }, { "ce_loss": 0.057967305183410645, "epoch": 5.253502334889927, "step": 15750 }, { "distill_loss": 0.12211738526821136, "epoch": 5.253502334889927, "step": 15750 }, { "epoch": 5.253502334889927, "ref_ce_loss": 0.051287103444337845, "step": 15750 }, { "epoch": 5.253502334889927, "loss": 0.25625723600387573, "step": 15750 }, { "ce_loss": 0.061924465000629425, "epoch": 5.253502334889927, "step": 15750 }, { "distill_loss": 0.1128692552447319, "epoch": 5.253502334889927, "step": 15750 }, { "epoch": 5.253502334889927, "ref_ce_loss": 0.046483397483825684, "step": 15750 }, { "epoch": 5.253502334889927, "loss": 0.4449954330921173, "step": 15750 }, { "ce_loss": 0.026363397017121315, "epoch": 5.253502334889927, "step": 15750 }, { "distill_loss": 0.12303638458251953, "epoch": 5.253502334889927, "step": 15750 }, { "epoch": 5.253502334889927, "ref_ce_loss": 0.08882566541433334, "step": 15750 }, { "epoch": 5.253502334889927, "loss": 0.8488062024116516, "step": 15750 }, { "ce_loss": 0.07099393010139465, "epoch": 5.253502334889927, "step": 15750 }, { "distill_loss": 0.10903570055961609, "epoch": 5.253502334889927, "step": 15750 }, { "epoch": 5.253502334889927, "ref_ce_loss": 0.08942622691392899, "step": 15750 }, { "epoch": 5.256837891927952, "loss": 0.3294, "step": 15760 }, { "epoch": 5.256837891927952, "grad_norm": 2.5139570236206055, "step": 15760 }, { "epoch": 5.256837891927952, "learning_rate": 8.337827543204296e-05, "step": 15760 }, { "epoch": 5.256837891927952, "loss": 0.24721331894397736, "step": 15760 }, { "ce_loss": 0.04199664294719696, "epoch": 5.256837891927952, "step": 15760 }, { "distill_loss": 0.11916942894458771, "epoch": 5.256837891927952, "step": 15760 }, { "epoch": 5.256837891927952, "ref_ce_loss": 0.05043063685297966, "step": 15760 }, { "epoch": 5.256837891927952, "loss": 0.2464386373758316, "step": 15760 }, { "ce_loss": 0.029941368848085403, "epoch": 5.256837891927952, "step": 15760 }, { "distill_loss": 0.09624037146568298, "epoch": 5.256837891927952, "step": 15760 }, { "epoch": 5.256837891927952, "ref_ce_loss": 0.08408761769533157, "step": 15760 }, { "epoch": 5.256837891927952, "loss": 0.3620655834674835, "step": 15760 }, { "ce_loss": 0.11664766073226929, "epoch": 5.256837891927952, "step": 15760 }, { "distill_loss": 0.12865331768989563, "epoch": 5.256837891927952, "step": 15760 }, { "epoch": 5.256837891927952, "ref_ce_loss": 0.06753537803888321, "step": 15760 }, { "epoch": 5.256837891927952, "loss": 0.16280080378055573, "step": 15760 }, { "ce_loss": 0.030394496396183968, "epoch": 5.256837891927952, "step": 15760 }, { "distill_loss": 0.08359411358833313, "epoch": 5.256837891927952, "step": 15760 }, { "epoch": 5.256837891927952, "ref_ce_loss": 0.024733874946832657, "step": 15760 }, { "epoch": 5.260173448965977, "loss": 0.3624, "step": 15770 }, { "epoch": 5.260173448965977, "grad_norm": 2.7002289295196533, "step": 15770 }, { "epoch": 5.260173448965977, "learning_rate": 8.319685031110196e-05, "step": 15770 }, { "epoch": 5.260173448965977, "loss": 0.20755645632743835, "step": 15770 }, { "ce_loss": 0.007387253921478987, "epoch": 5.260173448965977, "step": 15770 }, { "distill_loss": 0.08905821293592453, "epoch": 5.260173448965977, "step": 15770 }, { "epoch": 5.260173448965977, "ref_ce_loss": 0.06709232181310654, "step": 15770 }, { "epoch": 5.260173448965977, "loss": 0.38863542675971985, "step": 15770 }, { "ce_loss": 0.1310935616493225, "epoch": 5.260173448965977, "step": 15770 }, { "distill_loss": 0.15545101463794708, "epoch": 5.260173448965977, "step": 15770 }, { "epoch": 5.260173448965977, "ref_ce_loss": 0.08572543412446976, "step": 15770 }, { "epoch": 5.260173448965977, "loss": 0.3048025965690613, "step": 15770 }, { "ce_loss": 0.029002025723457336, "epoch": 5.260173448965977, "step": 15770 }, { "distill_loss": 0.09924504905939102, "epoch": 5.260173448965977, "step": 15770 }, { "epoch": 5.260173448965977, "ref_ce_loss": 0.08231877535581589, "step": 15770 }, { "epoch": 5.260173448965977, "loss": 0.4077030122280121, "step": 15770 }, { "ce_loss": 0.07996194064617157, "epoch": 5.260173448965977, "step": 15770 }, { "distill_loss": 0.16033872961997986, "epoch": 5.260173448965977, "step": 15770 }, { "epoch": 5.260173448965977, "ref_ce_loss": 0.0761852115392685, "step": 15770 }, { "epoch": 5.263509006004003, "loss": 0.3434, "step": 15780 }, { "epoch": 5.263509006004003, "grad_norm": 3.2176711559295654, "step": 15780 }, { "epoch": 5.263509006004003, "learning_rate": 8.301554701275423e-05, "step": 15780 }, { "epoch": 5.263509006004003, "loss": 0.3810529410839081, "step": 15780 }, { "ce_loss": 0.1390722543001175, "epoch": 5.263509006004003, "step": 15780 }, { "distill_loss": 0.12283456325531006, "epoch": 5.263509006004003, "step": 15780 }, { "epoch": 5.263509006004003, "ref_ce_loss": 0.0769791379570961, "step": 15780 }, { "epoch": 5.263509006004003, "loss": 0.42334967851638794, "step": 15780 }, { "ce_loss": 0.17356954514980316, "epoch": 5.263509006004003, "step": 15780 }, { "distill_loss": 0.186984121799469, "epoch": 5.263509006004003, "step": 15780 }, { "epoch": 5.263509006004003, "ref_ce_loss": 0.06272926926612854, "step": 15780 }, { "epoch": 5.263509006004003, "loss": 0.28403088450431824, "step": 15780 }, { "ce_loss": 0.0569252148270607, "epoch": 5.263509006004003, "step": 15780 }, { "distill_loss": 0.13977956771850586, "epoch": 5.263509006004003, "step": 15780 }, { "epoch": 5.263509006004003, "ref_ce_loss": 0.05604541301727295, "step": 15780 }, { "epoch": 5.263509006004003, "loss": 0.5414626598358154, "step": 15780 }, { "ce_loss": 0.07642010599374771, "epoch": 5.263509006004003, "step": 15780 }, { "distill_loss": 0.116451695561409, "epoch": 5.263509006004003, "step": 15780 }, { "epoch": 5.263509006004003, "ref_ce_loss": 0.03948172554373741, "step": 15780 }, { "epoch": 5.266844563042028, "loss": 0.3594, "step": 15790 }, { "epoch": 5.266844563042028, "grad_norm": 3.9597408771514893, "step": 15790 }, { "epoch": 5.266844563042028, "learning_rate": 8.283436586762556e-05, "step": 15790 }, { "epoch": 5.266844563042028, "loss": 0.3665906488895416, "step": 15790 }, { "ce_loss": 0.09277671575546265, "epoch": 5.266844563042028, "step": 15790 }, { "distill_loss": 0.15249072015285492, "epoch": 5.266844563042028, "step": 15790 }, { "epoch": 5.266844563042028, "ref_ce_loss": 0.10048972815275192, "step": 15790 }, { "epoch": 5.266844563042028, "loss": 0.2455630600452423, "step": 15790 }, { "ce_loss": 0.0487009696662426, "epoch": 5.266844563042028, "step": 15790 }, { "distill_loss": 0.092554971575737, "epoch": 5.266844563042028, "step": 15790 }, { "epoch": 5.266844563042028, "ref_ce_loss": 0.050037186592817307, "step": 15790 }, { "epoch": 5.266844563042028, "loss": 0.3036814033985138, "step": 15790 }, { "ce_loss": 0.0706629678606987, "epoch": 5.266844563042028, "step": 15790 }, { "distill_loss": 0.12567178905010223, "epoch": 5.266844563042028, "step": 15790 }, { "epoch": 5.266844563042028, "ref_ce_loss": 0.07737930864095688, "step": 15790 }, { "epoch": 5.266844563042028, "loss": 0.2579875886440277, "step": 15790 }, { "ce_loss": 0.026779716834425926, "epoch": 5.266844563042028, "step": 15790 }, { "distill_loss": 0.09495803713798523, "epoch": 5.266844563042028, "step": 15790 }, { "epoch": 5.266844563042028, "ref_ce_loss": 0.08277291059494019, "step": 15790 }, { "epoch": 5.270180120080053, "loss": 0.3435, "step": 15800 }, { "epoch": 5.270180120080053, "grad_norm": 2.0573623180389404, "step": 15800 }, { "epoch": 5.270180120080053, "learning_rate": 8.265330720611883e-05, "step": 15800 }, { "epoch": 5.270180120080053, "loss": 0.3189752697944641, "step": 15800 }, { "ce_loss": 0.08407336473464966, "epoch": 5.270180120080053, "step": 15800 }, { "distill_loss": 0.11684277653694153, "epoch": 5.270180120080053, "step": 15800 }, { "epoch": 5.270180120080053, "ref_ce_loss": 0.08707991987466812, "step": 15800 }, { "epoch": 5.270180120080053, "loss": 0.21804724633693695, "step": 15800 }, { "ce_loss": 0.03380461037158966, "epoch": 5.270180120080053, "step": 15800 }, { "distill_loss": 0.10913065075874329, "epoch": 5.270180120080053, "step": 15800 }, { "epoch": 5.270180120080053, "ref_ce_loss": 0.056957922875881195, "step": 15800 }, { "epoch": 5.270180120080053, "loss": 0.33023056387901306, "step": 15800 }, { "ce_loss": 0.048063404858112335, "epoch": 5.270180120080053, "step": 15800 }, { "distill_loss": 0.11467498540878296, "epoch": 5.270180120080053, "step": 15800 }, { "epoch": 5.270180120080053, "ref_ce_loss": 0.06214866414666176, "step": 15800 }, { "epoch": 5.270180120080053, "loss": 0.2651209831237793, "step": 15800 }, { "ce_loss": 0.06443707644939423, "epoch": 5.270180120080053, "step": 15800 }, { "distill_loss": 0.11779454350471497, "epoch": 5.270180120080053, "step": 15800 }, { "epoch": 5.270180120080053, "ref_ce_loss": 0.06641155481338501, "step": 15800 }, { "epoch": 5.273515677118079, "loss": 0.3767, "step": 15810 }, { "epoch": 5.273515677118079, "grad_norm": 3.061605215072632, "step": 15810 }, { "epoch": 5.273515677118079, "learning_rate": 8.247237135841367e-05, "step": 15810 }, { "epoch": 5.273515677118079, "loss": 0.30033445358276367, "step": 15810 }, { "ce_loss": 0.07328201830387115, "epoch": 5.273515677118079, "step": 15810 }, { "distill_loss": 0.10676218569278717, "epoch": 5.273515677118079, "step": 15810 }, { "epoch": 5.273515677118079, "ref_ce_loss": 0.0811660885810852, "step": 15810 }, { "epoch": 5.273515677118079, "loss": 0.28703418374061584, "step": 15810 }, { "ce_loss": 0.04377947747707367, "epoch": 5.273515677118079, "step": 15810 }, { "distill_loss": 0.1450316160917282, "epoch": 5.273515677118079, "step": 15810 }, { "epoch": 5.273515677118079, "ref_ce_loss": 0.05514253303408623, "step": 15810 }, { "epoch": 5.273515677118079, "loss": 0.35974201560020447, "step": 15810 }, { "ce_loss": 0.04705319181084633, "epoch": 5.273515677118079, "step": 15810 }, { "distill_loss": 0.11349070072174072, "epoch": 5.273515677118079, "step": 15810 }, { "epoch": 5.273515677118079, "ref_ce_loss": 0.055691562592983246, "step": 15810 }, { "epoch": 5.273515677118079, "loss": 0.31948322057724, "step": 15810 }, { "ce_loss": 0.04331135377287865, "epoch": 5.273515677118079, "step": 15810 }, { "distill_loss": 0.1292535811662674, "epoch": 5.273515677118079, "step": 15810 }, { "epoch": 5.273515677118079, "ref_ce_loss": 0.08645766228437424, "step": 15810 }, { "epoch": 5.276851234156104, "loss": 0.366, "step": 15820 }, { "epoch": 5.276851234156104, "grad_norm": 2.201096534729004, "step": 15820 }, { "epoch": 5.276851234156104, "learning_rate": 8.229155865446575e-05, "step": 15820 }, { "epoch": 5.276851234156104, "loss": 0.32561996579170227, "step": 15820 }, { "ce_loss": 0.1004878580570221, "epoch": 5.276851234156104, "step": 15820 }, { "distill_loss": 0.09116347879171371, "epoch": 5.276851234156104, "step": 15820 }, { "epoch": 5.276851234156104, "ref_ce_loss": 0.07758591324090958, "step": 15820 }, { "epoch": 5.276851234156104, "loss": 0.19084353744983673, "step": 15820 }, { "ce_loss": 0.05388427525758743, "epoch": 5.276851234156104, "step": 15820 }, { "distill_loss": 0.08523177355527878, "epoch": 5.276851234156104, "step": 15820 }, { "epoch": 5.276851234156104, "ref_ce_loss": 0.05162469670176506, "step": 15820 }, { "epoch": 5.276851234156104, "loss": 0.34104123711586, "step": 15820 }, { "ce_loss": 0.11164067685604095, "epoch": 5.276851234156104, "step": 15820 }, { "distill_loss": 0.11855912208557129, "epoch": 5.276851234156104, "step": 15820 }, { "epoch": 5.276851234156104, "ref_ce_loss": 0.1106509119272232, "step": 15820 }, { "epoch": 5.276851234156104, "loss": 0.23398077487945557, "step": 15820 }, { "ce_loss": 0.08900479227304459, "epoch": 5.276851234156104, "step": 15820 }, { "distill_loss": 0.07735835760831833, "epoch": 5.276851234156104, "step": 15820 }, { "epoch": 5.276851234156104, "ref_ce_loss": 0.05137255787849426, "step": 15820 }, { "epoch": 5.280186791194129, "loss": 0.3493, "step": 15830 }, { "epoch": 5.280186791194129, "grad_norm": 2.384784460067749, "step": 15830 }, { "epoch": 5.280186791194129, "learning_rate": 8.211086942400596e-05, "step": 15830 }, { "epoch": 5.280186791194129, "loss": 0.23829659819602966, "step": 15830 }, { "ce_loss": 0.04226939007639885, "epoch": 5.280186791194129, "step": 15830 }, { "distill_loss": 0.10614483058452606, "epoch": 5.280186791194129, "step": 15830 }, { "epoch": 5.280186791194129, "ref_ce_loss": 0.06560316681861877, "step": 15830 }, { "epoch": 5.280186791194129, "loss": 0.2921256124973297, "step": 15830 }, { "ce_loss": 0.05392614006996155, "epoch": 5.280186791194129, "step": 15830 }, { "distill_loss": 0.1369142383337021, "epoch": 5.280186791194129, "step": 15830 }, { "epoch": 5.280186791194129, "ref_ce_loss": 0.05423854663968086, "step": 15830 }, { "epoch": 5.280186791194129, "loss": 0.2778950035572052, "step": 15830 }, { "ce_loss": 0.11162376403808594, "epoch": 5.280186791194129, "step": 15830 }, { "distill_loss": 0.08147963136434555, "epoch": 5.280186791194129, "step": 15830 }, { "epoch": 5.280186791194129, "ref_ce_loss": 0.0669204592704773, "step": 15830 }, { "epoch": 5.280186791194129, "loss": 0.3342251181602478, "step": 15830 }, { "ce_loss": 0.06660055369138718, "epoch": 5.280186791194129, "step": 15830 }, { "distill_loss": 0.14492513239383698, "epoch": 5.280186791194129, "step": 15830 }, { "epoch": 5.280186791194129, "ref_ce_loss": 0.09336922317743301, "step": 15830 }, { "epoch": 5.283522348232155, "loss": 0.3306, "step": 15840 }, { "epoch": 5.283522348232155, "grad_norm": 2.8796446323394775, "step": 15840 }, { "epoch": 5.283522348232155, "learning_rate": 8.193030399654027e-05, "step": 15840 }, { "epoch": 5.283522348232155, "loss": 0.4431094527244568, "step": 15840 }, { "ce_loss": 0.11614301055669785, "epoch": 5.283522348232155, "step": 15840 }, { "distill_loss": 0.09896580129861832, "epoch": 5.283522348232155, "step": 15840 }, { "epoch": 5.283522348232155, "ref_ce_loss": 0.06625843048095703, "step": 15840 }, { "epoch": 5.283522348232155, "loss": 0.2803962528705597, "step": 15840 }, { "ce_loss": 0.061902016401290894, "epoch": 5.283522348232155, "step": 15840 }, { "distill_loss": 0.1138758510351181, "epoch": 5.283522348232155, "step": 15840 }, { "epoch": 5.283522348232155, "ref_ce_loss": 0.06861334294080734, "step": 15840 }, { "epoch": 5.283522348232155, "loss": 0.18780617415905, "step": 15840 }, { "ce_loss": 0.03424258530139923, "epoch": 5.283522348232155, "step": 15840 }, { "distill_loss": 0.09116180986166, "epoch": 5.283522348232155, "step": 15840 }, { "epoch": 5.283522348232155, "ref_ce_loss": 0.04498676210641861, "step": 15840 }, { "epoch": 5.283522348232155, "loss": 0.36882030963897705, "step": 15840 }, { "ce_loss": 0.08330081403255463, "epoch": 5.283522348232155, "step": 15840 }, { "distill_loss": 0.15934601426124573, "epoch": 5.283522348232155, "step": 15840 }, { "epoch": 5.283522348232155, "ref_ce_loss": 0.08243943750858307, "step": 15840 }, { "epoch": 5.28685790527018, "loss": 0.3278, "step": 15850 }, { "epoch": 5.28685790527018, "grad_norm": 2.047136068344116, "step": 15850 }, { "epoch": 5.28685790527018, "learning_rate": 8.174986270134887e-05, "step": 15850 }, { "epoch": 5.28685790527018, "loss": 0.2891218066215515, "step": 15850 }, { "ce_loss": 0.03525947779417038, "epoch": 5.28685790527018, "step": 15850 }, { "distill_loss": 0.1262560933828354, "epoch": 5.28685790527018, "step": 15850 }, { "epoch": 5.28685790527018, "ref_ce_loss": 0.08171357214450836, "step": 15850 }, { "epoch": 5.28685790527018, "loss": 0.31089332699775696, "step": 15850 }, { "ce_loss": 0.05321585014462471, "epoch": 5.28685790527018, "step": 15850 }, { "distill_loss": 0.12867404520511627, "epoch": 5.28685790527018, "step": 15850 }, { "epoch": 5.28685790527018, "ref_ce_loss": 0.09286804497241974, "step": 15850 }, { "epoch": 5.28685790527018, "loss": 0.2964179515838623, "step": 15850 }, { "ce_loss": 0.07718577980995178, "epoch": 5.28685790527018, "step": 15850 }, { "distill_loss": 0.12015065550804138, "epoch": 5.28685790527018, "step": 15850 }, { "epoch": 5.28685790527018, "ref_ce_loss": 0.07145271450281143, "step": 15850 }, { "epoch": 5.28685790527018, "loss": 0.19068409502506256, "step": 15850 }, { "ce_loss": 0.04036962613463402, "epoch": 5.28685790527018, "step": 15850 }, { "distill_loss": 0.10206376016139984, "epoch": 5.28685790527018, "step": 15850 }, { "epoch": 5.28685790527018, "ref_ce_loss": 0.04821521416306496, "step": 15850 }, { "epoch": 5.290193462308205, "loss": 0.3316, "step": 15860 }, { "epoch": 5.290193462308205, "grad_norm": 1.8144029378890991, "step": 15860 }, { "epoch": 5.290193462308205, "learning_rate": 8.156954586748528e-05, "step": 15860 }, { "epoch": 5.290193462308205, "loss": 0.3066614270210266, "step": 15860 }, { "ce_loss": 0.06004337593913078, "epoch": 5.290193462308205, "step": 15860 }, { "distill_loss": 0.13541190326213837, "epoch": 5.290193462308205, "step": 15860 }, { "epoch": 5.290193462308205, "ref_ce_loss": 0.0621790774166584, "step": 15860 }, { "epoch": 5.290193462308205, "loss": 0.2303938865661621, "step": 15860 }, { "ce_loss": 0.0584101639688015, "epoch": 5.290193462308205, "step": 15860 }, { "distill_loss": 0.0883975550532341, "epoch": 5.290193462308205, "step": 15860 }, { "epoch": 5.290193462308205, "ref_ce_loss": 0.05530661344528198, "step": 15860 }, { "epoch": 5.290193462308205, "loss": 0.4860450029373169, "step": 15860 }, { "ce_loss": 0.07743857055902481, "epoch": 5.290193462308205, "step": 15860 }, { "distill_loss": 0.09735628217458725, "epoch": 5.290193462308205, "step": 15860 }, { "epoch": 5.290193462308205, "ref_ce_loss": 0.10022785514593124, "step": 15860 }, { "epoch": 5.290193462308205, "loss": 0.25716519355773926, "step": 15860 }, { "ce_loss": 0.06579107791185379, "epoch": 5.290193462308205, "step": 15860 }, { "distill_loss": 0.09430144727230072, "epoch": 5.290193462308205, "step": 15860 }, { "epoch": 5.290193462308205, "ref_ce_loss": 0.05018524080514908, "step": 15860 }, { "epoch": 5.293529019346231, "loss": 0.3463, "step": 15870 }, { "epoch": 5.293529019346231, "grad_norm": 2.370684862136841, "step": 15870 }, { "epoch": 5.293529019346231, "learning_rate": 8.138935382377653e-05, "step": 15870 }, { "epoch": 5.293529019346231, "loss": 0.24651719629764557, "step": 15870 }, { "ce_loss": 0.08347075432538986, "epoch": 5.293529019346231, "step": 15870 }, { "distill_loss": 0.10999614000320435, "epoch": 5.293529019346231, "step": 15870 }, { "epoch": 5.293529019346231, "ref_ce_loss": 0.052934352308511734, "step": 15870 }, { "epoch": 5.293529019346231, "loss": 0.49767571687698364, "step": 15870 }, { "ce_loss": 0.11719560623168945, "epoch": 5.293529019346231, "step": 15870 }, { "distill_loss": 0.13959640264511108, "epoch": 5.293529019346231, "step": 15870 }, { "epoch": 5.293529019346231, "ref_ce_loss": 0.12286118417978287, "step": 15870 }, { "epoch": 5.293529019346231, "loss": 0.3884763717651367, "step": 15870 }, { "ce_loss": 0.07949317991733551, "epoch": 5.293529019346231, "step": 15870 }, { "distill_loss": 0.13388612866401672, "epoch": 5.293529019346231, "step": 15870 }, { "epoch": 5.293529019346231, "ref_ce_loss": 0.10045711696147919, "step": 15870 }, { "epoch": 5.293529019346231, "loss": 0.24368393421173096, "step": 15870 }, { "ce_loss": 0.05252111330628395, "epoch": 5.293529019346231, "step": 15870 }, { "distill_loss": 0.1036781445145607, "epoch": 5.293529019346231, "step": 15870 }, { "epoch": 5.293529019346231, "ref_ce_loss": 0.05971752852201462, "step": 15870 }, { "epoch": 5.296864576384256, "loss": 0.2945, "step": 15880 }, { "epoch": 5.296864576384256, "grad_norm": 1.8943488597869873, "step": 15880 }, { "epoch": 5.296864576384256, "learning_rate": 8.120928689882166e-05, "step": 15880 }, { "epoch": 5.296864576384256, "loss": 0.6232532262802124, "step": 15880 }, { "ce_loss": 0.0998808890581131, "epoch": 5.296864576384256, "step": 15880 }, { "distill_loss": 0.09275517612695694, "epoch": 5.296864576384256, "step": 15880 }, { "epoch": 5.296864576384256, "ref_ce_loss": 0.08853812515735626, "step": 15880 }, { "epoch": 5.296864576384256, "loss": 0.45832279324531555, "step": 15880 }, { "ce_loss": 0.02765379287302494, "epoch": 5.296864576384256, "step": 15880 }, { "distill_loss": 0.12325002253055573, "epoch": 5.296864576384256, "step": 15880 }, { "epoch": 5.296864576384256, "ref_ce_loss": 0.09086009860038757, "step": 15880 }, { "epoch": 5.296864576384256, "loss": 0.3936709761619568, "step": 15880 }, { "ce_loss": 0.09706886112689972, "epoch": 5.296864576384256, "step": 15880 }, { "distill_loss": 0.11568113416433334, "epoch": 5.296864576384256, "step": 15880 }, { "epoch": 5.296864576384256, "ref_ce_loss": 0.08714719116687775, "step": 15880 }, { "epoch": 5.296864576384256, "loss": 0.3076615035533905, "step": 15880 }, { "ce_loss": 0.09075099229812622, "epoch": 5.296864576384256, "step": 15880 }, { "distill_loss": 0.10523568093776703, "epoch": 5.296864576384256, "step": 15880 }, { "epoch": 5.296864576384256, "ref_ce_loss": 0.08547863364219666, "step": 15880 }, { "epoch": 5.3002001334222815, "loss": 0.3435, "step": 15890 }, { "epoch": 5.3002001334222815, "grad_norm": 4.160302639007568, "step": 15890 }, { "epoch": 5.3002001334222815, "learning_rate": 8.102934542099176e-05, "step": 15890 }, { "epoch": 5.3002001334222815, "loss": 0.3581370413303375, "step": 15890 }, { "ce_loss": 0.0933266431093216, "epoch": 5.3002001334222815, "step": 15890 }, { "distill_loss": 0.1521734744310379, "epoch": 5.3002001334222815, "step": 15890 }, { "epoch": 5.3002001334222815, "ref_ce_loss": 0.0629705861210823, "step": 15890 }, { "epoch": 5.3002001334222815, "loss": 0.37069106101989746, "step": 15890 }, { "ce_loss": 0.05081234127283096, "epoch": 5.3002001334222815, "step": 15890 }, { "distill_loss": 0.09451035410165787, "epoch": 5.3002001334222815, "step": 15890 }, { "epoch": 5.3002001334222815, "ref_ce_loss": 0.08984711766242981, "step": 15890 }, { "epoch": 5.3002001334222815, "loss": 0.4759184420108795, "step": 15890 }, { "ce_loss": 0.05965457856655121, "epoch": 5.3002001334222815, "step": 15890 }, { "distill_loss": 0.09128181636333466, "epoch": 5.3002001334222815, "step": 15890 }, { "epoch": 5.3002001334222815, "ref_ce_loss": 0.08862599730491638, "step": 15890 }, { "epoch": 5.3002001334222815, "loss": 0.24063745141029358, "step": 15890 }, { "ce_loss": 0.06953100860118866, "epoch": 5.3002001334222815, "step": 15890 }, { "distill_loss": 0.111015185713768, "epoch": 5.3002001334222815, "step": 15890 }, { "epoch": 5.3002001334222815, "ref_ce_loss": 0.06000909581780434, "step": 15890 }, { "epoch": 5.303535690460307, "loss": 0.3347, "step": 15900 }, { "epoch": 5.303535690460307, "grad_norm": 3.271259307861328, "step": 15900 }, { "epoch": 5.303535690460307, "learning_rate": 8.08495297184292e-05, "step": 15900 }, { "epoch": 5.303535690460307, "loss": 0.2543596029281616, "step": 15900 }, { "ce_loss": 0.038422923535108566, "epoch": 5.303535690460307, "step": 15900 }, { "distill_loss": 0.11588691174983978, "epoch": 5.303535690460307, "step": 15900 }, { "epoch": 5.303535690460307, "ref_ce_loss": 0.0747591182589531, "step": 15900 }, { "epoch": 5.303535690460307, "loss": 0.19756633043289185, "step": 15900 }, { "ce_loss": 0.03136131539940834, "epoch": 5.303535690460307, "step": 15900 }, { "distill_loss": 0.09700172394514084, "epoch": 5.303535690460307, "step": 15900 }, { "epoch": 5.303535690460307, "ref_ce_loss": 0.053317707031965256, "step": 15900 }, { "epoch": 5.303535690460307, "loss": 0.4816906452178955, "step": 15900 }, { "ce_loss": 0.12503883242607117, "epoch": 5.303535690460307, "step": 15900 }, { "distill_loss": 0.14960643649101257, "epoch": 5.303535690460307, "step": 15900 }, { "epoch": 5.303535690460307, "ref_ce_loss": 0.06251875311136246, "step": 15900 }, { "epoch": 5.303535690460307, "loss": 0.30442604422569275, "step": 15900 }, { "ce_loss": 0.09769739955663681, "epoch": 5.303535690460307, "step": 15900 }, { "distill_loss": 0.09515450149774551, "epoch": 5.303535690460307, "step": 15900 }, { "epoch": 5.303535690460307, "ref_ce_loss": 0.09030892699956894, "step": 15900 }, { "epoch": 5.306871247498332, "loss": 0.3591, "step": 15910 }, { "epoch": 5.306871247498332, "grad_norm": 3.0482804775238037, "step": 15910 }, { "epoch": 5.306871247498332, "learning_rate": 8.066984011904669e-05, "step": 15910 }, { "epoch": 5.306871247498332, "loss": 0.2710125744342804, "step": 15910 }, { "ce_loss": 0.06486635655164719, "epoch": 5.306871247498332, "step": 15910 }, { "distill_loss": 0.0854596421122551, "epoch": 5.306871247498332, "step": 15910 }, { "epoch": 5.306871247498332, "ref_ce_loss": 0.0646849274635315, "step": 15910 }, { "epoch": 5.306871247498332, "loss": 0.3071233332157135, "step": 15910 }, { "ce_loss": 0.05939917638897896, "epoch": 5.306871247498332, "step": 15910 }, { "distill_loss": 0.09524413198232651, "epoch": 5.306871247498332, "step": 15910 }, { "epoch": 5.306871247498332, "ref_ce_loss": 0.06739193946123123, "step": 15910 }, { "epoch": 5.306871247498332, "loss": 0.2773936688899994, "step": 15910 }, { "ce_loss": 0.028691411018371582, "epoch": 5.306871247498332, "step": 15910 }, { "distill_loss": 0.09702437371015549, "epoch": 5.306871247498332, "step": 15910 }, { "epoch": 5.306871247498332, "ref_ce_loss": 0.09791380912065506, "step": 15910 }, { "epoch": 5.306871247498332, "loss": 0.251413494348526, "step": 15910 }, { "ce_loss": 0.06908174604177475, "epoch": 5.306871247498332, "step": 15910 }, { "distill_loss": 0.09879547357559204, "epoch": 5.306871247498332, "step": 15910 }, { "epoch": 5.306871247498332, "ref_ce_loss": 0.05396436154842377, "step": 15910 }, { "epoch": 5.3102068045363575, "loss": 0.3403, "step": 15920 }, { "epoch": 5.3102068045363575, "grad_norm": 3.299243927001953, "step": 15920 }, { "epoch": 5.3102068045363575, "learning_rate": 8.049027695052733e-05, "step": 15920 }, { "epoch": 5.3102068045363575, "loss": 0.32065755128860474, "step": 15920 }, { "ce_loss": 0.10038189589977264, "epoch": 5.3102068045363575, "step": 15920 }, { "distill_loss": 0.12436893582344055, "epoch": 5.3102068045363575, "step": 15920 }, { "epoch": 5.3102068045363575, "ref_ce_loss": 0.0547565296292305, "step": 15920 }, { "epoch": 5.3102068045363575, "loss": 0.8609752058982849, "step": 15920 }, { "ce_loss": 0.1599891632795334, "epoch": 5.3102068045363575, "step": 15920 }, { "distill_loss": 0.16662029922008514, "epoch": 5.3102068045363575, "step": 15920 }, { "epoch": 5.3102068045363575, "ref_ce_loss": 0.11310508102178574, "step": 15920 }, { "epoch": 5.3102068045363575, "loss": 0.28081509470939636, "step": 15920 }, { "ce_loss": 0.0595562569797039, "epoch": 5.3102068045363575, "step": 15920 }, { "distill_loss": 0.12638959288597107, "epoch": 5.3102068045363575, "step": 15920 }, { "epoch": 5.3102068045363575, "ref_ce_loss": 0.09455878287553787, "step": 15920 }, { "epoch": 5.3102068045363575, "loss": 0.45147788524627686, "step": 15920 }, { "ce_loss": 0.08775141090154648, "epoch": 5.3102068045363575, "step": 15920 }, { "distill_loss": 0.10882918536663055, "epoch": 5.3102068045363575, "step": 15920 }, { "epoch": 5.3102068045363575, "ref_ce_loss": 0.10873819887638092, "step": 15920 }, { "epoch": 5.313542361574383, "loss": 0.3594, "step": 15930 }, { "epoch": 5.313542361574383, "grad_norm": 2.0879247188568115, "step": 15930 }, { "epoch": 5.313542361574383, "learning_rate": 8.031084054032346e-05, "step": 15930 }, { "epoch": 5.313542361574383, "loss": 0.4031994938850403, "step": 15930 }, { "ce_loss": 0.05465778335928917, "epoch": 5.313542361574383, "step": 15930 }, { "distill_loss": 0.14434510469436646, "epoch": 5.313542361574383, "step": 15930 }, { "epoch": 5.313542361574383, "ref_ce_loss": 0.07141335308551788, "step": 15930 }, { "epoch": 5.313542361574383, "loss": 0.20144657790660858, "step": 15930 }, { "ce_loss": 0.056741055101156235, "epoch": 5.313542361574383, "step": 15930 }, { "distill_loss": 0.08500836789608002, "epoch": 5.313542361574383, "step": 15930 }, { "epoch": 5.313542361574383, "ref_ce_loss": 0.0596328042447567, "step": 15930 }, { "epoch": 5.313542361574383, "loss": 0.33083662390708923, "step": 15930 }, { "ce_loss": 0.05826576426625252, "epoch": 5.313542361574383, "step": 15930 }, { "distill_loss": 0.14457902312278748, "epoch": 5.313542361574383, "step": 15930 }, { "epoch": 5.313542361574383, "ref_ce_loss": 0.0672142505645752, "step": 15930 }, { "epoch": 5.313542361574383, "loss": 0.4372578263282776, "step": 15930 }, { "ce_loss": 0.11812935769557953, "epoch": 5.313542361574383, "step": 15930 }, { "distill_loss": 0.1362246423959732, "epoch": 5.313542361574383, "step": 15930 }, { "epoch": 5.313542361574383, "ref_ce_loss": 0.09717027097940445, "step": 15930 }, { "epoch": 5.316877918612408, "loss": 0.3511, "step": 15940 }, { "epoch": 5.316877918612408, "grad_norm": 1.8663395643234253, "step": 15940 }, { "epoch": 5.316877918612408, "learning_rate": 8.013153121565628e-05, "step": 15940 }, { "epoch": 5.316877918612408, "loss": 0.26273471117019653, "step": 15940 }, { "ce_loss": 0.055545687675476074, "epoch": 5.316877918612408, "step": 15940 }, { "distill_loss": 0.11238569021224976, "epoch": 5.316877918612408, "step": 15940 }, { "epoch": 5.316877918612408, "ref_ce_loss": 0.07137372344732285, "step": 15940 }, { "epoch": 5.316877918612408, "loss": 0.37487444281578064, "step": 15940 }, { "ce_loss": 0.05269518867135048, "epoch": 5.316877918612408, "step": 15940 }, { "distill_loss": 0.11019299924373627, "epoch": 5.316877918612408, "step": 15940 }, { "epoch": 5.316877918612408, "ref_ce_loss": 0.09977027773857117, "step": 15940 }, { "epoch": 5.316877918612408, "loss": 0.35324230790138245, "step": 15940 }, { "ce_loss": 0.057631898671388626, "epoch": 5.316877918612408, "step": 15940 }, { "distill_loss": 0.09934467077255249, "epoch": 5.316877918612408, "step": 15940 }, { "epoch": 5.316877918612408, "ref_ce_loss": 0.07707151770591736, "step": 15940 }, { "epoch": 5.316877918612408, "loss": 0.29354506731033325, "step": 15940 }, { "ce_loss": 0.06054156646132469, "epoch": 5.316877918612408, "step": 15940 }, { "distill_loss": 0.11593253910541534, "epoch": 5.316877918612408, "step": 15940 }, { "epoch": 5.316877918612408, "ref_ce_loss": 0.09283595532178879, "step": 15940 }, { "epoch": 5.320213475650434, "loss": 0.3001, "step": 15950 }, { "epoch": 5.320213475650434, "grad_norm": 2.898165464401245, "step": 15950 }, { "epoch": 5.320213475650434, "learning_rate": 7.995234930351538e-05, "step": 15950 }, { "epoch": 5.320213475650434, "loss": 0.3119295835494995, "step": 15950 }, { "ce_loss": 0.1078239381313324, "epoch": 5.320213475650434, "step": 15950 }, { "distill_loss": 0.1365964561700821, "epoch": 5.320213475650434, "step": 15950 }, { "epoch": 5.320213475650434, "ref_ce_loss": 0.06740289181470871, "step": 15950 }, { "epoch": 5.320213475650434, "loss": 0.23001202940940857, "step": 15950 }, { "ce_loss": 0.05635315924882889, "epoch": 5.320213475650434, "step": 15950 }, { "distill_loss": 0.09114664047956467, "epoch": 5.320213475650434, "step": 15950 }, { "epoch": 5.320213475650434, "ref_ce_loss": 0.056779682636260986, "step": 15950 }, { "epoch": 5.320213475650434, "loss": 0.30822890996932983, "step": 15950 }, { "ce_loss": 0.1031709685921669, "epoch": 5.320213475650434, "step": 15950 }, { "distill_loss": 0.12962231040000916, "epoch": 5.320213475650434, "step": 15950 }, { "epoch": 5.320213475650434, "ref_ce_loss": 0.05823402479290962, "step": 15950 }, { "epoch": 5.320213475650434, "loss": 0.2623670995235443, "step": 15950 }, { "ce_loss": 0.06514668464660645, "epoch": 5.320213475650434, "step": 15950 }, { "distill_loss": 0.09800301492214203, "epoch": 5.320213475650434, "step": 15950 }, { "epoch": 5.320213475650434, "ref_ce_loss": 0.07522417604923248, "step": 15950 }, { "epoch": 5.323549032688459, "loss": 0.357, "step": 15960 }, { "epoch": 5.323549032688459, "grad_norm": 1.9690219163894653, "step": 15960 }, { "epoch": 5.323549032688459, "learning_rate": 7.977329513065774e-05, "step": 15960 }, { "epoch": 5.323549032688459, "loss": 0.26475730538368225, "step": 15960 }, { "ce_loss": 0.05676068738102913, "epoch": 5.323549032688459, "step": 15960 }, { "distill_loss": 0.1186981201171875, "epoch": 5.323549032688459, "step": 15960 }, { "epoch": 5.323549032688459, "ref_ce_loss": 0.0697123259305954, "step": 15960 }, { "epoch": 5.323549032688459, "loss": 0.2583927512168884, "step": 15960 }, { "ce_loss": 0.042958781123161316, "epoch": 5.323549032688459, "step": 15960 }, { "distill_loss": 0.11501401662826538, "epoch": 5.323549032688459, "step": 15960 }, { "epoch": 5.323549032688459, "ref_ce_loss": 0.05779067054390907, "step": 15960 }, { "epoch": 5.323549032688459, "loss": 0.22738508880138397, "step": 15960 }, { "ce_loss": 0.07362529635429382, "epoch": 5.323549032688459, "step": 15960 }, { "distill_loss": 0.10928896069526672, "epoch": 5.323549032688459, "step": 15960 }, { "epoch": 5.323549032688459, "ref_ce_loss": 0.044383592903614044, "step": 15960 }, { "epoch": 5.323549032688459, "loss": 0.3220760226249695, "step": 15960 }, { "ce_loss": 0.06132000312209129, "epoch": 5.323549032688459, "step": 15960 }, { "distill_loss": 0.10518957674503326, "epoch": 5.323549032688459, "step": 15960 }, { "epoch": 5.323549032688459, "ref_ce_loss": 0.09508439898490906, "step": 15960 }, { "epoch": 5.326884589726484, "loss": 0.2984, "step": 15970 }, { "epoch": 5.326884589726484, "grad_norm": 3.3353259563446045, "step": 15970 }, { "epoch": 5.326884589726484, "learning_rate": 7.959436902360762e-05, "step": 15970 }, { "epoch": 5.326884589726484, "loss": 0.32827815413475037, "step": 15970 }, { "ce_loss": 0.12878939509391785, "epoch": 5.326884589726484, "step": 15970 }, { "distill_loss": 0.12784335017204285, "epoch": 5.326884589726484, "step": 15970 }, { "epoch": 5.326884589726484, "ref_ce_loss": 0.05205840244889259, "step": 15970 }, { "epoch": 5.326884589726484, "loss": 0.1396113932132721, "step": 15970 }, { "ce_loss": 0.014687249436974525, "epoch": 5.326884589726484, "step": 15970 }, { "distill_loss": 0.07145895808935165, "epoch": 5.326884589726484, "step": 15970 }, { "epoch": 5.326884589726484, "ref_ce_loss": 0.05341213196516037, "step": 15970 }, { "epoch": 5.326884589726484, "loss": 0.25794681906700134, "step": 15970 }, { "ce_loss": 0.1045415922999382, "epoch": 5.326884589726484, "step": 15970 }, { "distill_loss": 0.09720547497272491, "epoch": 5.326884589726484, "step": 15970 }, { "epoch": 5.326884589726484, "ref_ce_loss": 0.055952686816453934, "step": 15970 }, { "epoch": 5.326884589726484, "loss": 0.33006682991981506, "step": 15970 }, { "ce_loss": 0.07984444499015808, "epoch": 5.326884589726484, "step": 15970 }, { "distill_loss": 0.13971658051013947, "epoch": 5.326884589726484, "step": 15970 }, { "epoch": 5.326884589726484, "ref_ce_loss": 0.1103811264038086, "step": 15970 }, { "epoch": 5.33022014676451, "loss": 0.3209, "step": 15980 }, { "epoch": 5.33022014676451, "grad_norm": 3.469141721725464, "step": 15980 }, { "epoch": 5.33022014676451, "learning_rate": 7.941557130865565e-05, "step": 15980 }, { "epoch": 5.33022014676451, "loss": 0.42776209115982056, "step": 15980 }, { "ce_loss": 0.04647199064493179, "epoch": 5.33022014676451, "step": 15980 }, { "distill_loss": 0.11702238768339157, "epoch": 5.33022014676451, "step": 15980 }, { "epoch": 5.33022014676451, "ref_ce_loss": 0.10681089758872986, "step": 15980 }, { "epoch": 5.33022014676451, "loss": 0.13588321208953857, "step": 15980 }, { "ce_loss": 0.013034440577030182, "epoch": 5.33022014676451, "step": 15980 }, { "distill_loss": 0.06615424901247025, "epoch": 5.33022014676451, "step": 15980 }, { "epoch": 5.33022014676451, "ref_ce_loss": 0.04182055965065956, "step": 15980 }, { "epoch": 5.33022014676451, "loss": 0.35110753774642944, "step": 15980 }, { "ce_loss": 0.08016197383403778, "epoch": 5.33022014676451, "step": 15980 }, { "distill_loss": 0.11940719187259674, "epoch": 5.33022014676451, "step": 15980 }, { "epoch": 5.33022014676451, "ref_ce_loss": 0.057936813682317734, "step": 15980 }, { "epoch": 5.33022014676451, "loss": 0.6050047874450684, "step": 15980 }, { "ce_loss": 0.0430837981402874, "epoch": 5.33022014676451, "step": 15980 }, { "distill_loss": 0.12484762072563171, "epoch": 5.33022014676451, "step": 15980 }, { "epoch": 5.33022014676451, "ref_ce_loss": 0.08736221492290497, "step": 15980 }, { "epoch": 5.333555703802535, "loss": 0.3415, "step": 15990 }, { "epoch": 5.333555703802535, "grad_norm": 2.1499571800231934, "step": 15990 }, { "epoch": 5.333555703802535, "learning_rate": 7.923690231185833e-05, "step": 15990 }, { "epoch": 5.333555703802535, "loss": 0.6607826948165894, "step": 15990 }, { "ce_loss": 0.055438101291656494, "epoch": 5.333555703802535, "step": 15990 }, { "distill_loss": 0.14260601997375488, "epoch": 5.333555703802535, "step": 15990 }, { "epoch": 5.333555703802535, "ref_ce_loss": 0.08038537204265594, "step": 15990 }, { "epoch": 5.333555703802535, "loss": 0.5964380502700806, "step": 15990 }, { "ce_loss": 0.07627970725297928, "epoch": 5.333555703802535, "step": 15990 }, { "distill_loss": 0.09748944640159607, "epoch": 5.333555703802535, "step": 15990 }, { "epoch": 5.333555703802535, "ref_ce_loss": 0.10148187726736069, "step": 15990 }, { "epoch": 5.333555703802535, "loss": 0.3148542642593384, "step": 15990 }, { "ce_loss": 0.08906195312738419, "epoch": 5.333555703802535, "step": 15990 }, { "distill_loss": 0.11657960712909698, "epoch": 5.333555703802535, "step": 15990 }, { "epoch": 5.333555703802535, "ref_ce_loss": 0.07575056701898575, "step": 15990 }, { "epoch": 5.333555703802535, "loss": 0.28883641958236694, "step": 15990 }, { "ce_loss": 0.05575671046972275, "epoch": 5.333555703802535, "step": 15990 }, { "distill_loss": 0.12796026468276978, "epoch": 5.333555703802535, "step": 15990 }, { "epoch": 5.333555703802535, "ref_ce_loss": 0.050566114485263824, "step": 15990 }, { "epoch": 5.33689126084056, "loss": 0.3842, "step": 16000 }, { "epoch": 5.33689126084056, "grad_norm": 4.591987133026123, "step": 16000 }, { "epoch": 5.33689126084056, "learning_rate": 7.905836235903747e-05, "step": 16000 }, { "epoch": 5.33689126084056, "loss": 0.304741770029068, "step": 16000 }, { "ce_loss": 0.09324005246162415, "epoch": 5.33689126084056, "step": 16000 }, { "distill_loss": 0.11520503461360931, "epoch": 5.33689126084056, "step": 16000 }, { "epoch": 5.33689126084056, "ref_ce_loss": 0.0958837941288948, "step": 16000 }, { "epoch": 5.33689126084056, "loss": 0.2455434650182724, "step": 16000 }, { "ce_loss": 0.05510518327355385, "epoch": 5.33689126084056, "step": 16000 }, { "distill_loss": 0.11733116209506989, "epoch": 5.33689126084056, "step": 16000 }, { "epoch": 5.33689126084056, "ref_ce_loss": 0.03748934715986252, "step": 16000 }, { "epoch": 5.33689126084056, "loss": 0.2709517776966095, "step": 16000 }, { "ce_loss": 0.09849530458450317, "epoch": 5.33689126084056, "step": 16000 }, { "distill_loss": 0.0976298451423645, "epoch": 5.33689126084056, "step": 16000 }, { "epoch": 5.33689126084056, "ref_ce_loss": 0.06337139010429382, "step": 16000 }, { "epoch": 5.33689126084056, "loss": 0.35590457916259766, "step": 16000 }, { "ce_loss": 0.08008065819740295, "epoch": 5.33689126084056, "step": 16000 }, { "distill_loss": 0.10405315458774567, "epoch": 5.33689126084056, "step": 16000 }, { "epoch": 5.33689126084056, "ref_ce_loss": 0.09750795364379883, "step": 16000 }, { "epoch": 5.340226817878586, "loss": 0.3149, "step": 16010 }, { "epoch": 5.340226817878586, "grad_norm": 2.3251688480377197, "step": 16010 }, { "epoch": 5.340226817878586, "learning_rate": 7.887995177577942e-05, "step": 16010 }, { "epoch": 5.340226817878586, "loss": 0.3700042963027954, "step": 16010 }, { "ce_loss": 0.07157202810049057, "epoch": 5.340226817878586, "step": 16010 }, { "distill_loss": 0.11362545937299728, "epoch": 5.340226817878586, "step": 16010 }, { "epoch": 5.340226817878586, "ref_ce_loss": 0.05933113023638725, "step": 16010 }, { "epoch": 5.340226817878586, "loss": 0.6948315501213074, "step": 16010 }, { "ce_loss": 0.08553972840309143, "epoch": 5.340226817878586, "step": 16010 }, { "distill_loss": 0.1357281357049942, "epoch": 5.340226817878586, "step": 16010 }, { "epoch": 5.340226817878586, "ref_ce_loss": 0.07757649570703506, "step": 16010 }, { "epoch": 5.340226817878586, "loss": 0.18372049927711487, "step": 16010 }, { "ce_loss": 0.00982450321316719, "epoch": 5.340226817878586, "step": 16010 }, { "distill_loss": 0.0782545730471611, "epoch": 5.340226817878586, "step": 16010 }, { "epoch": 5.340226817878586, "ref_ce_loss": 0.05903314799070358, "step": 16010 }, { "epoch": 5.340226817878586, "loss": 0.5596423149108887, "step": 16010 }, { "ce_loss": 0.07517491281032562, "epoch": 5.340226817878586, "step": 16010 }, { "distill_loss": 0.1120464950799942, "epoch": 5.340226817878586, "step": 16010 }, { "epoch": 5.340226817878586, "ref_ce_loss": 0.1148577556014061, "step": 16010 }, { "epoch": 5.343562374916611, "loss": 0.3711, "step": 16020 }, { "epoch": 5.343562374916611, "grad_norm": 2.9900999069213867, "step": 16020 }, { "epoch": 5.343562374916611, "learning_rate": 7.870167088743476e-05, "step": 16020 }, { "epoch": 5.343562374916611, "loss": 0.29928722977638245, "step": 16020 }, { "ce_loss": 0.10394757241010666, "epoch": 5.343562374916611, "step": 16020 }, { "distill_loss": 0.12747430801391602, "epoch": 5.343562374916611, "step": 16020 }, { "epoch": 5.343562374916611, "ref_ce_loss": 0.06767245382070541, "step": 16020 }, { "epoch": 5.343562374916611, "loss": 0.5128778219223022, "step": 16020 }, { "ce_loss": 0.10501803457736969, "epoch": 5.343562374916611, "step": 16020 }, { "distill_loss": 0.20294877886772156, "epoch": 5.343562374916611, "step": 16020 }, { "epoch": 5.343562374916611, "ref_ce_loss": 0.08334807306528091, "step": 16020 }, { "epoch": 5.343562374916611, "loss": 0.2776087820529938, "step": 16020 }, { "ce_loss": 0.07922020554542542, "epoch": 5.343562374916611, "step": 16020 }, { "distill_loss": 0.13950836658477783, "epoch": 5.343562374916611, "step": 16020 }, { "epoch": 5.343562374916611, "ref_ce_loss": 0.05875676870346069, "step": 16020 }, { "epoch": 5.343562374916611, "loss": 0.5638596415519714, "step": 16020 }, { "ce_loss": 0.1316651850938797, "epoch": 5.343562374916611, "step": 16020 }, { "distill_loss": 0.17273733019828796, "epoch": 5.343562374916611, "step": 16020 }, { "epoch": 5.343562374916611, "ref_ce_loss": 0.10516467690467834, "step": 16020 }, { "epoch": 5.346897931954636, "loss": 0.3894, "step": 16030 }, { "epoch": 5.346897931954636, "grad_norm": 2.5836825370788574, "step": 16030 }, { "epoch": 5.346897931954636, "learning_rate": 7.852352001911752e-05, "step": 16030 }, { "epoch": 5.346897931954636, "loss": 0.45821037888526917, "step": 16030 }, { "ce_loss": 0.05548872798681259, "epoch": 5.346897931954636, "step": 16030 }, { "distill_loss": 0.09916659444570541, "epoch": 5.346897931954636, "step": 16030 }, { "epoch": 5.346897931954636, "ref_ce_loss": 0.05415504053235054, "step": 16030 }, { "epoch": 5.346897931954636, "loss": 0.3140331208705902, "step": 16030 }, { "ce_loss": 0.08456014841794968, "epoch": 5.346897931954636, "step": 16030 }, { "distill_loss": 0.10989885777235031, "epoch": 5.346897931954636, "step": 16030 }, { "epoch": 5.346897931954636, "ref_ce_loss": 0.056715261191129684, "step": 16030 }, { "epoch": 5.346897931954636, "loss": 0.26189103722572327, "step": 16030 }, { "ce_loss": 0.062468308955430984, "epoch": 5.346897931954636, "step": 16030 }, { "distill_loss": 0.11793512850999832, "epoch": 5.346897931954636, "step": 16030 }, { "epoch": 5.346897931954636, "ref_ce_loss": 0.08089719712734222, "step": 16030 }, { "epoch": 5.346897931954636, "loss": 0.23085886240005493, "step": 16030 }, { "ce_loss": 0.06034333258867264, "epoch": 5.346897931954636, "step": 16030 }, { "distill_loss": 0.10786504298448563, "epoch": 5.346897931954636, "step": 16030 }, { "epoch": 5.346897931954636, "ref_ce_loss": 0.04875631257891655, "step": 16030 }, { "epoch": 5.350233488992662, "loss": 0.3485, "step": 16040 }, { "epoch": 5.350233488992662, "grad_norm": 2.6916708946228027, "step": 16040 }, { "epoch": 5.350233488992662, "learning_rate": 7.834549949570459e-05, "step": 16040 }, { "epoch": 5.350233488992662, "loss": 0.23726388812065125, "step": 16040 }, { "ce_loss": 0.058267880231142044, "epoch": 5.350233488992662, "step": 16040 }, { "distill_loss": 0.11253169924020767, "epoch": 5.350233488992662, "step": 16040 }, { "epoch": 5.350233488992662, "ref_ce_loss": 0.06635519862174988, "step": 16040 }, { "epoch": 5.350233488992662, "loss": 0.3128780126571655, "step": 16040 }, { "ce_loss": 0.06482836604118347, "epoch": 5.350233488992662, "step": 16040 }, { "distill_loss": 0.117494598031044, "epoch": 5.350233488992662, "step": 16040 }, { "epoch": 5.350233488992662, "ref_ce_loss": 0.0603063702583313, "step": 16040 }, { "epoch": 5.350233488992662, "loss": 0.2820833623409271, "step": 16040 }, { "ce_loss": 0.08603330701589584, "epoch": 5.350233488992662, "step": 16040 }, { "distill_loss": 0.1037563756108284, "epoch": 5.350233488992662, "step": 16040 }, { "epoch": 5.350233488992662, "ref_ce_loss": 0.09191624075174332, "step": 16040 }, { "epoch": 5.350233488992662, "loss": 0.2623083293437958, "step": 16040 }, { "ce_loss": 0.06306131929159164, "epoch": 5.350233488992662, "step": 16040 }, { "distill_loss": 0.10764463245868683, "epoch": 5.350233488992662, "step": 16040 }, { "epoch": 5.350233488992662, "ref_ce_loss": 0.06455370038747787, "step": 16040 }, { "epoch": 5.353569046030687, "loss": 0.3229, "step": 16050 }, { "epoch": 5.353569046030687, "grad_norm": 1.7106281518936157, "step": 16050 }, { "epoch": 5.353569046030687, "learning_rate": 7.816760964183524e-05, "step": 16050 }, { "epoch": 5.353569046030687, "loss": 0.29164308309555054, "step": 16050 }, { "ce_loss": 0.013757818378508091, "epoch": 5.353569046030687, "step": 16050 }, { "distill_loss": 0.1073288768529892, "epoch": 5.353569046030687, "step": 16050 }, { "epoch": 5.353569046030687, "ref_ce_loss": 0.05741368606686592, "step": 16050 }, { "epoch": 5.353569046030687, "loss": 0.3547406494617462, "step": 16050 }, { "ce_loss": 0.03559216111898422, "epoch": 5.353569046030687, "step": 16050 }, { "distill_loss": 0.11831948161125183, "epoch": 5.353569046030687, "step": 16050 }, { "epoch": 5.353569046030687, "ref_ce_loss": 0.05722172558307648, "step": 16050 }, { "epoch": 5.353569046030687, "loss": 0.2418895661830902, "step": 16050 }, { "ce_loss": 0.03584042191505432, "epoch": 5.353569046030687, "step": 16050 }, { "distill_loss": 0.10411054641008377, "epoch": 5.353569046030687, "step": 16050 }, { "epoch": 5.353569046030687, "ref_ce_loss": 0.06963283568620682, "step": 16050 }, { "epoch": 5.353569046030687, "loss": 0.4107963442802429, "step": 16050 }, { "ce_loss": 0.14093990623950958, "epoch": 5.353569046030687, "step": 16050 }, { "distill_loss": 0.16926631331443787, "epoch": 5.353569046030687, "step": 16050 }, { "epoch": 5.353569046030687, "ref_ce_loss": 0.0756896585226059, "step": 16050 }, { "epoch": 5.356904603068712, "loss": 0.289, "step": 16060 }, { "epoch": 5.356904603068712, "grad_norm": 1.9935156106948853, "step": 16060 }, { "epoch": 5.356904603068712, "learning_rate": 7.798985078191028e-05, "step": 16060 }, { "epoch": 5.356904603068712, "loss": 0.2680877149105072, "step": 16060 }, { "ce_loss": 0.038785114884376526, "epoch": 5.356904603068712, "step": 16060 }, { "distill_loss": 0.10870185494422913, "epoch": 5.356904603068712, "step": 16060 }, { "epoch": 5.356904603068712, "ref_ce_loss": 0.0915147140622139, "step": 16060 }, { "epoch": 5.356904603068712, "loss": 0.23653815686702728, "step": 16060 }, { "ce_loss": 0.04102471470832825, "epoch": 5.356904603068712, "step": 16060 }, { "distill_loss": 0.11374015361070633, "epoch": 5.356904603068712, "step": 16060 }, { "epoch": 5.356904603068712, "ref_ce_loss": 0.08148464560508728, "step": 16060 }, { "epoch": 5.356904603068712, "loss": 0.2153933346271515, "step": 16060 }, { "ce_loss": 0.04689103737473488, "epoch": 5.356904603068712, "step": 16060 }, { "distill_loss": 0.08903343975543976, "epoch": 5.356904603068712, "step": 16060 }, { "epoch": 5.356904603068712, "ref_ce_loss": 0.05705301836133003, "step": 16060 }, { "epoch": 5.356904603068712, "loss": 0.3498300611972809, "step": 16060 }, { "ce_loss": 0.08931352943181992, "epoch": 5.356904603068712, "step": 16060 }, { "distill_loss": 0.12247570604085922, "epoch": 5.356904603068712, "step": 16060 }, { "epoch": 5.356904603068712, "ref_ce_loss": 0.06273306906223297, "step": 16060 }, { "epoch": 5.360240160106738, "loss": 0.3439, "step": 16070 }, { "epoch": 5.360240160106738, "grad_norm": 4.033847808837891, "step": 16070 }, { "epoch": 5.360240160106738, "learning_rate": 7.781222324009181e-05, "step": 16070 }, { "epoch": 5.360240160106738, "loss": 0.3406941592693329, "step": 16070 }, { "ce_loss": 0.052824467420578, "epoch": 5.360240160106738, "step": 16070 }, { "distill_loss": 0.12347150593996048, "epoch": 5.360240160106738, "step": 16070 }, { "epoch": 5.360240160106738, "ref_ce_loss": 0.048340242356061935, "step": 16070 }, { "epoch": 5.360240160106738, "loss": 0.39220917224884033, "step": 16070 }, { "ce_loss": 0.09756062924861908, "epoch": 5.360240160106738, "step": 16070 }, { "distill_loss": 0.12279407680034637, "epoch": 5.360240160106738, "step": 16070 }, { "epoch": 5.360240160106738, "ref_ce_loss": 0.079924076795578, "step": 16070 }, { "epoch": 5.360240160106738, "loss": 0.31329527497291565, "step": 16070 }, { "ce_loss": 0.06221854314208031, "epoch": 5.360240160106738, "step": 16070 }, { "distill_loss": 0.12405749410390854, "epoch": 5.360240160106738, "step": 16070 }, { "epoch": 5.360240160106738, "ref_ce_loss": 0.08381244540214539, "step": 16070 }, { "epoch": 5.360240160106738, "loss": 0.32966047525405884, "step": 16070 }, { "ce_loss": 0.05017072707414627, "epoch": 5.360240160106738, "step": 16070 }, { "distill_loss": 0.08353301137685776, "epoch": 5.360240160106738, "step": 16070 }, { "epoch": 5.360240160106738, "ref_ce_loss": 0.06393556296825409, "step": 16070 }, { "epoch": 5.363575717144763, "loss": 0.3435, "step": 16080 }, { "epoch": 5.363575717144763, "grad_norm": 2.8178539276123047, "step": 16080 }, { "epoch": 5.363575717144763, "learning_rate": 7.763472734030239e-05, "step": 16080 }, { "epoch": 5.363575717144763, "loss": 0.5294066667556763, "step": 16080 }, { "ce_loss": 0.06446986645460129, "epoch": 5.363575717144763, "step": 16080 }, { "distill_loss": 0.11884837597608566, "epoch": 5.363575717144763, "step": 16080 }, { "epoch": 5.363575717144763, "ref_ce_loss": 0.07165850698947906, "step": 16080 }, { "epoch": 5.363575717144763, "loss": 0.25549739599227905, "step": 16080 }, { "ce_loss": 0.038696497678756714, "epoch": 5.363575717144763, "step": 16080 }, { "distill_loss": 0.0930166095495224, "epoch": 5.363575717144763, "step": 16080 }, { "epoch": 5.363575717144763, "ref_ce_loss": 0.09175001084804535, "step": 16080 }, { "epoch": 5.363575717144763, "loss": 0.2837029993534088, "step": 16080 }, { "ce_loss": 0.04583244025707245, "epoch": 5.363575717144763, "step": 16080 }, { "distill_loss": 0.10166701674461365, "epoch": 5.363575717144763, "step": 16080 }, { "epoch": 5.363575717144763, "ref_ce_loss": 0.0670868456363678, "step": 16080 }, { "epoch": 5.363575717144763, "loss": 0.27036380767822266, "step": 16080 }, { "ce_loss": 0.007987498305737972, "epoch": 5.363575717144763, "step": 16080 }, { "distill_loss": 0.08497060835361481, "epoch": 5.363575717144763, "step": 16080 }, { "epoch": 5.363575717144763, "ref_ce_loss": 0.05139423906803131, "step": 16080 }, { "epoch": 5.3669112741827885, "loss": 0.3336, "step": 16090 }, { "epoch": 5.3669112741827885, "grad_norm": 2.1278111934661865, "step": 16090 }, { "epoch": 5.3669112741827885, "learning_rate": 7.745736340622453e-05, "step": 16090 }, { "epoch": 5.3669112741827885, "loss": 0.31296786665916443, "step": 16090 }, { "ce_loss": 0.04599738121032715, "epoch": 5.3669112741827885, "step": 16090 }, { "distill_loss": 0.13405273854732513, "epoch": 5.3669112741827885, "step": 16090 }, { "epoch": 5.3669112741827885, "ref_ce_loss": 0.08967866003513336, "step": 16090 }, { "epoch": 5.3669112741827885, "loss": 0.37006106972694397, "step": 16090 }, { "ce_loss": 0.09404456615447998, "epoch": 5.3669112741827885, "step": 16090 }, { "distill_loss": 0.14915797114372253, "epoch": 5.3669112741827885, "step": 16090 }, { "epoch": 5.3669112741827885, "ref_ce_loss": 0.09474969655275345, "step": 16090 }, { "epoch": 5.3669112741827885, "loss": 0.38359758257865906, "step": 16090 }, { "ce_loss": 0.1558644324541092, "epoch": 5.3669112741827885, "step": 16090 }, { "distill_loss": 0.12019581347703934, "epoch": 5.3669112741827885, "step": 16090 }, { "epoch": 5.3669112741827885, "ref_ce_loss": 0.08622688800096512, "step": 16090 }, { "epoch": 5.3669112741827885, "loss": 0.3088744878768921, "step": 16090 }, { "ce_loss": 0.06967803835868835, "epoch": 5.3669112741827885, "step": 16090 }, { "distill_loss": 0.10145791620016098, "epoch": 5.3669112741827885, "step": 16090 }, { "epoch": 5.3669112741827885, "ref_ce_loss": 0.07597280293703079, "step": 16090 }, { "epoch": 5.370246831220814, "loss": 0.3361, "step": 16100 }, { "epoch": 5.370246831220814, "grad_norm": 2.2339279651641846, "step": 16100 }, { "epoch": 5.370246831220814, "learning_rate": 7.728013176130009e-05, "step": 16100 }, { "epoch": 5.370246831220814, "loss": 0.20128178596496582, "step": 16100 }, { "ce_loss": 0.04678283631801605, "epoch": 5.370246831220814, "step": 16100 }, { "distill_loss": 0.08286233991384506, "epoch": 5.370246831220814, "step": 16100 }, { "epoch": 5.370246831220814, "ref_ce_loss": 0.05337541177868843, "step": 16100 }, { "epoch": 5.370246831220814, "loss": 0.3456668555736542, "step": 16100 }, { "ce_loss": 0.044075287878513336, "epoch": 5.370246831220814, "step": 16100 }, { "distill_loss": 0.10809840261936188, "epoch": 5.370246831220814, "step": 16100 }, { "epoch": 5.370246831220814, "ref_ce_loss": 0.05553989112377167, "step": 16100 }, { "epoch": 5.370246831220814, "loss": 0.2573688328266144, "step": 16100 }, { "ce_loss": 0.06272100657224655, "epoch": 5.370246831220814, "step": 16100 }, { "distill_loss": 0.11115021258592606, "epoch": 5.370246831220814, "step": 16100 }, { "epoch": 5.370246831220814, "ref_ce_loss": 0.06553357094526291, "step": 16100 }, { "epoch": 5.370246831220814, "loss": 0.315351665019989, "step": 16100 }, { "ce_loss": 0.06446407735347748, "epoch": 5.370246831220814, "step": 16100 }, { "distill_loss": 0.10399211943149567, "epoch": 5.370246831220814, "step": 16100 }, { "epoch": 5.370246831220814, "ref_ce_loss": 0.05777052417397499, "step": 16100 }, { "epoch": 5.373582388258839, "loss": 0.3533, "step": 16110 }, { "epoch": 5.373582388258839, "grad_norm": 4.180839538574219, "step": 16110 }, { "epoch": 5.373582388258839, "learning_rate": 7.710303272872974e-05, "step": 16110 }, { "epoch": 5.373582388258839, "loss": 0.3932115435600281, "step": 16110 }, { "ce_loss": 0.11435271799564362, "epoch": 5.373582388258839, "step": 16110 }, { "distill_loss": 0.14050374925136566, "epoch": 5.373582388258839, "step": 16110 }, { "epoch": 5.373582388258839, "ref_ce_loss": 0.07154138386249542, "step": 16110 }, { "epoch": 5.373582388258839, "loss": 0.6406397223472595, "step": 16110 }, { "ce_loss": 0.09394969046115875, "epoch": 5.373582388258839, "step": 16110 }, { "distill_loss": 0.13561400771141052, "epoch": 5.373582388258839, "step": 16110 }, { "epoch": 5.373582388258839, "ref_ce_loss": 0.06444769352674484, "step": 16110 }, { "epoch": 5.373582388258839, "loss": 0.506658136844635, "step": 16110 }, { "ce_loss": 0.1610911637544632, "epoch": 5.373582388258839, "step": 16110 }, { "distill_loss": 0.11807151138782501, "epoch": 5.373582388258839, "step": 16110 }, { "epoch": 5.373582388258839, "ref_ce_loss": 0.09151536226272583, "step": 16110 }, { "epoch": 5.373582388258839, "loss": 0.3110598623752594, "step": 16110 }, { "ce_loss": 0.08785425871610641, "epoch": 5.373582388258839, "step": 16110 }, { "distill_loss": 0.09481608867645264, "epoch": 5.373582388258839, "step": 16110 }, { "epoch": 5.373582388258839, "ref_ce_loss": 0.07922735810279846, "step": 16110 }, { "epoch": 5.3769179452968645, "loss": 0.3283, "step": 16120 }, { "epoch": 5.3769179452968645, "grad_norm": 2.140110969543457, "step": 16120 }, { "epoch": 5.3769179452968645, "learning_rate": 7.69260666314721e-05, "step": 16120 }, { "epoch": 5.3769179452968645, "loss": 0.4330398738384247, "step": 16120 }, { "ce_loss": 0.13992437720298767, "epoch": 5.3769179452968645, "step": 16120 }, { "distill_loss": 0.1396828144788742, "epoch": 5.3769179452968645, "step": 16120 }, { "epoch": 5.3769179452968645, "ref_ce_loss": 0.08856870234012604, "step": 16120 }, { "epoch": 5.3769179452968645, "loss": 0.2571561336517334, "step": 16120 }, { "ce_loss": 0.04879157990217209, "epoch": 5.3769179452968645, "step": 16120 }, { "distill_loss": 0.11554741859436035, "epoch": 5.3769179452968645, "step": 16120 }, { "epoch": 5.3769179452968645, "ref_ce_loss": 0.05843444913625717, "step": 16120 }, { "epoch": 5.3769179452968645, "loss": 0.2118985950946808, "step": 16120 }, { "ce_loss": 0.04279850423336029, "epoch": 5.3769179452968645, "step": 16120 }, { "distill_loss": 0.09749726951122284, "epoch": 5.3769179452968645, "step": 16120 }, { "epoch": 5.3769179452968645, "ref_ce_loss": 0.07149596512317657, "step": 16120 }, { "epoch": 5.3769179452968645, "loss": 0.41140979528427124, "step": 16120 }, { "ce_loss": 0.11395663022994995, "epoch": 5.3769179452968645, "step": 16120 }, { "distill_loss": 0.11845146119594574, "epoch": 5.3769179452968645, "step": 16120 }, { "epoch": 5.3769179452968645, "ref_ce_loss": 0.05464179068803787, "step": 16120 }, { "epoch": 5.38025350233489, "loss": 0.2989, "step": 16130 }, { "epoch": 5.38025350233489, "grad_norm": 1.9423120021820068, "step": 16130 }, { "epoch": 5.38025350233489, "learning_rate": 7.67492337922437e-05, "step": 16130 }, { "epoch": 5.38025350233489, "loss": 0.2500401437282562, "step": 16130 }, { "ce_loss": 0.01709035225212574, "epoch": 5.38025350233489, "step": 16130 }, { "distill_loss": 0.09783089905977249, "epoch": 5.38025350233489, "step": 16130 }, { "epoch": 5.38025350233489, "ref_ce_loss": 0.058439701795578, "step": 16130 }, { "epoch": 5.38025350233489, "loss": 0.30501842498779297, "step": 16130 }, { "ce_loss": 0.09198647737503052, "epoch": 5.38025350233489, "step": 16130 }, { "distill_loss": 0.09595755487680435, "epoch": 5.38025350233489, "step": 16130 }, { "epoch": 5.38025350233489, "ref_ce_loss": 0.08432227373123169, "step": 16130 }, { "epoch": 5.38025350233489, "loss": 0.40725916624069214, "step": 16130 }, { "ce_loss": 0.07747211307287216, "epoch": 5.38025350233489, "step": 16130 }, { "distill_loss": 0.08231954276561737, "epoch": 5.38025350233489, "step": 16130 }, { "epoch": 5.38025350233489, "ref_ce_loss": 0.0659797340631485, "step": 16130 }, { "epoch": 5.38025350233489, "loss": 0.7572364807128906, "step": 16130 }, { "ce_loss": 0.08224272727966309, "epoch": 5.38025350233489, "step": 16130 }, { "distill_loss": 0.13095402717590332, "epoch": 5.38025350233489, "step": 16130 }, { "epoch": 5.38025350233489, "ref_ce_loss": 0.08251205086708069, "step": 16130 }, { "epoch": 5.383589059372915, "loss": 0.3603, "step": 16140 }, { "epoch": 5.383589059372915, "grad_norm": 1.8171334266662598, "step": 16140 }, { "epoch": 5.383589059372915, "learning_rate": 7.657253453351765e-05, "step": 16140 }, { "epoch": 5.383589059372915, "loss": 0.30455857515335083, "step": 16140 }, { "ce_loss": 0.10068642348051071, "epoch": 5.383589059372915, "step": 16140 }, { "distill_loss": 0.11382429301738739, "epoch": 5.383589059372915, "step": 16140 }, { "epoch": 5.383589059372915, "ref_ce_loss": 0.06328225135803223, "step": 16140 }, { "epoch": 5.383589059372915, "loss": 0.38374200463294983, "step": 16140 }, { "ce_loss": 0.11451509594917297, "epoch": 5.383589059372915, "step": 16140 }, { "distill_loss": 0.12740738689899445, "epoch": 5.383589059372915, "step": 16140 }, { "epoch": 5.383589059372915, "ref_ce_loss": 0.1416238397359848, "step": 16140 }, { "epoch": 5.383589059372915, "loss": 0.38903430104255676, "step": 16140 }, { "ce_loss": 0.10609883815050125, "epoch": 5.383589059372915, "step": 16140 }, { "distill_loss": 0.1371476799249649, "epoch": 5.383589059372915, "step": 16140 }, { "epoch": 5.383589059372915, "ref_ce_loss": 0.07509582489728928, "step": 16140 }, { "epoch": 5.383589059372915, "loss": 0.30147454142570496, "step": 16140 }, { "ce_loss": 0.049935102462768555, "epoch": 5.383589059372915, "step": 16140 }, { "distill_loss": 0.09309284389019012, "epoch": 5.383589059372915, "step": 16140 }, { "epoch": 5.383589059372915, "ref_ce_loss": 0.06776655465364456, "step": 16140 }, { "epoch": 5.386924616410941, "loss": 0.3575, "step": 16150 }, { "epoch": 5.386924616410941, "grad_norm": 3.655974864959717, "step": 16150 }, { "epoch": 5.386924616410941, "learning_rate": 7.639596917752391e-05, "step": 16150 }, { "epoch": 5.386924616410941, "loss": 0.28991663455963135, "step": 16150 }, { "ce_loss": 0.04967931658029556, "epoch": 5.386924616410941, "step": 16150 }, { "distill_loss": 0.13226084411144257, "epoch": 5.386924616410941, "step": 16150 }, { "epoch": 5.386924616410941, "ref_ce_loss": 0.07171325385570526, "step": 16150 }, { "epoch": 5.386924616410941, "loss": 0.4191509485244751, "step": 16150 }, { "ce_loss": 0.07444164901971817, "epoch": 5.386924616410941, "step": 16150 }, { "distill_loss": 0.10191843658685684, "epoch": 5.386924616410941, "step": 16150 }, { "epoch": 5.386924616410941, "ref_ce_loss": 0.09404486417770386, "step": 16150 }, { "epoch": 5.386924616410941, "loss": 0.2821800708770752, "step": 16150 }, { "ce_loss": 0.06736411899328232, "epoch": 5.386924616410941, "step": 16150 }, { "distill_loss": 0.10012883692979813, "epoch": 5.386924616410941, "step": 16150 }, { "epoch": 5.386924616410941, "ref_ce_loss": 0.06843230873346329, "step": 16150 }, { "epoch": 5.386924616410941, "loss": 0.3395196795463562, "step": 16150 }, { "ce_loss": 0.07241164892911911, "epoch": 5.386924616410941, "step": 16150 }, { "distill_loss": 0.11182572692632675, "epoch": 5.386924616410941, "step": 16150 }, { "epoch": 5.386924616410941, "ref_ce_loss": 0.06843020766973495, "step": 16150 }, { "epoch": 5.390260173448966, "loss": 0.3259, "step": 16160 }, { "epoch": 5.390260173448966, "grad_norm": 2.4135913848876953, "step": 16160 }, { "epoch": 5.390260173448966, "learning_rate": 7.621953804624801e-05, "step": 16160 }, { "epoch": 5.390260173448966, "loss": 0.38991570472717285, "step": 16160 }, { "ce_loss": 0.11925956606864929, "epoch": 5.390260173448966, "step": 16160 }, { "distill_loss": 0.11014068126678467, "epoch": 5.390260173448966, "step": 16160 }, { "epoch": 5.390260173448966, "ref_ce_loss": 0.060893464833498, "step": 16160 }, { "epoch": 5.390260173448966, "loss": 0.23624181747436523, "step": 16160 }, { "ce_loss": 0.045740626752376556, "epoch": 5.390260173448966, "step": 16160 }, { "distill_loss": 0.09637036919593811, "epoch": 5.390260173448966, "step": 16160 }, { "epoch": 5.390260173448966, "ref_ce_loss": 0.07027741521596909, "step": 16160 }, { "epoch": 5.390260173448966, "loss": 0.2636074721813202, "step": 16160 }, { "ce_loss": 0.03585965558886528, "epoch": 5.390260173448966, "step": 16160 }, { "distill_loss": 0.08669553697109222, "epoch": 5.390260173448966, "step": 16160 }, { "epoch": 5.390260173448966, "ref_ce_loss": 0.05847055837512016, "step": 16160 }, { "epoch": 5.390260173448966, "loss": 0.6537226438522339, "step": 16160 }, { "ce_loss": 0.08080638200044632, "epoch": 5.390260173448966, "step": 16160 }, { "distill_loss": 0.1580420434474945, "epoch": 5.390260173448966, "step": 16160 }, { "epoch": 5.390260173448966, "ref_ce_loss": 0.0891571044921875, "step": 16160 }, { "epoch": 5.393595730486991, "loss": 0.3647, "step": 16170 }, { "epoch": 5.393595730486991, "grad_norm": 1.73436439037323, "step": 16170 }, { "epoch": 5.393595730486991, "learning_rate": 7.604324146143065e-05, "step": 16170 }, { "epoch": 5.393595730486991, "loss": 0.3857395648956299, "step": 16170 }, { "ce_loss": 0.040800731629133224, "epoch": 5.393595730486991, "step": 16170 }, { "distill_loss": 0.08445484936237335, "epoch": 5.393595730486991, "step": 16170 }, { "epoch": 5.393595730486991, "ref_ce_loss": 0.07349581271409988, "step": 16170 }, { "epoch": 5.393595730486991, "loss": 0.36219871044158936, "step": 16170 }, { "ce_loss": 0.09910664707422256, "epoch": 5.393595730486991, "step": 16170 }, { "distill_loss": 0.13986824452877045, "epoch": 5.393595730486991, "step": 16170 }, { "epoch": 5.393595730486991, "ref_ce_loss": 0.08963736146688461, "step": 16170 }, { "epoch": 5.393595730486991, "loss": 0.3613256812095642, "step": 16170 }, { "ce_loss": 0.12792716920375824, "epoch": 5.393595730486991, "step": 16170 }, { "distill_loss": 0.14017927646636963, "epoch": 5.393595730486991, "step": 16170 }, { "epoch": 5.393595730486991, "ref_ce_loss": 0.09304498136043549, "step": 16170 }, { "epoch": 5.393595730486991, "loss": 0.38659751415252686, "step": 16170 }, { "ce_loss": 0.06034409999847412, "epoch": 5.393595730486991, "step": 16170 }, { "distill_loss": 0.0787641704082489, "epoch": 5.393595730486991, "step": 16170 }, { "epoch": 5.393595730486991, "ref_ce_loss": 0.08379752933979034, "step": 16170 }, { "epoch": 5.396931287525017, "loss": 0.3448, "step": 16180 }, { "epoch": 5.396931287525017, "grad_norm": 3.2747390270233154, "step": 16180 }, { "epoch": 5.396931287525017, "learning_rate": 7.586707974456736e-05, "step": 16180 }, { "epoch": 5.396931287525017, "loss": 0.35655203461647034, "step": 16180 }, { "ce_loss": 0.124222531914711, "epoch": 5.396931287525017, "step": 16180 }, { "distill_loss": 0.1000693291425705, "epoch": 5.396931287525017, "step": 16180 }, { "epoch": 5.396931287525017, "ref_ce_loss": 0.10748447477817535, "step": 16180 }, { "epoch": 5.396931287525017, "loss": 0.4372347295284271, "step": 16180 }, { "ce_loss": 0.04162885248661041, "epoch": 5.396931287525017, "step": 16180 }, { "distill_loss": 0.09314451366662979, "epoch": 5.396931287525017, "step": 16180 }, { "epoch": 5.396931287525017, "ref_ce_loss": 0.09335581213235855, "step": 16180 }, { "epoch": 5.396931287525017, "loss": 0.241681307554245, "step": 16180 }, { "ce_loss": 0.03342455253005028, "epoch": 5.396931287525017, "step": 16180 }, { "distill_loss": 0.08295845240354538, "epoch": 5.396931287525017, "step": 16180 }, { "epoch": 5.396931287525017, "ref_ce_loss": 0.07485225051641464, "step": 16180 }, { "epoch": 5.396931287525017, "loss": 0.31029412150382996, "step": 16180 }, { "ce_loss": 0.09854529798030853, "epoch": 5.396931287525017, "step": 16180 }, { "distill_loss": 0.11103243380784988, "epoch": 5.396931287525017, "step": 16180 }, { "epoch": 5.396931287525017, "ref_ce_loss": 0.06289716809988022, "step": 16180 }, { "epoch": 5.400266844563042, "loss": 0.3405, "step": 16190 }, { "epoch": 5.400266844563042, "grad_norm": 4.361851215362549, "step": 16190 }, { "epoch": 5.400266844563042, "learning_rate": 7.569105321690752e-05, "step": 16190 }, { "epoch": 5.400266844563042, "loss": 0.2974907457828522, "step": 16190 }, { "ce_loss": 0.039553042501211166, "epoch": 5.400266844563042, "step": 16190 }, { "distill_loss": 0.12818855047225952, "epoch": 5.400266844563042, "step": 16190 }, { "epoch": 5.400266844563042, "ref_ce_loss": 0.0701039582490921, "step": 16190 }, { "epoch": 5.400266844563042, "loss": 0.21528084576129913, "step": 16190 }, { "ce_loss": 0.06046629697084427, "epoch": 5.400266844563042, "step": 16190 }, { "distill_loss": 0.08600938320159912, "epoch": 5.400266844563042, "step": 16190 }, { "epoch": 5.400266844563042, "ref_ce_loss": 0.05630891025066376, "step": 16190 }, { "epoch": 5.400266844563042, "loss": 0.2935210168361664, "step": 16190 }, { "ce_loss": 0.09160295873880386, "epoch": 5.400266844563042, "step": 16190 }, { "distill_loss": 0.10282592475414276, "epoch": 5.400266844563042, "step": 16190 }, { "epoch": 5.400266844563042, "ref_ce_loss": 0.0734093189239502, "step": 16190 }, { "epoch": 5.400266844563042, "loss": 0.24886572360992432, "step": 16190 }, { "ce_loss": 0.03738119453191757, "epoch": 5.400266844563042, "step": 16190 }, { "distill_loss": 0.10953368246555328, "epoch": 5.400266844563042, "step": 16190 }, { "epoch": 5.400266844563042, "ref_ce_loss": 0.04157862067222595, "step": 16190 }, { "epoch": 5.403602401601067, "loss": 0.3089, "step": 16200 }, { "epoch": 5.403602401601067, "grad_norm": 2.205479621887207, "step": 16200 }, { "epoch": 5.403602401601067, "learning_rate": 7.551516219945406e-05, "step": 16200 }, { "epoch": 5.403602401601067, "loss": 0.2098427563905716, "step": 16200 }, { "ce_loss": 0.030090736225247383, "epoch": 5.403602401601067, "step": 16200 }, { "distill_loss": 0.09558893740177155, "epoch": 5.403602401601067, "step": 16200 }, { "epoch": 5.403602401601067, "ref_ce_loss": 0.055292759090662, "step": 16200 }, { "epoch": 5.403602401601067, "loss": 0.3873053193092346, "step": 16200 }, { "ce_loss": 0.06212102994322777, "epoch": 5.403602401601067, "step": 16200 }, { "distill_loss": 0.0861755907535553, "epoch": 5.403602401601067, "step": 16200 }, { "epoch": 5.403602401601067, "ref_ce_loss": 0.12519577145576477, "step": 16200 }, { "epoch": 5.403602401601067, "loss": 0.317695289850235, "step": 16200 }, { "ce_loss": 0.06708138436079025, "epoch": 5.403602401601067, "step": 16200 }, { "distill_loss": 0.11926654726266861, "epoch": 5.403602401601067, "step": 16200 }, { "epoch": 5.403602401601067, "ref_ce_loss": 0.10601387917995453, "step": 16200 }, { "epoch": 5.403602401601067, "loss": 0.39536064863204956, "step": 16200 }, { "ce_loss": 0.13016223907470703, "epoch": 5.403602401601067, "step": 16200 }, { "distill_loss": 0.09881344437599182, "epoch": 5.403602401601067, "step": 16200 }, { "epoch": 5.403602401601067, "ref_ce_loss": 0.08258476853370667, "step": 16200 }, { "epoch": 5.406937958639093, "loss": 0.3505, "step": 16210 }, { "epoch": 5.406937958639093, "grad_norm": 3.33132004737854, "step": 16210 }, { "epoch": 5.406937958639093, "learning_rate": 7.533940701296298e-05, "step": 16210 }, { "epoch": 5.406937958639093, "loss": 0.507758378982544, "step": 16210 }, { "ce_loss": 0.11343388259410858, "epoch": 5.406937958639093, "step": 16210 }, { "distill_loss": 0.13680988550186157, "epoch": 5.406937958639093, "step": 16210 }, { "epoch": 5.406937958639093, "ref_ce_loss": 0.07887157797813416, "step": 16210 }, { "epoch": 5.406937958639093, "loss": 0.20069199800491333, "step": 16210 }, { "ce_loss": 0.02618763968348503, "epoch": 5.406937958639093, "step": 16210 }, { "distill_loss": 0.08316341787576675, "epoch": 5.406937958639093, "step": 16210 }, { "epoch": 5.406937958639093, "ref_ce_loss": 0.06166895106434822, "step": 16210 }, { "epoch": 5.406937958639093, "loss": 0.343496710062027, "step": 16210 }, { "ce_loss": 0.07581013441085815, "epoch": 5.406937958639093, "step": 16210 }, { "distill_loss": 0.11562657356262207, "epoch": 5.406937958639093, "step": 16210 }, { "epoch": 5.406937958639093, "ref_ce_loss": 0.07594557106494904, "step": 16210 }, { "epoch": 5.406937958639093, "loss": 0.3016616404056549, "step": 16210 }, { "ce_loss": 0.10311080515384674, "epoch": 5.406937958639093, "step": 16210 }, { "distill_loss": 0.09994910657405853, "epoch": 5.406937958639093, "step": 16210 }, { "epoch": 5.406937958639093, "ref_ce_loss": 0.07250438630580902, "step": 16210 }, { "epoch": 5.410273515677118, "loss": 0.3128, "step": 16220 }, { "epoch": 5.410273515677118, "grad_norm": 2.0081982612609863, "step": 16220 }, { "epoch": 5.410273515677118, "learning_rate": 7.516378797794228e-05, "step": 16220 }, { "epoch": 5.410273515677118, "loss": 0.341609925031662, "step": 16220 }, { "ce_loss": 0.06672375649213791, "epoch": 5.410273515677118, "step": 16220 }, { "distill_loss": 0.10152589529752731, "epoch": 5.410273515677118, "step": 16220 }, { "epoch": 5.410273515677118, "ref_ce_loss": 0.1350449025630951, "step": 16220 }, { "epoch": 5.410273515677118, "loss": 0.35093531012535095, "step": 16220 }, { "ce_loss": 0.11491996794939041, "epoch": 5.410273515677118, "step": 16220 }, { "distill_loss": 0.12406434863805771, "epoch": 5.410273515677118, "step": 16220 }, { "epoch": 5.410273515677118, "ref_ce_loss": 0.07030671089887619, "step": 16220 }, { "epoch": 5.410273515677118, "loss": 0.21612723171710968, "step": 16220 }, { "ce_loss": 0.0429653525352478, "epoch": 5.410273515677118, "step": 16220 }, { "distill_loss": 0.107479527592659, "epoch": 5.410273515677118, "step": 16220 }, { "epoch": 5.410273515677118, "ref_ce_loss": 0.0655955970287323, "step": 16220 }, { "epoch": 5.410273515677118, "loss": 0.26148879528045654, "step": 16220 }, { "ce_loss": 0.00934822577983141, "epoch": 5.410273515677118, "step": 16220 }, { "distill_loss": 0.08704058080911636, "epoch": 5.410273515677118, "step": 16220 }, { "epoch": 5.410273515677118, "ref_ce_loss": 0.059622831642627716, "step": 16220 }, { "epoch": 5.413609072715143, "loss": 0.3393, "step": 16230 }, { "epoch": 5.413609072715143, "grad_norm": 3.420259475708008, "step": 16230 }, { "epoch": 5.413609072715143, "learning_rate": 7.49883054146518e-05, "step": 16230 }, { "epoch": 5.413609072715143, "loss": 0.26101815700531006, "step": 16230 }, { "ce_loss": 0.03194340318441391, "epoch": 5.413609072715143, "step": 16230 }, { "distill_loss": 0.07808404415845871, "epoch": 5.413609072715143, "step": 16230 }, { "epoch": 5.413609072715143, "ref_ce_loss": 0.06558459252119064, "step": 16230 }, { "epoch": 5.413609072715143, "loss": 0.4477325677871704, "step": 16230 }, { "ce_loss": 0.053452376276254654, "epoch": 5.413609072715143, "step": 16230 }, { "distill_loss": 0.09510692209005356, "epoch": 5.413609072715143, "step": 16230 }, { "epoch": 5.413609072715143, "ref_ce_loss": 0.0880439504981041, "step": 16230 }, { "epoch": 5.413609072715143, "loss": 0.29558783769607544, "step": 16230 }, { "ce_loss": 0.10143938660621643, "epoch": 5.413609072715143, "step": 16230 }, { "distill_loss": 0.12191247940063477, "epoch": 5.413609072715143, "step": 16230 }, { "epoch": 5.413609072715143, "ref_ce_loss": 0.07203470170497894, "step": 16230 }, { "epoch": 5.413609072715143, "loss": 0.2230197787284851, "step": 16230 }, { "ce_loss": 0.053148508071899414, "epoch": 5.413609072715143, "step": 16230 }, { "distill_loss": 0.10254699736833572, "epoch": 5.413609072715143, "step": 16230 }, { "epoch": 5.413609072715143, "ref_ce_loss": 0.05377613380551338, "step": 16230 }, { "epoch": 5.416944629753169, "loss": 0.3343, "step": 16240 }, { "epoch": 5.416944629753169, "grad_norm": 2.3163440227508545, "step": 16240 }, { "epoch": 5.416944629753169, "learning_rate": 7.481295964310263e-05, "step": 16240 }, { "epoch": 5.416944629753169, "loss": 0.27103978395462036, "step": 16240 }, { "ce_loss": 0.07148025184869766, "epoch": 5.416944629753169, "step": 16240 }, { "distill_loss": 0.10926101356744766, "epoch": 5.416944629753169, "step": 16240 }, { "epoch": 5.416944629753169, "ref_ce_loss": 0.09023928642272949, "step": 16240 }, { "epoch": 5.416944629753169, "loss": 0.30508822202682495, "step": 16240 }, { "ce_loss": 0.0872962698340416, "epoch": 5.416944629753169, "step": 16240 }, { "distill_loss": 0.12328356504440308, "epoch": 5.416944629753169, "step": 16240 }, { "epoch": 5.416944629753169, "ref_ce_loss": 0.06537864357233047, "step": 16240 }, { "epoch": 5.416944629753169, "loss": 0.2479790449142456, "step": 16240 }, { "ce_loss": 0.05155964568257332, "epoch": 5.416944629753169, "step": 16240 }, { "distill_loss": 0.09833475947380066, "epoch": 5.416944629753169, "step": 16240 }, { "epoch": 5.416944629753169, "ref_ce_loss": 0.059581857174634933, "step": 16240 }, { "epoch": 5.416944629753169, "loss": 0.4399837851524353, "step": 16240 }, { "ce_loss": 0.09402266144752502, "epoch": 5.416944629753169, "step": 16240 }, { "distill_loss": 0.12486834824085236, "epoch": 5.416944629753169, "step": 16240 }, { "epoch": 5.416944629753169, "ref_ce_loss": 0.06681935489177704, "step": 16240 }, { "epoch": 5.420280186791194, "loss": 0.3358, "step": 16250 }, { "epoch": 5.420280186791194, "grad_norm": 2.0072333812713623, "step": 16250 }, { "epoch": 5.420280186791194, "learning_rate": 7.463775098305612e-05, "step": 16250 }, { "epoch": 5.420280186791194, "loss": 0.23055459558963776, "step": 16250 }, { "ce_loss": 0.05385155603289604, "epoch": 5.420280186791194, "step": 16250 }, { "distill_loss": 0.10702510178089142, "epoch": 5.420280186791194, "step": 16250 }, { "epoch": 5.420280186791194, "ref_ce_loss": 0.059084367007017136, "step": 16250 }, { "epoch": 5.420280186791194, "loss": 0.24993188679218292, "step": 16250 }, { "ce_loss": 0.048931531608104706, "epoch": 5.420280186791194, "step": 16250 }, { "distill_loss": 0.10967432707548141, "epoch": 5.420280186791194, "step": 16250 }, { "epoch": 5.420280186791194, "ref_ce_loss": 0.05932708829641342, "step": 16250 }, { "epoch": 5.420280186791194, "loss": 0.5765171647071838, "step": 16250 }, { "ce_loss": 0.12102974206209183, "epoch": 5.420280186791194, "step": 16250 }, { "distill_loss": 0.15688693523406982, "epoch": 5.420280186791194, "step": 16250 }, { "epoch": 5.420280186791194, "ref_ce_loss": 0.11446768790483475, "step": 16250 }, { "epoch": 5.420280186791194, "loss": 0.44742411375045776, "step": 16250 }, { "ce_loss": 0.04978395998477936, "epoch": 5.420280186791194, "step": 16250 }, { "distill_loss": 0.09400615096092224, "epoch": 5.420280186791194, "step": 16250 }, { "epoch": 5.420280186791194, "ref_ce_loss": 0.11323413252830505, "step": 16250 }, { "epoch": 5.423615743829219, "loss": 0.3884, "step": 16260 }, { "epoch": 5.423615743829219, "grad_norm": 2.7448010444641113, "step": 16260 }, { "epoch": 5.423615743829219, "learning_rate": 7.446267975402385e-05, "step": 16260 }, { "epoch": 5.423615743829219, "loss": 0.23236140608787537, "step": 16260 }, { "ce_loss": 0.06890847533941269, "epoch": 5.423615743829219, "step": 16260 }, { "distill_loss": 0.09854884445667267, "epoch": 5.423615743829219, "step": 16260 }, { "epoch": 5.423615743829219, "ref_ce_loss": 0.06464102864265442, "step": 16260 }, { "epoch": 5.423615743829219, "loss": 0.2084226757287979, "step": 16260 }, { "ce_loss": 0.026601558551192284, "epoch": 5.423615743829219, "step": 16260 }, { "distill_loss": 0.09681437164545059, "epoch": 5.423615743829219, "step": 16260 }, { "epoch": 5.423615743829219, "ref_ce_loss": 0.05920984968543053, "step": 16260 }, { "epoch": 5.423615743829219, "loss": 0.4451378583908081, "step": 16260 }, { "ce_loss": 0.13203752040863037, "epoch": 5.423615743829219, "step": 16260 }, { "distill_loss": 0.13270580768585205, "epoch": 5.423615743829219, "step": 16260 }, { "epoch": 5.423615743829219, "ref_ce_loss": 0.12923528254032135, "step": 16260 }, { "epoch": 5.423615743829219, "loss": 0.37428125739097595, "step": 16260 }, { "ce_loss": 0.13738876581192017, "epoch": 5.423615743829219, "step": 16260 }, { "distill_loss": 0.1335502564907074, "epoch": 5.423615743829219, "step": 16260 }, { "epoch": 5.423615743829219, "ref_ce_loss": 0.08215318620204926, "step": 16260 }, { "epoch": 5.426951300867245, "loss": 0.3326, "step": 16270 }, { "epoch": 5.426951300867245, "grad_norm": 1.8034840822219849, "step": 16270 }, { "epoch": 5.426951300867245, "learning_rate": 7.428774627526667e-05, "step": 16270 }, { "epoch": 5.426951300867245, "loss": 0.597827672958374, "step": 16270 }, { "ce_loss": 0.0970778539776802, "epoch": 5.426951300867245, "step": 16270 }, { "distill_loss": 0.09443487972021103, "epoch": 5.426951300867245, "step": 16270 }, { "epoch": 5.426951300867245, "ref_ce_loss": 0.07590417563915253, "step": 16270 }, { "epoch": 5.426951300867245, "loss": 0.3486078381538391, "step": 16270 }, { "ce_loss": 0.08969951421022415, "epoch": 5.426951300867245, "step": 16270 }, { "distill_loss": 0.09644579142332077, "epoch": 5.426951300867245, "step": 16270 }, { "epoch": 5.426951300867245, "ref_ce_loss": 0.10284632444381714, "step": 16270 }, { "epoch": 5.426951300867245, "loss": 0.3421056270599365, "step": 16270 }, { "ce_loss": 0.12405972927808762, "epoch": 5.426951300867245, "step": 16270 }, { "distill_loss": 0.11241130530834198, "epoch": 5.426951300867245, "step": 16270 }, { "epoch": 5.426951300867245, "ref_ce_loss": 0.07050801813602448, "step": 16270 }, { "epoch": 5.426951300867245, "loss": 0.32441556453704834, "step": 16270 }, { "ce_loss": 0.04846134036779404, "epoch": 5.426951300867245, "step": 16270 }, { "distill_loss": 0.08198875188827515, "epoch": 5.426951300867245, "step": 16270 }, { "epoch": 5.426951300867245, "ref_ce_loss": 0.044600147753953934, "step": 16270 }, { "epoch": 5.43028685790527, "loss": 0.337, "step": 16280 }, { "epoch": 5.43028685790527, "grad_norm": 2.3241167068481445, "step": 16280 }, { "epoch": 5.43028685790527, "learning_rate": 7.411295086579422e-05, "step": 16280 }, { "epoch": 5.43028685790527, "loss": 0.372223824262619, "step": 16280 }, { "ce_loss": 0.11412045359611511, "epoch": 5.43028685790527, "step": 16280 }, { "distill_loss": 0.11776827275753021, "epoch": 5.43028685790527, "step": 16280 }, { "epoch": 5.43028685790527, "ref_ce_loss": 0.1010143980383873, "step": 16280 }, { "epoch": 5.43028685790527, "loss": 0.25290191173553467, "step": 16280 }, { "ce_loss": 0.023051615804433823, "epoch": 5.43028685790527, "step": 16280 }, { "distill_loss": 0.07711265236139297, "epoch": 5.43028685790527, "step": 16280 }, { "epoch": 5.43028685790527, "ref_ce_loss": 0.0660116896033287, "step": 16280 }, { "epoch": 5.43028685790527, "loss": 0.2648986876010895, "step": 16280 }, { "ce_loss": 0.0392785519361496, "epoch": 5.43028685790527, "step": 16280 }, { "distill_loss": 0.103148452937603, "epoch": 5.43028685790527, "step": 16280 }, { "epoch": 5.43028685790527, "ref_ce_loss": 0.06433369219303131, "step": 16280 }, { "epoch": 5.43028685790527, "loss": 0.32053670287132263, "step": 16280 }, { "ce_loss": 0.09016153961420059, "epoch": 5.43028685790527, "step": 16280 }, { "distill_loss": 0.13692647218704224, "epoch": 5.43028685790527, "step": 16280 }, { "epoch": 5.43028685790527, "ref_ce_loss": 0.06246296316385269, "step": 16280 }, { "epoch": 5.4336224149432955, "loss": 0.3091, "step": 16290 }, { "epoch": 5.4336224149432955, "grad_norm": 1.8611153364181519, "step": 16290 }, { "epoch": 5.4336224149432955, "learning_rate": 7.393829384436447e-05, "step": 16290 }, { "epoch": 5.4336224149432955, "loss": 0.28557321429252625, "step": 16290 }, { "ce_loss": 0.06013104319572449, "epoch": 5.4336224149432955, "step": 16290 }, { "distill_loss": 0.107640340924263, "epoch": 5.4336224149432955, "step": 16290 }, { "epoch": 5.4336224149432955, "ref_ce_loss": 0.11766494065523148, "step": 16290 }, { "epoch": 5.4336224149432955, "loss": 0.37696021795272827, "step": 16290 }, { "ce_loss": 0.025803005322813988, "epoch": 5.4336224149432955, "step": 16290 }, { "distill_loss": 0.08128707110881805, "epoch": 5.4336224149432955, "step": 16290 }, { "epoch": 5.4336224149432955, "ref_ce_loss": 0.0619768463075161, "step": 16290 }, { "epoch": 5.4336224149432955, "loss": 0.33180344104766846, "step": 16290 }, { "ce_loss": 0.07863081246614456, "epoch": 5.4336224149432955, "step": 16290 }, { "distill_loss": 0.11234287917613983, "epoch": 5.4336224149432955, "step": 16290 }, { "epoch": 5.4336224149432955, "ref_ce_loss": 0.11700798571109772, "step": 16290 }, { "epoch": 5.4336224149432955, "loss": 0.32060012221336365, "step": 16290 }, { "ce_loss": 0.09802191704511642, "epoch": 5.4336224149432955, "step": 16290 }, { "distill_loss": 0.09087618440389633, "epoch": 5.4336224149432955, "step": 16290 }, { "epoch": 5.4336224149432955, "ref_ce_loss": 0.09540796279907227, "step": 16290 }, { "epoch": 5.436957971981321, "loss": 0.3082, "step": 16300 }, { "epoch": 5.436957971981321, "grad_norm": 1.750720739364624, "step": 16300 }, { "epoch": 5.436957971981321, "learning_rate": 7.37637755294828e-05, "step": 16300 }, { "epoch": 5.436957971981321, "loss": 0.31910401582717896, "step": 16300 }, { "ce_loss": 0.02396697923541069, "epoch": 5.436957971981321, "step": 16300 }, { "distill_loss": 0.11216147243976593, "epoch": 5.436957971981321, "step": 16300 }, { "epoch": 5.436957971981321, "ref_ce_loss": 0.08133500814437866, "step": 16300 }, { "epoch": 5.436957971981321, "loss": 0.13243257999420166, "step": 16300 }, { "ce_loss": 0.013941922225058079, "epoch": 5.436957971981321, "step": 16300 }, { "distill_loss": 0.06792638450860977, "epoch": 5.436957971981321, "step": 16300 }, { "epoch": 5.436957971981321, "ref_ce_loss": 0.03205156326293945, "step": 16300 }, { "epoch": 5.436957971981321, "loss": 0.24062950909137726, "step": 16300 }, { "ce_loss": 0.06546594202518463, "epoch": 5.436957971981321, "step": 16300 }, { "distill_loss": 0.09706707298755646, "epoch": 5.436957971981321, "step": 16300 }, { "epoch": 5.436957971981321, "ref_ce_loss": 0.07798836380243301, "step": 16300 }, { "epoch": 5.436957971981321, "loss": 0.3027392029762268, "step": 16300 }, { "ce_loss": 0.07704725116491318, "epoch": 5.436957971981321, "step": 16300 }, { "distill_loss": 0.121660515666008, "epoch": 5.436957971981321, "step": 16300 }, { "epoch": 5.436957971981321, "ref_ce_loss": 0.10396517068147659, "step": 16300 }, { "epoch": 5.440293529019346, "loss": 0.3203, "step": 16310 }, { "epoch": 5.440293529019346, "grad_norm": 3.087531805038452, "step": 16310 }, { "epoch": 5.440293529019346, "learning_rate": 7.358939623940182e-05, "step": 16310 }, { "epoch": 5.440293529019346, "loss": 0.1709238737821579, "step": 16310 }, { "ce_loss": 0.04030200093984604, "epoch": 5.440293529019346, "step": 16310 }, { "distill_loss": 0.08430983126163483, "epoch": 5.440293529019346, "step": 16310 }, { "epoch": 5.440293529019346, "ref_ce_loss": 0.04611937701702118, "step": 16310 }, { "epoch": 5.440293529019346, "loss": 1.0106971263885498, "step": 16310 }, { "ce_loss": 0.13990041613578796, "epoch": 5.440293529019346, "step": 16310 }, { "distill_loss": 0.1668081283569336, "epoch": 5.440293529019346, "step": 16310 }, { "epoch": 5.440293529019346, "ref_ce_loss": 0.10148528963327408, "step": 16310 }, { "epoch": 5.440293529019346, "loss": 0.1992446333169937, "step": 16310 }, { "ce_loss": 0.04668141156435013, "epoch": 5.440293529019346, "step": 16310 }, { "distill_loss": 0.10151037573814392, "epoch": 5.440293529019346, "step": 16310 }, { "epoch": 5.440293529019346, "ref_ce_loss": 0.05095856264233589, "step": 16310 }, { "epoch": 5.440293529019346, "loss": 0.2627887427806854, "step": 16310 }, { "ce_loss": 0.08537213504314423, "epoch": 5.440293529019346, "step": 16310 }, { "distill_loss": 0.10909847170114517, "epoch": 5.440293529019346, "step": 16310 }, { "epoch": 5.440293529019346, "ref_ce_loss": 0.0681203156709671, "step": 16310 }, { "epoch": 5.4436290860573715, "loss": 0.3939, "step": 16320 }, { "epoch": 5.4436290860573715, "grad_norm": 2.990647315979004, "step": 16320 }, { "epoch": 5.4436290860573715, "learning_rate": 7.341515629212056e-05, "step": 16320 }, { "epoch": 5.4436290860573715, "loss": 0.4164668321609497, "step": 16320 }, { "ce_loss": 0.11799903959035873, "epoch": 5.4436290860573715, "step": 16320 }, { "distill_loss": 0.12904855608940125, "epoch": 5.4436290860573715, "step": 16320 }, { "epoch": 5.4436290860573715, "ref_ce_loss": 0.12566408514976501, "step": 16320 }, { "epoch": 5.4436290860573715, "loss": 0.31384703516960144, "step": 16320 }, { "ce_loss": 0.0404597632586956, "epoch": 5.4436290860573715, "step": 16320 }, { "distill_loss": 0.12027452886104584, "epoch": 5.4436290860573715, "step": 16320 }, { "epoch": 5.4436290860573715, "ref_ce_loss": 0.08330576866865158, "step": 16320 }, { "epoch": 5.4436290860573715, "loss": 0.2575395107269287, "step": 16320 }, { "ce_loss": 0.01610216125845909, "epoch": 5.4436290860573715, "step": 16320 }, { "distill_loss": 0.12123523652553558, "epoch": 5.4436290860573715, "step": 16320 }, { "epoch": 5.4436290860573715, "ref_ce_loss": 0.05706319212913513, "step": 16320 }, { "epoch": 5.4436290860573715, "loss": 0.20323054492473602, "step": 16320 }, { "ce_loss": 0.021865028887987137, "epoch": 5.4436290860573715, "step": 16320 }, { "distill_loss": 0.09164980798959732, "epoch": 5.4436290860573715, "step": 16320 }, { "epoch": 5.4436290860573715, "ref_ce_loss": 0.06141813471913338, "step": 16320 }, { "epoch": 5.446964643095397, "loss": 0.3603, "step": 16330 }, { "epoch": 5.446964643095397, "grad_norm": 2.530071496963501, "step": 16330 }, { "epoch": 5.446964643095397, "learning_rate": 7.324105600538398e-05, "step": 16330 }, { "epoch": 5.446964643095397, "loss": 0.3814869523048401, "step": 16330 }, { "ce_loss": 0.07421386986970901, "epoch": 5.446964643095397, "step": 16330 }, { "distill_loss": 0.11971503496170044, "epoch": 5.446964643095397, "step": 16330 }, { "epoch": 5.446964643095397, "ref_ce_loss": 0.087638258934021, "step": 16330 }, { "epoch": 5.446964643095397, "loss": 0.2262338250875473, "step": 16330 }, { "ce_loss": 0.04096594080328941, "epoch": 5.446964643095397, "step": 16330 }, { "distill_loss": 0.09016257524490356, "epoch": 5.446964643095397, "step": 16330 }, { "epoch": 5.446964643095397, "ref_ce_loss": 0.05134720727801323, "step": 16330 }, { "epoch": 5.446964643095397, "loss": 0.2489914894104004, "step": 16330 }, { "ce_loss": 0.08123141527175903, "epoch": 5.446964643095397, "step": 16330 }, { "distill_loss": 0.09971652925014496, "epoch": 5.446964643095397, "step": 16330 }, { "epoch": 5.446964643095397, "ref_ce_loss": 0.04955803602933884, "step": 16330 }, { "epoch": 5.446964643095397, "loss": 0.2052384614944458, "step": 16330 }, { "ce_loss": 0.07025023549795151, "epoch": 5.446964643095397, "step": 16330 }, { "distill_loss": 0.09092271327972412, "epoch": 5.446964643095397, "step": 16330 }, { "epoch": 5.446964643095397, "ref_ce_loss": 0.04397953301668167, "step": 16330 }, { "epoch": 5.450300200133422, "loss": 0.3527, "step": 16340 }, { "epoch": 5.450300200133422, "grad_norm": 3.414726972579956, "step": 16340 }, { "epoch": 5.450300200133422, "learning_rate": 7.306709569668236e-05, "step": 16340 }, { "epoch": 5.450300200133422, "loss": 0.24532371759414673, "step": 16340 }, { "ce_loss": 0.060081496834754944, "epoch": 5.450300200133422, "step": 16340 }, { "distill_loss": 0.09829045087099075, "epoch": 5.450300200133422, "step": 16340 }, { "epoch": 5.450300200133422, "ref_ce_loss": 0.046170350164175034, "step": 16340 }, { "epoch": 5.450300200133422, "loss": 0.3321872055530548, "step": 16340 }, { "ce_loss": 0.08811584860086441, "epoch": 5.450300200133422, "step": 16340 }, { "distill_loss": 0.1313212364912033, "epoch": 5.450300200133422, "step": 16340 }, { "epoch": 5.450300200133422, "ref_ce_loss": 0.08971906453371048, "step": 16340 }, { "epoch": 5.450300200133422, "loss": 0.4201227128505707, "step": 16340 }, { "ce_loss": 0.17822621762752533, "epoch": 5.450300200133422, "step": 16340 }, { "distill_loss": 0.14979350566864014, "epoch": 5.450300200133422, "step": 16340 }, { "epoch": 5.450300200133422, "ref_ce_loss": 0.07139469683170319, "step": 16340 }, { "epoch": 5.450300200133422, "loss": 0.38188281655311584, "step": 16340 }, { "ce_loss": 0.08560221642255783, "epoch": 5.450300200133422, "step": 16340 }, { "distill_loss": 0.1589680016040802, "epoch": 5.450300200133422, "step": 16340 }, { "epoch": 5.450300200133422, "ref_ce_loss": 0.11068110167980194, "step": 16340 }, { "epoch": 5.4536357571714476, "loss": 0.3255, "step": 16350 }, { "epoch": 5.4536357571714476, "grad_norm": 1.9104830026626587, "step": 16350 }, { "epoch": 5.4536357571714476, "learning_rate": 7.289327568325061e-05, "step": 16350 }, { "epoch": 5.4536357571714476, "loss": 0.2057226002216339, "step": 16350 }, { "ce_loss": 0.013312791474163532, "epoch": 5.4536357571714476, "step": 16350 }, { "distill_loss": 0.10898993909358978, "epoch": 5.4536357571714476, "step": 16350 }, { "epoch": 5.4536357571714476, "ref_ce_loss": 0.06706269085407257, "step": 16350 }, { "epoch": 5.4536357571714476, "loss": 0.3488602936267853, "step": 16350 }, { "ce_loss": 0.1517920345067978, "epoch": 5.4536357571714476, "step": 16350 }, { "distill_loss": 0.12502455711364746, "epoch": 5.4536357571714476, "step": 16350 }, { "epoch": 5.4536357571714476, "ref_ce_loss": 0.07182005792856216, "step": 16350 }, { "epoch": 5.4536357571714476, "loss": 0.38063332438468933, "step": 16350 }, { "ce_loss": 0.0700574442744255, "epoch": 5.4536357571714476, "step": 16350 }, { "distill_loss": 0.13038572669029236, "epoch": 5.4536357571714476, "step": 16350 }, { "epoch": 5.4536357571714476, "ref_ce_loss": 0.07677096873521805, "step": 16350 }, { "epoch": 5.4536357571714476, "loss": 0.17652976512908936, "step": 16350 }, { "ce_loss": 0.01483121793717146, "epoch": 5.4536357571714476, "step": 16350 }, { "distill_loss": 0.0829794704914093, "epoch": 5.4536357571714476, "step": 16350 }, { "epoch": 5.4536357571714476, "ref_ce_loss": 0.058518633246421814, "step": 16350 }, { "epoch": 5.456971314209473, "loss": 0.3239, "step": 16360 }, { "epoch": 5.456971314209473, "grad_norm": 1.9413048028945923, "step": 16360 }, { "epoch": 5.456971314209473, "learning_rate": 7.271959628206786e-05, "step": 16360 }, { "epoch": 5.456971314209473, "loss": 0.20399023592472076, "step": 16360 }, { "ce_loss": 0.026615489274263382, "epoch": 5.456971314209473, "step": 16360 }, { "distill_loss": 0.0943148136138916, "epoch": 5.456971314209473, "step": 16360 }, { "epoch": 5.456971314209473, "ref_ce_loss": 0.05922786518931389, "step": 16360 }, { "epoch": 5.456971314209473, "loss": 0.258375346660614, "step": 16360 }, { "ce_loss": 0.06289554387331009, "epoch": 5.456971314209473, "step": 16360 }, { "distill_loss": 0.1057371124625206, "epoch": 5.456971314209473, "step": 16360 }, { "epoch": 5.456971314209473, "ref_ce_loss": 0.05854816362261772, "step": 16360 }, { "epoch": 5.456971314209473, "loss": 0.2847128212451935, "step": 16360 }, { "ce_loss": 0.07973343133926392, "epoch": 5.456971314209473, "step": 16360 }, { "distill_loss": 0.10827630013227463, "epoch": 5.456971314209473, "step": 16360 }, { "epoch": 5.456971314209473, "ref_ce_loss": 0.09657879173755646, "step": 16360 }, { "epoch": 5.456971314209473, "loss": 0.5999765992164612, "step": 16360 }, { "ce_loss": 0.12350272387266159, "epoch": 5.456971314209473, "step": 16360 }, { "distill_loss": 0.1628512442111969, "epoch": 5.456971314209473, "step": 16360 }, { "epoch": 5.456971314209473, "ref_ce_loss": 0.10591977089643478, "step": 16360 }, { "epoch": 5.460306871247498, "loss": 0.3582, "step": 16370 }, { "epoch": 5.460306871247498, "grad_norm": 2.5877041816711426, "step": 16370 }, { "epoch": 5.460306871247498, "learning_rate": 7.254605780985687e-05, "step": 16370 }, { "epoch": 5.460306871247498, "loss": 0.31887102127075195, "step": 16370 }, { "ce_loss": 0.11570335924625397, "epoch": 5.460306871247498, "step": 16370 }, { "distill_loss": 0.1031806692481041, "epoch": 5.460306871247498, "step": 16370 }, { "epoch": 5.460306871247498, "ref_ce_loss": 0.0757158175110817, "step": 16370 }, { "epoch": 5.460306871247498, "loss": 0.302977591753006, "step": 16370 }, { "ce_loss": 0.09307489544153214, "epoch": 5.460306871247498, "step": 16370 }, { "distill_loss": 0.12041888386011124, "epoch": 5.460306871247498, "step": 16370 }, { "epoch": 5.460306871247498, "ref_ce_loss": 0.060292188078165054, "step": 16370 }, { "epoch": 5.460306871247498, "loss": 0.38156723976135254, "step": 16370 }, { "ce_loss": 0.05984897539019585, "epoch": 5.460306871247498, "step": 16370 }, { "distill_loss": 0.11454299837350845, "epoch": 5.460306871247498, "step": 16370 }, { "epoch": 5.460306871247498, "ref_ce_loss": 0.07398920506238937, "step": 16370 }, { "epoch": 5.460306871247498, "loss": 0.2736813724040985, "step": 16370 }, { "ce_loss": 0.11653132736682892, "epoch": 5.460306871247498, "step": 16370 }, { "distill_loss": 0.10505042225122452, "epoch": 5.460306871247498, "step": 16370 }, { "epoch": 5.460306871247498, "ref_ce_loss": 0.0519915409386158, "step": 16370 }, { "epoch": 5.463642428285524, "loss": 0.3778, "step": 16380 }, { "epoch": 5.463642428285524, "grad_norm": 3.572533369064331, "step": 16380 }, { "epoch": 5.463642428285524, "learning_rate": 7.237266058308337e-05, "step": 16380 }, { "epoch": 5.463642428285524, "loss": 0.30562925338745117, "step": 16380 }, { "ce_loss": 0.07567353546619415, "epoch": 5.463642428285524, "step": 16380 }, { "distill_loss": 0.11570107936859131, "epoch": 5.463642428285524, "step": 16380 }, { "epoch": 5.463642428285524, "ref_ce_loss": 0.07223859429359436, "step": 16380 }, { "epoch": 5.463642428285524, "loss": 0.2510024905204773, "step": 16380 }, { "ce_loss": 0.06031372770667076, "epoch": 5.463642428285524, "step": 16380 }, { "distill_loss": 0.11193616688251495, "epoch": 5.463642428285524, "step": 16380 }, { "epoch": 5.463642428285524, "ref_ce_loss": 0.05297223851084709, "step": 16380 }, { "epoch": 5.463642428285524, "loss": 0.21337854862213135, "step": 16380 }, { "ce_loss": 0.030846117064356804, "epoch": 5.463642428285524, "step": 16380 }, { "distill_loss": 0.09279409050941467, "epoch": 5.463642428285524, "step": 16380 }, { "epoch": 5.463642428285524, "ref_ce_loss": 0.05850100889801979, "step": 16380 }, { "epoch": 5.463642428285524, "loss": 0.6940407156944275, "step": 16380 }, { "ce_loss": 0.06561942398548126, "epoch": 5.463642428285524, "step": 16380 }, { "distill_loss": 0.08580558747053146, "epoch": 5.463642428285524, "step": 16380 }, { "epoch": 5.463642428285524, "ref_ce_loss": 0.040098849684000015, "step": 16380 }, { "epoch": 5.466977985323549, "loss": 0.3414, "step": 16390 }, { "epoch": 5.466977985323549, "grad_norm": 3.0983810424804688, "step": 16390 }, { "epoch": 5.466977985323549, "learning_rate": 7.21994049179555e-05, "step": 16390 }, { "epoch": 5.466977985323549, "loss": 0.28001460433006287, "step": 16390 }, { "ce_loss": 0.047482822090387344, "epoch": 5.466977985323549, "step": 16390 }, { "distill_loss": 0.10146171599626541, "epoch": 5.466977985323549, "step": 16390 }, { "epoch": 5.466977985323549, "ref_ce_loss": 0.06448373943567276, "step": 16390 }, { "epoch": 5.466977985323549, "loss": 0.3621969223022461, "step": 16390 }, { "ce_loss": 0.1023976057767868, "epoch": 5.466977985323549, "step": 16390 }, { "distill_loss": 0.14057931303977966, "epoch": 5.466977985323549, "step": 16390 }, { "epoch": 5.466977985323549, "ref_ce_loss": 0.08347362279891968, "step": 16390 }, { "epoch": 5.466977985323549, "loss": 0.34943342208862305, "step": 16390 }, { "ce_loss": 0.05505457520484924, "epoch": 5.466977985323549, "step": 16390 }, { "distill_loss": 0.11126148700714111, "epoch": 5.466977985323549, "step": 16390 }, { "epoch": 5.466977985323549, "ref_ce_loss": 0.08371064811944962, "step": 16390 }, { "epoch": 5.466977985323549, "loss": 0.2689695358276367, "step": 16390 }, { "ce_loss": 0.07311775535345078, "epoch": 5.466977985323549, "step": 16390 }, { "distill_loss": 0.09362819790840149, "epoch": 5.466977985323549, "step": 16390 }, { "epoch": 5.466977985323549, "ref_ce_loss": 0.04809510335326195, "step": 16390 }, { "epoch": 5.470313542361574, "loss": 0.3496, "step": 16400 }, { "epoch": 5.470313542361574, "grad_norm": 2.867830753326416, "step": 16400 }, { "epoch": 5.470313542361574, "learning_rate": 7.20262911304232e-05, "step": 16400 }, { "epoch": 5.470313542361574, "loss": 0.3608822524547577, "step": 16400 }, { "ce_loss": 0.06633630394935608, "epoch": 5.470313542361574, "step": 16400 }, { "distill_loss": 0.16216693818569183, "epoch": 5.470313542361574, "step": 16400 }, { "epoch": 5.470313542361574, "ref_ce_loss": 0.10717830061912537, "step": 16400 }, { "epoch": 5.470313542361574, "loss": 0.29830560088157654, "step": 16400 }, { "ce_loss": 0.08958641439676285, "epoch": 5.470313542361574, "step": 16400 }, { "distill_loss": 0.1087767630815506, "epoch": 5.470313542361574, "step": 16400 }, { "epoch": 5.470313542361574, "ref_ce_loss": 0.06897032260894775, "step": 16400 }, { "epoch": 5.470313542361574, "loss": 0.44855913519859314, "step": 16400 }, { "ce_loss": 0.16237951815128326, "epoch": 5.470313542361574, "step": 16400 }, { "distill_loss": 0.13723891973495483, "epoch": 5.470313542361574, "step": 16400 }, { "epoch": 5.470313542361574, "ref_ce_loss": 0.1202978566288948, "step": 16400 }, { "epoch": 5.470313542361574, "loss": 0.2657770812511444, "step": 16400 }, { "ce_loss": 0.043764788657426834, "epoch": 5.470313542361574, "step": 16400 }, { "distill_loss": 0.09835869073867798, "epoch": 5.470313542361574, "step": 16400 }, { "epoch": 5.470313542361574, "ref_ce_loss": 0.04671204835176468, "step": 16400 }, { "epoch": 5.4736490993996, "loss": 0.3583, "step": 16410 }, { "epoch": 5.4736490993996, "grad_norm": 2.4175097942352295, "step": 16410 }, { "epoch": 5.4736490993996, "learning_rate": 7.185331953617774e-05, "step": 16410 }, { "epoch": 5.4736490993996, "loss": 0.3450283408164978, "step": 16410 }, { "ce_loss": 0.09441274404525757, "epoch": 5.4736490993996, "step": 16410 }, { "distill_loss": 0.18175029754638672, "epoch": 5.4736490993996, "step": 16410 }, { "epoch": 5.4736490993996, "ref_ce_loss": 0.0529416985809803, "step": 16410 }, { "epoch": 5.4736490993996, "loss": 0.26326984167099, "step": 16410 }, { "ce_loss": 0.04440765455365181, "epoch": 5.4736490993996, "step": 16410 }, { "distill_loss": 0.12239554524421692, "epoch": 5.4736490993996, "step": 16410 }, { "epoch": 5.4736490993996, "ref_ce_loss": 0.09604854881763458, "step": 16410 }, { "epoch": 5.4736490993996, "loss": 0.2444891482591629, "step": 16410 }, { "ce_loss": 0.04429222643375397, "epoch": 5.4736490993996, "step": 16410 }, { "distill_loss": 0.1255653202533722, "epoch": 5.4736490993996, "step": 16410 }, { "epoch": 5.4736490993996, "ref_ce_loss": 0.07446930557489395, "step": 16410 }, { "epoch": 5.4736490993996, "loss": 0.3559562861919403, "step": 16410 }, { "ce_loss": 0.11063433438539505, "epoch": 5.4736490993996, "step": 16410 }, { "distill_loss": 0.1508217453956604, "epoch": 5.4736490993996, "step": 16410 }, { "epoch": 5.4736490993996, "ref_ce_loss": 0.07082000374794006, "step": 16410 }, { "epoch": 5.476984656437625, "loss": 0.4042, "step": 16420 }, { "epoch": 5.476984656437625, "grad_norm": 2.4823498725891113, "step": 16420 }, { "epoch": 5.476984656437625, "learning_rate": 7.16804904506511e-05, "step": 16420 }, { "epoch": 5.476984656437625, "loss": 0.3447977900505066, "step": 16420 }, { "ce_loss": 0.08455787599086761, "epoch": 5.476984656437625, "step": 16420 }, { "distill_loss": 0.1442045271396637, "epoch": 5.476984656437625, "step": 16420 }, { "epoch": 5.476984656437625, "ref_ce_loss": 0.06625892966985703, "step": 16420 }, { "epoch": 5.476984656437625, "loss": 0.5153990983963013, "step": 16420 }, { "ce_loss": 0.0379730723798275, "epoch": 5.476984656437625, "step": 16420 }, { "distill_loss": 0.13656309247016907, "epoch": 5.476984656437625, "step": 16420 }, { "epoch": 5.476984656437625, "ref_ce_loss": 0.0743139386177063, "step": 16420 }, { "epoch": 5.476984656437625, "loss": 0.5950380563735962, "step": 16420 }, { "ce_loss": 0.14564505219459534, "epoch": 5.476984656437625, "step": 16420 }, { "distill_loss": 0.1893148124217987, "epoch": 5.476984656437625, "step": 16420 }, { "epoch": 5.476984656437625, "ref_ce_loss": 0.09283700585365295, "step": 16420 }, { "epoch": 5.476984656437625, "loss": 0.23283891379833221, "step": 16420 }, { "ce_loss": 0.07694301754236221, "epoch": 5.476984656437625, "step": 16420 }, { "distill_loss": 0.10071247071027756, "epoch": 5.476984656437625, "step": 16420 }, { "epoch": 5.476984656437625, "ref_ce_loss": 0.05511435121297836, "step": 16420 }, { "epoch": 5.48032021347565, "loss": 0.3878, "step": 16430 }, { "epoch": 5.48032021347565, "grad_norm": 5.472512722015381, "step": 16430 }, { "epoch": 5.48032021347565, "learning_rate": 7.150780418901537e-05, "step": 16430 }, { "epoch": 5.48032021347565, "loss": 0.308619886636734, "step": 16430 }, { "ce_loss": 0.06344080716371536, "epoch": 5.48032021347565, "step": 16430 }, { "distill_loss": 0.13703030347824097, "epoch": 5.48032021347565, "step": 16430 }, { "epoch": 5.48032021347565, "ref_ce_loss": 0.08027543872594833, "step": 16430 }, { "epoch": 5.48032021347565, "loss": 0.43004536628723145, "step": 16430 }, { "ce_loss": 0.09899048507213593, "epoch": 5.48032021347565, "step": 16430 }, { "distill_loss": 0.11898726969957352, "epoch": 5.48032021347565, "step": 16430 }, { "epoch": 5.48032021347565, "ref_ce_loss": 0.09035888314247131, "step": 16430 }, { "epoch": 5.48032021347565, "loss": 0.2632295489311218, "step": 16430 }, { "ce_loss": 0.06394274532794952, "epoch": 5.48032021347565, "step": 16430 }, { "distill_loss": 0.1264699250459671, "epoch": 5.48032021347565, "step": 16430 }, { "epoch": 5.48032021347565, "ref_ce_loss": 0.05528374761343002, "step": 16430 }, { "epoch": 5.48032021347565, "loss": 0.2657163143157959, "step": 16430 }, { "ce_loss": 0.04656196013092995, "epoch": 5.48032021347565, "step": 16430 }, { "distill_loss": 0.1124621257185936, "epoch": 5.48032021347565, "step": 16430 }, { "epoch": 5.48032021347565, "ref_ce_loss": 0.054182395339012146, "step": 16430 }, { "epoch": 5.483655770513676, "loss": 0.36, "step": 16440 }, { "epoch": 5.483655770513676, "grad_norm": 2.9450929164886475, "step": 16440 }, { "epoch": 5.483655770513676, "learning_rate": 7.13352610661822e-05, "step": 16440 }, { "epoch": 5.483655770513676, "loss": 0.4299405515193939, "step": 16440 }, { "ce_loss": 0.040338076651096344, "epoch": 5.483655770513676, "step": 16440 }, { "distill_loss": 0.1451733559370041, "epoch": 5.483655770513676, "step": 16440 }, { "epoch": 5.483655770513676, "ref_ce_loss": 0.06834650039672852, "step": 16440 }, { "epoch": 5.483655770513676, "loss": 0.5239918828010559, "step": 16440 }, { "ce_loss": 0.10494782030582428, "epoch": 5.483655770513676, "step": 16440 }, { "distill_loss": 0.13455775380134583, "epoch": 5.483655770513676, "step": 16440 }, { "epoch": 5.483655770513676, "ref_ce_loss": 0.08458808809518814, "step": 16440 }, { "epoch": 5.483655770513676, "loss": 0.31824690103530884, "step": 16440 }, { "ce_loss": 0.07870815694332123, "epoch": 5.483655770513676, "step": 16440 }, { "distill_loss": 0.11882781237363815, "epoch": 5.483655770513676, "step": 16440 }, { "epoch": 5.483655770513676, "ref_ce_loss": 0.05696150287985802, "step": 16440 }, { "epoch": 5.483655770513676, "loss": 0.36554384231567383, "step": 16440 }, { "ce_loss": 0.0737508162856102, "epoch": 5.483655770513676, "step": 16440 }, { "distill_loss": 0.13044539093971252, "epoch": 5.483655770513676, "step": 16440 }, { "epoch": 5.483655770513676, "ref_ce_loss": 0.08092710375785828, "step": 16440 }, { "epoch": 5.486991327551701, "loss": 0.3634, "step": 16450 }, { "epoch": 5.486991327551701, "grad_norm": 1.8394968509674072, "step": 16450 }, { "epoch": 5.486991327551701, "learning_rate": 7.116286139680208e-05, "step": 16450 }, { "epoch": 5.486991327551701, "loss": 0.20372389256954193, "step": 16450 }, { "ce_loss": 0.034144558012485504, "epoch": 5.486991327551701, "step": 16450 }, { "distill_loss": 0.11417104303836823, "epoch": 5.486991327551701, "step": 16450 }, { "epoch": 5.486991327551701, "ref_ce_loss": 0.0553152859210968, "step": 16450 }, { "epoch": 5.486991327551701, "loss": 0.2984168827533722, "step": 16450 }, { "ce_loss": 0.07658090442419052, "epoch": 5.486991327551701, "step": 16450 }, { "distill_loss": 0.10359425842761993, "epoch": 5.486991327551701, "step": 16450 }, { "epoch": 5.486991327551701, "ref_ce_loss": 0.07040654867887497, "step": 16450 }, { "epoch": 5.486991327551701, "loss": 0.2048768848180771, "step": 16450 }, { "ce_loss": 0.042437851428985596, "epoch": 5.486991327551701, "step": 16450 }, { "distill_loss": 0.07546865195035934, "epoch": 5.486991327551701, "step": 16450 }, { "epoch": 5.486991327551701, "ref_ce_loss": 0.0681789219379425, "step": 16450 }, { "epoch": 5.486991327551701, "loss": 0.3559339642524719, "step": 16450 }, { "ce_loss": 0.11100053042173386, "epoch": 5.486991327551701, "step": 16450 }, { "distill_loss": 0.10700476169586182, "epoch": 5.486991327551701, "step": 16450 }, { "epoch": 5.486991327551701, "ref_ce_loss": 0.09130432456731796, "step": 16450 }, { "epoch": 5.490326884589726, "loss": 0.3844, "step": 16460 }, { "epoch": 5.490326884589726, "grad_norm": 1.9888533353805542, "step": 16460 }, { "epoch": 5.490326884589726, "learning_rate": 7.099060549526406e-05, "step": 16460 }, { "epoch": 5.490326884589726, "loss": 0.1936185657978058, "step": 16460 }, { "ce_loss": 0.007558522745966911, "epoch": 5.490326884589726, "step": 16460 }, { "distill_loss": 0.07454708963632584, "epoch": 5.490326884589726, "step": 16460 }, { "epoch": 5.490326884589726, "ref_ce_loss": 0.043821558356285095, "step": 16460 }, { "epoch": 5.490326884589726, "loss": 0.4054626524448395, "step": 16460 }, { "ce_loss": 0.024897685274481773, "epoch": 5.490326884589726, "step": 16460 }, { "distill_loss": 0.11481006443500519, "epoch": 5.490326884589726, "step": 16460 }, { "epoch": 5.490326884589726, "ref_ce_loss": 0.12120971083641052, "step": 16460 }, { "epoch": 5.490326884589726, "loss": 0.17700545489788055, "step": 16460 }, { "ce_loss": 0.03312915191054344, "epoch": 5.490326884589726, "step": 16460 }, { "distill_loss": 0.09140119701623917, "epoch": 5.490326884589726, "step": 16460 }, { "epoch": 5.490326884589726, "ref_ce_loss": 0.05226533114910126, "step": 16460 }, { "epoch": 5.490326884589726, "loss": 0.21900974214076996, "step": 16460 }, { "ce_loss": 0.041102517396211624, "epoch": 5.490326884589726, "step": 16460 }, { "distill_loss": 0.10525349527597427, "epoch": 5.490326884589726, "step": 16460 }, { "epoch": 5.490326884589726, "ref_ce_loss": 0.05411553755402565, "step": 16460 }, { "epoch": 5.493662441627752, "loss": 0.3449, "step": 16470 }, { "epoch": 5.493662441627752, "grad_norm": 2.341479539871216, "step": 16470 }, { "epoch": 5.493662441627752, "learning_rate": 7.081849367569502e-05, "step": 16470 }, { "epoch": 5.493662441627752, "loss": 0.24781110882759094, "step": 16470 }, { "ce_loss": 0.02872762642800808, "epoch": 5.493662441627752, "step": 16470 }, { "distill_loss": 0.09619366377592087, "epoch": 5.493662441627752, "step": 16470 }, { "epoch": 5.493662441627752, "ref_ce_loss": 0.07014994323253632, "step": 16470 }, { "epoch": 5.493662441627752, "loss": 0.3405912220478058, "step": 16470 }, { "ce_loss": 0.04670342057943344, "epoch": 5.493662441627752, "step": 16470 }, { "distill_loss": 0.1301906406879425, "epoch": 5.493662441627752, "step": 16470 }, { "epoch": 5.493662441627752, "ref_ce_loss": 0.09883410483598709, "step": 16470 }, { "epoch": 5.493662441627752, "loss": 0.35678622126579285, "step": 16470 }, { "ce_loss": 0.09243296831846237, "epoch": 5.493662441627752, "step": 16470 }, { "distill_loss": 0.15158620476722717, "epoch": 5.493662441627752, "step": 16470 }, { "epoch": 5.493662441627752, "ref_ce_loss": 0.08452881872653961, "step": 16470 }, { "epoch": 5.493662441627752, "loss": 0.22663433849811554, "step": 16470 }, { "ce_loss": 0.03276556730270386, "epoch": 5.493662441627752, "step": 16470 }, { "distill_loss": 0.08965560793876648, "epoch": 5.493662441627752, "step": 16470 }, { "epoch": 5.493662441627752, "ref_ce_loss": 0.0697120726108551, "step": 16470 }, { "epoch": 5.496997998665777, "loss": 0.3554, "step": 16480 }, { "epoch": 5.496997998665777, "grad_norm": 3.4519755840301514, "step": 16480 }, { "epoch": 5.496997998665777, "learning_rate": 7.064652625195883e-05, "step": 16480 }, { "epoch": 5.496997998665777, "loss": 0.32312771677970886, "step": 16480 }, { "ce_loss": 0.08518663048744202, "epoch": 5.496997998665777, "step": 16480 }, { "distill_loss": 0.11265785247087479, "epoch": 5.496997998665777, "step": 16480 }, { "epoch": 5.496997998665777, "ref_ce_loss": 0.1049276664853096, "step": 16480 }, { "epoch": 5.496997998665777, "loss": 0.29952967166900635, "step": 16480 }, { "ce_loss": 0.07962783426046371, "epoch": 5.496997998665777, "step": 16480 }, { "distill_loss": 0.10899212956428528, "epoch": 5.496997998665777, "step": 16480 }, { "epoch": 5.496997998665777, "ref_ce_loss": 0.08677392452955246, "step": 16480 }, { "epoch": 5.496997998665777, "loss": 0.32281216979026794, "step": 16480 }, { "ce_loss": 0.06670834124088287, "epoch": 5.496997998665777, "step": 16480 }, { "distill_loss": 0.14671894907951355, "epoch": 5.496997998665777, "step": 16480 }, { "epoch": 5.496997998665777, "ref_ce_loss": 0.0661352351307869, "step": 16480 }, { "epoch": 5.496997998665777, "loss": 0.27929234504699707, "step": 16480 }, { "ce_loss": 0.04909088462591171, "epoch": 5.496997998665777, "step": 16480 }, { "distill_loss": 0.14832495152950287, "epoch": 5.496997998665777, "step": 16480 }, { "epoch": 5.496997998665777, "ref_ce_loss": 0.056601885706186295, "step": 16480 }, { "epoch": 5.5003335557038024, "loss": 0.3452, "step": 16490 }, { "epoch": 5.5003335557038024, "grad_norm": 2.604300022125244, "step": 16490 }, { "epoch": 5.5003335557038024, "learning_rate": 7.047470353765648e-05, "step": 16490 }, { "epoch": 5.5003335557038024, "loss": 0.3394431471824646, "step": 16490 }, { "ce_loss": 0.09183358401060104, "epoch": 5.5003335557038024, "step": 16490 }, { "distill_loss": 0.1244954913854599, "epoch": 5.5003335557038024, "step": 16490 }, { "epoch": 5.5003335557038024, "ref_ce_loss": 0.06953916698694229, "step": 16490 }, { "epoch": 5.5003335557038024, "loss": 0.22680865228176117, "step": 16490 }, { "ce_loss": 0.04873758554458618, "epoch": 5.5003335557038024, "step": 16490 }, { "distill_loss": 0.10830150544643402, "epoch": 5.5003335557038024, "step": 16490 }, { "epoch": 5.5003335557038024, "ref_ce_loss": 0.03887654468417168, "step": 16490 }, { "epoch": 5.5003335557038024, "loss": 0.32523858547210693, "step": 16490 }, { "ce_loss": 0.12041735649108887, "epoch": 5.5003335557038024, "step": 16490 }, { "distill_loss": 0.11242323368787766, "epoch": 5.5003335557038024, "step": 16490 }, { "epoch": 5.5003335557038024, "ref_ce_loss": 0.06937728822231293, "step": 16490 }, { "epoch": 5.5003335557038024, "loss": 0.4538167417049408, "step": 16490 }, { "ce_loss": 0.16250401735305786, "epoch": 5.5003335557038024, "step": 16490 }, { "distill_loss": 0.16419684886932373, "epoch": 5.5003335557038024, "step": 16490 }, { "epoch": 5.5003335557038024, "ref_ce_loss": 0.09326472133398056, "step": 16490 }, { "epoch": 5.503669112741828, "loss": 0.3291, "step": 16500 }, { "epoch": 5.503669112741828, "grad_norm": 2.0353572368621826, "step": 16500 }, { "epoch": 5.503669112741828, "learning_rate": 7.030302584612467e-05, "step": 16500 }, { "epoch": 5.503669112741828, "loss": 0.304289311170578, "step": 16500 }, { "ce_loss": 0.07965365797281265, "epoch": 5.503669112741828, "step": 16500 }, { "distill_loss": 0.10643766075372696, "epoch": 5.503669112741828, "step": 16500 }, { "epoch": 5.503669112741828, "ref_ce_loss": 0.07184035331010818, "step": 16500 }, { "epoch": 5.503669112741828, "loss": 0.33690714836120605, "step": 16500 }, { "ce_loss": 0.06654191762208939, "epoch": 5.503669112741828, "step": 16500 }, { "distill_loss": 0.12142252922058105, "epoch": 5.503669112741828, "step": 16500 }, { "epoch": 5.503669112741828, "ref_ce_loss": 0.06164618954062462, "step": 16500 }, { "epoch": 5.503669112741828, "loss": 0.29911255836486816, "step": 16500 }, { "ce_loss": 0.060956381261348724, "epoch": 5.503669112741828, "step": 16500 }, { "distill_loss": 0.13254477083683014, "epoch": 5.503669112741828, "step": 16500 }, { "epoch": 5.503669112741828, "ref_ce_loss": 0.05198168754577637, "step": 16500 }, { "epoch": 5.503669112741828, "loss": 0.3498637080192566, "step": 16500 }, { "ce_loss": 0.05736403167247772, "epoch": 5.503669112741828, "step": 16500 }, { "distill_loss": 0.15107011795043945, "epoch": 5.503669112741828, "step": 16500 }, { "epoch": 5.503669112741828, "ref_ce_loss": 0.06633694469928741, "step": 16500 }, { "epoch": 5.507004669779853, "loss": 0.3628, "step": 16510 }, { "epoch": 5.507004669779853, "grad_norm": 2.2993712425231934, "step": 16510 }, { "epoch": 5.507004669779853, "learning_rate": 7.013149349043581e-05, "step": 16510 }, { "epoch": 5.507004669779853, "loss": 0.3825063109397888, "step": 16510 }, { "ce_loss": 0.07082057744264603, "epoch": 5.507004669779853, "step": 16510 }, { "distill_loss": 0.16355140507221222, "epoch": 5.507004669779853, "step": 16510 }, { "epoch": 5.507004669779853, "ref_ce_loss": 0.08108976483345032, "step": 16510 }, { "epoch": 5.507004669779853, "loss": 0.3623524010181427, "step": 16510 }, { "ce_loss": 0.15745019912719727, "epoch": 5.507004669779853, "step": 16510 }, { "distill_loss": 0.11442229896783829, "epoch": 5.507004669779853, "step": 16510 }, { "epoch": 5.507004669779853, "ref_ce_loss": 0.06455233693122864, "step": 16510 }, { "epoch": 5.507004669779853, "loss": 0.26522186398506165, "step": 16510 }, { "ce_loss": 0.06558328866958618, "epoch": 5.507004669779853, "step": 16510 }, { "distill_loss": 0.0987500324845314, "epoch": 5.507004669779853, "step": 16510 }, { "epoch": 5.507004669779853, "ref_ce_loss": 0.0726059302687645, "step": 16510 }, { "epoch": 5.507004669779853, "loss": 0.5976641178131104, "step": 16510 }, { "ce_loss": 0.09752736985683441, "epoch": 5.507004669779853, "step": 16510 }, { "distill_loss": 0.13297367095947266, "epoch": 5.507004669779853, "step": 16510 }, { "epoch": 5.507004669779853, "ref_ce_loss": 0.07492193579673767, "step": 16510 }, { "epoch": 5.5103402268178785, "loss": 0.3752, "step": 16520 }, { "epoch": 5.5103402268178785, "grad_norm": 3.8198392391204834, "step": 16520 }, { "epoch": 5.5103402268178785, "learning_rate": 6.996010678339732e-05, "step": 16520 }, { "epoch": 5.5103402268178785, "loss": 0.43566957116127014, "step": 16520 }, { "ce_loss": 0.10428964346647263, "epoch": 5.5103402268178785, "step": 16520 }, { "distill_loss": 0.12884977459907532, "epoch": 5.5103402268178785, "step": 16520 }, { "epoch": 5.5103402268178785, "ref_ce_loss": 0.07748550921678543, "step": 16520 }, { "epoch": 5.5103402268178785, "loss": 0.6612772941589355, "step": 16520 }, { "ce_loss": 0.06919779628515244, "epoch": 5.5103402268178785, "step": 16520 }, { "distill_loss": 0.1307699978351593, "epoch": 5.5103402268178785, "step": 16520 }, { "epoch": 5.5103402268178785, "ref_ce_loss": 0.07698625326156616, "step": 16520 }, { "epoch": 5.5103402268178785, "loss": 0.42775776982307434, "step": 16520 }, { "ce_loss": 0.058075323700904846, "epoch": 5.5103402268178785, "step": 16520 }, { "distill_loss": 0.10789398103952408, "epoch": 5.5103402268178785, "step": 16520 }, { "epoch": 5.5103402268178785, "ref_ce_loss": 0.07196734100580215, "step": 16520 }, { "epoch": 5.5103402268178785, "loss": 0.19031690061092377, "step": 16520 }, { "ce_loss": 0.047655943781137466, "epoch": 5.5103402268178785, "step": 16520 }, { "distill_loss": 0.10163514316082001, "epoch": 5.5103402268178785, "step": 16520 }, { "epoch": 5.5103402268178785, "ref_ce_loss": 0.040984734892845154, "step": 16520 }, { "epoch": 5.513675783855904, "loss": 0.4149, "step": 16530 }, { "epoch": 5.513675783855904, "grad_norm": 3.6521718502044678, "step": 16530 }, { "epoch": 5.513675783855904, "learning_rate": 6.978886603755087e-05, "step": 16530 }, { "epoch": 5.513675783855904, "loss": 0.30453452467918396, "step": 16530 }, { "ce_loss": 0.04079536721110344, "epoch": 5.513675783855904, "step": 16530 }, { "distill_loss": 0.14041092991828918, "epoch": 5.513675783855904, "step": 16530 }, { "epoch": 5.513675783855904, "ref_ce_loss": 0.06573795527219772, "step": 16530 }, { "epoch": 5.513675783855904, "loss": 0.24023286998271942, "step": 16530 }, { "ce_loss": 0.07542020827531815, "epoch": 5.513675783855904, "step": 16530 }, { "distill_loss": 0.0940532237291336, "epoch": 5.513675783855904, "step": 16530 }, { "epoch": 5.513675783855904, "ref_ce_loss": 0.07066644728183746, "step": 16530 }, { "epoch": 5.513675783855904, "loss": 0.34776246547698975, "step": 16530 }, { "ce_loss": 0.04864807799458504, "epoch": 5.513675783855904, "step": 16530 }, { "distill_loss": 0.12712392210960388, "epoch": 5.513675783855904, "step": 16530 }, { "epoch": 5.513675783855904, "ref_ce_loss": 0.07447850704193115, "step": 16530 }, { "epoch": 5.513675783855904, "loss": 0.43463602662086487, "step": 16530 }, { "ce_loss": 0.10774467140436172, "epoch": 5.513675783855904, "step": 16530 }, { "distill_loss": 0.12198501080274582, "epoch": 5.513675783855904, "step": 16530 }, { "epoch": 5.513675783855904, "ref_ce_loss": 0.08402875065803528, "step": 16530 }, { "epoch": 5.517011340893929, "loss": 0.3387, "step": 16540 }, { "epoch": 5.517011340893929, "grad_norm": 2.0748274326324463, "step": 16540 }, { "epoch": 5.517011340893929, "learning_rate": 6.961777156517198e-05, "step": 16540 }, { "epoch": 5.517011340893929, "loss": 0.36993128061294556, "step": 16540 }, { "ce_loss": 0.09083930402994156, "epoch": 5.517011340893929, "step": 16540 }, { "distill_loss": 0.1188422292470932, "epoch": 5.517011340893929, "step": 16540 }, { "epoch": 5.517011340893929, "ref_ce_loss": 0.06648019701242447, "step": 16540 }, { "epoch": 5.517011340893929, "loss": 0.25275933742523193, "step": 16540 }, { "ce_loss": 0.06319792568683624, "epoch": 5.517011340893929, "step": 16540 }, { "distill_loss": 0.09456001222133636, "epoch": 5.517011340893929, "step": 16540 }, { "epoch": 5.517011340893929, "ref_ce_loss": 0.05898076295852661, "step": 16540 }, { "epoch": 5.517011340893929, "loss": 0.5341216325759888, "step": 16540 }, { "ce_loss": 0.11002179235219955, "epoch": 5.517011340893929, "step": 16540 }, { "distill_loss": 0.15567591786384583, "epoch": 5.517011340893929, "step": 16540 }, { "epoch": 5.517011340893929, "ref_ce_loss": 0.07723324745893478, "step": 16540 }, { "epoch": 5.517011340893929, "loss": 0.29186636209487915, "step": 16540 }, { "ce_loss": 0.029037468135356903, "epoch": 5.517011340893929, "step": 16540 }, { "distill_loss": 0.1176878809928894, "epoch": 5.517011340893929, "step": 16540 }, { "epoch": 5.517011340893929, "ref_ce_loss": 0.05208834260702133, "step": 16540 }, { "epoch": 5.5203468979319545, "loss": 0.3831, "step": 16550 }, { "epoch": 5.5203468979319545, "grad_norm": 3.3479764461517334, "step": 16550 }, { "epoch": 5.5203468979319545, "learning_rate": 6.944682367826966e-05, "step": 16550 }, { "epoch": 5.5203468979319545, "loss": 0.3768744468688965, "step": 16550 }, { "ce_loss": 0.0981643795967102, "epoch": 5.5203468979319545, "step": 16550 }, { "distill_loss": 0.15606792271137238, "epoch": 5.5203468979319545, "step": 16550 }, { "epoch": 5.5203468979319545, "ref_ce_loss": 0.07630893588066101, "step": 16550 }, { "epoch": 5.5203468979319545, "loss": 0.20364636182785034, "step": 16550 }, { "ce_loss": 0.029577821493148804, "epoch": 5.5203468979319545, "step": 16550 }, { "distill_loss": 0.09735309332609177, "epoch": 5.5203468979319545, "step": 16550 }, { "epoch": 5.5203468979319545, "ref_ce_loss": 0.054588109254837036, "step": 16550 }, { "epoch": 5.5203468979319545, "loss": 0.2448481321334839, "step": 16550 }, { "ce_loss": 0.07320421189069748, "epoch": 5.5203468979319545, "step": 16550 }, { "distill_loss": 0.10613042116165161, "epoch": 5.5203468979319545, "step": 16550 }, { "epoch": 5.5203468979319545, "ref_ce_loss": 0.06539609283208847, "step": 16550 }, { "epoch": 5.5203468979319545, "loss": 0.23442178964614868, "step": 16550 }, { "ce_loss": 0.048477523028850555, "epoch": 5.5203468979319545, "step": 16550 }, { "distill_loss": 0.1168837770819664, "epoch": 5.5203468979319545, "step": 16550 }, { "epoch": 5.5203468979319545, "ref_ce_loss": 0.05537949129939079, "step": 16550 }, { "epoch": 5.52368245496998, "loss": 0.3322, "step": 16560 }, { "epoch": 5.52368245496998, "grad_norm": 2.843312978744507, "step": 16560 }, { "epoch": 5.52368245496998, "learning_rate": 6.927602268858526e-05, "step": 16560 }, { "epoch": 5.52368245496998, "loss": 0.3520795404911041, "step": 16560 }, { "ce_loss": 0.09770800173282623, "epoch": 5.52368245496998, "step": 16560 }, { "distill_loss": 0.13243599236011505, "epoch": 5.52368245496998, "step": 16560 }, { "epoch": 5.52368245496998, "ref_ce_loss": 0.06643498688936234, "step": 16560 }, { "epoch": 5.52368245496998, "loss": 0.3254886567592621, "step": 16560 }, { "ce_loss": 0.06492993235588074, "epoch": 5.52368245496998, "step": 16560 }, { "distill_loss": 0.10419841855764389, "epoch": 5.52368245496998, "step": 16560 }, { "epoch": 5.52368245496998, "ref_ce_loss": 0.0797518864274025, "step": 16560 }, { "epoch": 5.52368245496998, "loss": 0.6603392362594604, "step": 16560 }, { "ce_loss": 0.05518954619765282, "epoch": 5.52368245496998, "step": 16560 }, { "distill_loss": 0.13722261786460876, "epoch": 5.52368245496998, "step": 16560 }, { "epoch": 5.52368245496998, "ref_ce_loss": 0.08445858210325241, "step": 16560 }, { "epoch": 5.52368245496998, "loss": 0.3482056260108948, "step": 16560 }, { "ce_loss": 0.03709990531206131, "epoch": 5.52368245496998, "step": 16560 }, { "distill_loss": 0.12940657138824463, "epoch": 5.52368245496998, "step": 16560 }, { "epoch": 5.52368245496998, "ref_ce_loss": 0.06875383853912354, "step": 16560 }, { "epoch": 5.527018012008005, "loss": 0.3693, "step": 16570 }, { "epoch": 5.527018012008005, "grad_norm": 3.7445974349975586, "step": 16570 }, { "epoch": 5.527018012008005, "learning_rate": 6.910536890759254e-05, "step": 16570 }, { "epoch": 5.527018012008005, "loss": 0.30111929774284363, "step": 16570 }, { "ce_loss": 0.11203796416521072, "epoch": 5.527018012008005, "step": 16570 }, { "distill_loss": 0.1373513787984848, "epoch": 5.527018012008005, "step": 16570 }, { "epoch": 5.527018012008005, "ref_ce_loss": 0.05158596858382225, "step": 16570 }, { "epoch": 5.527018012008005, "loss": 0.2687418758869171, "step": 16570 }, { "ce_loss": 0.05072599649429321, "epoch": 5.527018012008005, "step": 16570 }, { "distill_loss": 0.08930881321430206, "epoch": 5.527018012008005, "step": 16570 }, { "epoch": 5.527018012008005, "ref_ce_loss": 0.06740231812000275, "step": 16570 }, { "epoch": 5.527018012008005, "loss": 0.3890778720378876, "step": 16570 }, { "ce_loss": 0.12530957162380219, "epoch": 5.527018012008005, "step": 16570 }, { "distill_loss": 0.11893895268440247, "epoch": 5.527018012008005, "step": 16570 }, { "epoch": 5.527018012008005, "ref_ce_loss": 0.09552176296710968, "step": 16570 }, { "epoch": 5.527018012008005, "loss": 0.48636484146118164, "step": 16570 }, { "ce_loss": 0.04522004351019859, "epoch": 5.527018012008005, "step": 16570 }, { "distill_loss": 0.10927541553974152, "epoch": 5.527018012008005, "step": 16570 }, { "epoch": 5.527018012008005, "ref_ce_loss": 0.07342866063117981, "step": 16570 }, { "epoch": 5.530353569046031, "loss": 0.3443, "step": 16580 }, { "epoch": 5.530353569046031, "grad_norm": 2.7234625816345215, "step": 16580 }, { "epoch": 5.530353569046031, "learning_rate": 6.893486264649653e-05, "step": 16580 }, { "epoch": 5.530353569046031, "loss": 0.3358912765979767, "step": 16580 }, { "ce_loss": 0.046590499579906464, "epoch": 5.530353569046031, "step": 16580 }, { "distill_loss": 0.09500577300786972, "epoch": 5.530353569046031, "step": 16580 }, { "epoch": 5.530353569046031, "ref_ce_loss": 0.06674519181251526, "step": 16580 }, { "epoch": 5.530353569046031, "loss": 0.25684136152267456, "step": 16580 }, { "ce_loss": 0.04049672558903694, "epoch": 5.530353569046031, "step": 16580 }, { "distill_loss": 0.10132599622011185, "epoch": 5.530353569046031, "step": 16580 }, { "epoch": 5.530353569046031, "ref_ce_loss": 0.09812045097351074, "step": 16580 }, { "epoch": 5.530353569046031, "loss": 0.47450417280197144, "step": 16580 }, { "ce_loss": 0.0634898990392685, "epoch": 5.530353569046031, "step": 16580 }, { "distill_loss": 0.10582137107849121, "epoch": 5.530353569046031, "step": 16580 }, { "epoch": 5.530353569046031, "ref_ce_loss": 0.13480916619300842, "step": 16580 }, { "epoch": 5.530353569046031, "loss": 0.4640018939971924, "step": 16580 }, { "ce_loss": 0.11096587777137756, "epoch": 5.530353569046031, "step": 16580 }, { "distill_loss": 0.1723109483718872, "epoch": 5.530353569046031, "step": 16580 }, { "epoch": 5.530353569046031, "ref_ce_loss": 0.07591967284679413, "step": 16580 }, { "epoch": 5.533689126084056, "loss": 0.4116, "step": 16590 }, { "epoch": 5.533689126084056, "grad_norm": 3.5316085815429688, "step": 16590 }, { "epoch": 5.533689126084056, "learning_rate": 6.876450421623346e-05, "step": 16590 }, { "epoch": 5.533689126084056, "loss": 0.1337299346923828, "step": 16590 }, { "ce_loss": 0.007529797963798046, "epoch": 5.533689126084056, "step": 16590 }, { "distill_loss": 0.08628208190202713, "epoch": 5.533689126084056, "step": 16590 }, { "epoch": 5.533689126084056, "ref_ce_loss": 0.03970135375857353, "step": 16590 }, { "epoch": 5.533689126084056, "loss": 0.27693334221839905, "step": 16590 }, { "ce_loss": 0.04799002781510353, "epoch": 5.533689126084056, "step": 16590 }, { "distill_loss": 0.1492634117603302, "epoch": 5.533689126084056, "step": 16590 }, { "epoch": 5.533689126084056, "ref_ce_loss": 0.0795946940779686, "step": 16590 }, { "epoch": 5.533689126084056, "loss": 0.2540445327758789, "step": 16590 }, { "ce_loss": 0.03331456705927849, "epoch": 5.533689126084056, "step": 16590 }, { "distill_loss": 0.10557867586612701, "epoch": 5.533689126084056, "step": 16590 }, { "epoch": 5.533689126084056, "ref_ce_loss": 0.05052439495921135, "step": 16590 }, { "epoch": 5.533689126084056, "loss": 0.28934377431869507, "step": 16590 }, { "ce_loss": 0.07273193448781967, "epoch": 5.533689126084056, "step": 16590 }, { "distill_loss": 0.13207106292247772, "epoch": 5.533689126084056, "step": 16590 }, { "epoch": 5.533689126084056, "ref_ce_loss": 0.08445105701684952, "step": 16590 }, { "epoch": 5.537024683122081, "loss": 0.3145, "step": 16600 }, { "epoch": 5.537024683122081, "grad_norm": 2.272958993911743, "step": 16600 }, { "epoch": 5.537024683122081, "learning_rate": 6.859429392746993e-05, "step": 16600 }, { "epoch": 5.537024683122081, "loss": 0.30334970355033875, "step": 16600 }, { "ce_loss": 0.07540854066610336, "epoch": 5.537024683122081, "step": 16600 }, { "distill_loss": 0.11994440108537674, "epoch": 5.537024683122081, "step": 16600 }, { "epoch": 5.537024683122081, "ref_ce_loss": 0.07918747514486313, "step": 16600 }, { "epoch": 5.537024683122081, "loss": 0.38659754395484924, "step": 16600 }, { "ce_loss": 0.13602618873119354, "epoch": 5.537024683122081, "step": 16600 }, { "distill_loss": 0.17836399376392365, "epoch": 5.537024683122081, "step": 16600 }, { "epoch": 5.537024683122081, "ref_ce_loss": 0.07202507555484772, "step": 16600 }, { "epoch": 5.537024683122081, "loss": 0.6665504574775696, "step": 16600 }, { "ce_loss": 0.0435519814491272, "epoch": 5.537024683122081, "step": 16600 }, { "distill_loss": 0.1457093060016632, "epoch": 5.537024683122081, "step": 16600 }, { "epoch": 5.537024683122081, "ref_ce_loss": 0.07195711135864258, "step": 16600 }, { "epoch": 5.537024683122081, "loss": 0.22605876624584198, "step": 16600 }, { "ce_loss": 0.04164009541273117, "epoch": 5.537024683122081, "step": 16600 }, { "distill_loss": 0.08552563190460205, "epoch": 5.537024683122081, "step": 16600 }, { "epoch": 5.537024683122081, "ref_ce_loss": 0.07331196218729019, "step": 16600 }, { "epoch": 5.540360240160107, "loss": 0.3445, "step": 16610 }, { "epoch": 5.540360240160107, "grad_norm": 2.327739715576172, "step": 16610 }, { "epoch": 5.540360240160107, "learning_rate": 6.842423209060233e-05, "step": 16610 }, { "epoch": 5.540360240160107, "loss": 0.4476955533027649, "step": 16610 }, { "ce_loss": 0.0371936596930027, "epoch": 5.540360240160107, "step": 16610 }, { "distill_loss": 0.10962027311325073, "epoch": 5.540360240160107, "step": 16610 }, { "epoch": 5.540360240160107, "ref_ce_loss": 0.053445056080818176, "step": 16610 }, { "epoch": 5.540360240160107, "loss": 0.3170121908187866, "step": 16610 }, { "ce_loss": 0.03845048323273659, "epoch": 5.540360240160107, "step": 16610 }, { "distill_loss": 0.13532476127147675, "epoch": 5.540360240160107, "step": 16610 }, { "epoch": 5.540360240160107, "ref_ce_loss": 0.07792802900075912, "step": 16610 }, { "epoch": 5.540360240160107, "loss": 0.26157912611961365, "step": 16610 }, { "ce_loss": 0.05625046417117119, "epoch": 5.540360240160107, "step": 16610 }, { "distill_loss": 0.13596224784851074, "epoch": 5.540360240160107, "step": 16610 }, { "epoch": 5.540360240160107, "ref_ce_loss": 0.0490339994430542, "step": 16610 }, { "epoch": 5.540360240160107, "loss": 0.24408766627311707, "step": 16610 }, { "ce_loss": 0.03240533545613289, "epoch": 5.540360240160107, "step": 16610 }, { "distill_loss": 0.09369600564241409, "epoch": 5.540360240160107, "step": 16610 }, { "epoch": 5.540360240160107, "ref_ce_loss": 0.0623743012547493, "step": 16610 }, { "epoch": 5.543695797198132, "loss": 0.3579, "step": 16620 }, { "epoch": 5.543695797198132, "grad_norm": 1.9174339771270752, "step": 16620 }, { "epoch": 5.543695797198132, "learning_rate": 6.825431901575645e-05, "step": 16620 }, { "epoch": 5.543695797198132, "loss": 0.6768718957901001, "step": 16620 }, { "ce_loss": 0.0892643928527832, "epoch": 5.543695797198132, "step": 16620 }, { "distill_loss": 0.12472718209028244, "epoch": 5.543695797198132, "step": 16620 }, { "epoch": 5.543695797198132, "ref_ce_loss": 0.08057060092687607, "step": 16620 }, { "epoch": 5.543695797198132, "loss": 0.7044631838798523, "step": 16620 }, { "ce_loss": 0.06510592997074127, "epoch": 5.543695797198132, "step": 16620 }, { "distill_loss": 0.1234467476606369, "epoch": 5.543695797198132, "step": 16620 }, { "epoch": 5.543695797198132, "ref_ce_loss": 0.07243905961513519, "step": 16620 }, { "epoch": 5.543695797198132, "loss": 0.3674314022064209, "step": 16620 }, { "ce_loss": 0.09846795350313187, "epoch": 5.543695797198132, "step": 16620 }, { "distill_loss": 0.1294548213481903, "epoch": 5.543695797198132, "step": 16620 }, { "epoch": 5.543695797198132, "ref_ce_loss": 0.10479249805212021, "step": 16620 }, { "epoch": 5.543695797198132, "loss": 0.2651240825653076, "step": 16620 }, { "ce_loss": 0.06598225235939026, "epoch": 5.543695797198132, "step": 16620 }, { "distill_loss": 0.1323862224817276, "epoch": 5.543695797198132, "step": 16620 }, { "epoch": 5.543695797198132, "ref_ce_loss": 0.06659011542797089, "step": 16620 }, { "epoch": 5.547031354236157, "loss": 0.3521, "step": 16630 }, { "epoch": 5.547031354236157, "grad_norm": 3.350158214569092, "step": 16630 }, { "epoch": 5.547031354236157, "learning_rate": 6.808455501278659e-05, "step": 16630 }, { "epoch": 5.547031354236157, "loss": 0.31689751148223877, "step": 16630 }, { "ce_loss": 0.07073936611413956, "epoch": 5.547031354236157, "step": 16630 }, { "distill_loss": 0.13998320698738098, "epoch": 5.547031354236157, "step": 16630 }, { "epoch": 5.547031354236157, "ref_ce_loss": 0.08537984639406204, "step": 16630 }, { "epoch": 5.547031354236157, "loss": 0.26067429780960083, "step": 16630 }, { "ce_loss": 0.0695922002196312, "epoch": 5.547031354236157, "step": 16630 }, { "distill_loss": 0.10241564363241196, "epoch": 5.547031354236157, "step": 16630 }, { "epoch": 5.547031354236157, "ref_ce_loss": 0.06054640933871269, "step": 16630 }, { "epoch": 5.547031354236157, "loss": 0.3345610499382019, "step": 16630 }, { "ce_loss": 0.06958122551441193, "epoch": 5.547031354236157, "step": 16630 }, { "distill_loss": 0.100101038813591, "epoch": 5.547031354236157, "step": 16630 }, { "epoch": 5.547031354236157, "ref_ce_loss": 0.08606253564357758, "step": 16630 }, { "epoch": 5.547031354236157, "loss": 0.39710739254951477, "step": 16630 }, { "ce_loss": 0.08828610181808472, "epoch": 5.547031354236157, "step": 16630 }, { "distill_loss": 0.1490803062915802, "epoch": 5.547031354236157, "step": 16630 }, { "epoch": 5.547031354236157, "ref_ce_loss": 0.0988338366150856, "step": 16630 }, { "epoch": 5.550366911274183, "loss": 0.3204, "step": 16640 }, { "epoch": 5.550366911274183, "grad_norm": 2.270860195159912, "step": 16640 }, { "epoch": 5.550366911274183, "learning_rate": 6.791494039127539e-05, "step": 16640 }, { "epoch": 5.550366911274183, "loss": 0.563323438167572, "step": 16640 }, { "ce_loss": 0.10518639534711838, "epoch": 5.550366911274183, "step": 16640 }, { "distill_loss": 0.10006602853536606, "epoch": 5.550366911274183, "step": 16640 }, { "epoch": 5.550366911274183, "ref_ce_loss": 0.06689028441905975, "step": 16640 }, { "epoch": 5.550366911274183, "loss": 0.40847182273864746, "step": 16640 }, { "ce_loss": 0.06803660839796066, "epoch": 5.550366911274183, "step": 16640 }, { "distill_loss": 0.15362216532230377, "epoch": 5.550366911274183, "step": 16640 }, { "epoch": 5.550366911274183, "ref_ce_loss": 0.08404377102851868, "step": 16640 }, { "epoch": 5.550366911274183, "loss": 0.4165043830871582, "step": 16640 }, { "ce_loss": 0.10225572437047958, "epoch": 5.550366911274183, "step": 16640 }, { "distill_loss": 0.15723662078380585, "epoch": 5.550366911274183, "step": 16640 }, { "epoch": 5.550366911274183, "ref_ce_loss": 0.08742760866880417, "step": 16640 }, { "epoch": 5.550366911274183, "loss": 0.22630423307418823, "step": 16640 }, { "ce_loss": 0.0508527010679245, "epoch": 5.550366911274183, "step": 16640 }, { "distill_loss": 0.09829200059175491, "epoch": 5.550366911274183, "step": 16640 }, { "epoch": 5.550366911274183, "ref_ce_loss": 0.05572168529033661, "step": 16640 }, { "epoch": 5.553702468312208, "loss": 0.3596, "step": 16650 }, { "epoch": 5.553702468312208, "grad_norm": 4.383035659790039, "step": 16650 }, { "epoch": 5.553702468312208, "learning_rate": 6.774547546053297e-05, "step": 16650 }, { "epoch": 5.553702468312208, "loss": 0.24793292582035065, "step": 16650 }, { "ce_loss": 0.04041785001754761, "epoch": 5.553702468312208, "step": 16650 }, { "distill_loss": 0.1391821801662445, "epoch": 5.553702468312208, "step": 16650 }, { "epoch": 5.553702468312208, "ref_ce_loss": 0.06805215030908585, "step": 16650 }, { "epoch": 5.553702468312208, "loss": 0.30265313386917114, "step": 16650 }, { "ce_loss": 0.0523945614695549, "epoch": 5.553702468312208, "step": 16650 }, { "distill_loss": 0.11704914271831512, "epoch": 5.553702468312208, "step": 16650 }, { "epoch": 5.553702468312208, "ref_ce_loss": 0.06724908947944641, "step": 16650 }, { "epoch": 5.553702468312208, "loss": 0.2269441783428192, "step": 16650 }, { "ce_loss": 0.029113974422216415, "epoch": 5.553702468312208, "step": 16650 }, { "distill_loss": 0.11111169308423996, "epoch": 5.553702468312208, "step": 16650 }, { "epoch": 5.553702468312208, "ref_ce_loss": 0.08629629015922546, "step": 16650 }, { "epoch": 5.553702468312208, "loss": 0.2849983870983124, "step": 16650 }, { "ce_loss": 0.05142313987016678, "epoch": 5.553702468312208, "step": 16650 }, { "distill_loss": 0.1463957130908966, "epoch": 5.553702468312208, "step": 16650 }, { "epoch": 5.553702468312208, "ref_ce_loss": 0.0440007783472538, "step": 16650 }, { "epoch": 5.557038025350233, "loss": 0.3277, "step": 16660 }, { "epoch": 5.557038025350233, "grad_norm": 2.9877946376800537, "step": 16660 }, { "epoch": 5.557038025350233, "learning_rate": 6.757616052959658e-05, "step": 16660 }, { "epoch": 5.557038025350233, "loss": 0.21352145075798035, "step": 16660 }, { "ce_loss": 0.03965906798839569, "epoch": 5.557038025350233, "step": 16660 }, { "distill_loss": 0.09861264377832413, "epoch": 5.557038025350233, "step": 16660 }, { "epoch": 5.557038025350233, "ref_ce_loss": 0.04607783630490303, "step": 16660 }, { "epoch": 5.557038025350233, "loss": 0.38374829292297363, "step": 16660 }, { "ce_loss": 0.10164055228233337, "epoch": 5.557038025350233, "step": 16660 }, { "distill_loss": 0.11329170316457748, "epoch": 5.557038025350233, "step": 16660 }, { "epoch": 5.557038025350233, "ref_ce_loss": 0.0692281424999237, "step": 16660 }, { "epoch": 5.557038025350233, "loss": 0.8557072877883911, "step": 16660 }, { "ce_loss": 0.046634022146463394, "epoch": 5.557038025350233, "step": 16660 }, { "distill_loss": 0.1687537282705307, "epoch": 5.557038025350233, "step": 16660 }, { "epoch": 5.557038025350233, "ref_ce_loss": 0.06543728709220886, "step": 16660 }, { "epoch": 5.557038025350233, "loss": 0.19127684831619263, "step": 16660 }, { "ce_loss": 0.0667652115225792, "epoch": 5.557038025350233, "step": 16660 }, { "distill_loss": 0.095591239631176, "epoch": 5.557038025350233, "step": 16660 }, { "epoch": 5.557038025350233, "ref_ce_loss": 0.028839105740189552, "step": 16660 }, { "epoch": 5.560373582388259, "loss": 0.3787, "step": 16670 }, { "epoch": 5.560373582388259, "grad_norm": 3.7834699153900146, "step": 16670 }, { "epoch": 5.560373582388259, "learning_rate": 6.740699590722982e-05, "step": 16670 }, { "epoch": 5.560373582388259, "loss": 0.4098147749900818, "step": 16670 }, { "ce_loss": 0.06272386014461517, "epoch": 5.560373582388259, "step": 16670 }, { "distill_loss": 0.1474025994539261, "epoch": 5.560373582388259, "step": 16670 }, { "epoch": 5.560373582388259, "ref_ce_loss": 0.10220678150653839, "step": 16670 }, { "epoch": 5.560373582388259, "loss": 0.31686991453170776, "step": 16670 }, { "ce_loss": 0.1106472909450531, "epoch": 5.560373582388259, "step": 16670 }, { "distill_loss": 0.1048339232802391, "epoch": 5.560373582388259, "step": 16670 }, { "epoch": 5.560373582388259, "ref_ce_loss": 0.08176649361848831, "step": 16670 }, { "epoch": 5.560373582388259, "loss": 0.4467318058013916, "step": 16670 }, { "ce_loss": 0.09130599349737167, "epoch": 5.560373582388259, "step": 16670 }, { "distill_loss": 0.22026273608207703, "epoch": 5.560373582388259, "step": 16670 }, { "epoch": 5.560373582388259, "ref_ce_loss": 0.0889739841222763, "step": 16670 }, { "epoch": 5.560373582388259, "loss": 0.25014933943748474, "step": 16670 }, { "ce_loss": 0.06914155930280685, "epoch": 5.560373582388259, "step": 16670 }, { "distill_loss": 0.12035161256790161, "epoch": 5.560373582388259, "step": 16670 }, { "epoch": 5.560373582388259, "ref_ce_loss": 0.06039172783493996, "step": 16670 }, { "epoch": 5.563709139426284, "loss": 0.3791, "step": 16680 }, { "epoch": 5.563709139426284, "grad_norm": 3.1715545654296875, "step": 16680 }, { "epoch": 5.563709139426284, "learning_rate": 6.72379819019223e-05, "step": 16680 }, { "epoch": 5.563709139426284, "loss": 1.6280040740966797, "step": 16680 }, { "ce_loss": 0.1090499684214592, "epoch": 5.563709139426284, "step": 16680 }, { "distill_loss": 0.14834822714328766, "epoch": 5.563709139426284, "step": 16680 }, { "epoch": 5.563709139426284, "ref_ce_loss": 0.10466254502534866, "step": 16680 }, { "epoch": 5.563709139426284, "loss": 0.24755717813968658, "step": 16680 }, { "ce_loss": 0.07180020958185196, "epoch": 5.563709139426284, "step": 16680 }, { "distill_loss": 0.09066848456859589, "epoch": 5.563709139426284, "step": 16680 }, { "epoch": 5.563709139426284, "ref_ce_loss": 0.06948643922805786, "step": 16680 }, { "epoch": 5.563709139426284, "loss": 0.16598784923553467, "step": 16680 }, { "ce_loss": 0.006515104323625565, "epoch": 5.563709139426284, "step": 16680 }, { "distill_loss": 0.07459402829408646, "epoch": 5.563709139426284, "step": 16680 }, { "epoch": 5.563709139426284, "ref_ce_loss": 0.0633934810757637, "step": 16680 }, { "epoch": 5.563709139426284, "loss": 0.34224605560302734, "step": 16680 }, { "ce_loss": 0.058937061578035355, "epoch": 5.563709139426284, "step": 16680 }, { "distill_loss": 0.12465833127498627, "epoch": 5.563709139426284, "step": 16680 }, { "epoch": 5.563709139426284, "ref_ce_loss": 0.06454360485076904, "step": 16680 }, { "epoch": 5.567044696464309, "loss": 0.403, "step": 16690 }, { "epoch": 5.567044696464309, "grad_norm": 4.215590953826904, "step": 16690 }, { "epoch": 5.567044696464309, "learning_rate": 6.706911882188879e-05, "step": 16690 }, { "epoch": 5.567044696464309, "loss": 0.33820870518684387, "step": 16690 }, { "ce_loss": 0.08671735227108002, "epoch": 5.567044696464309, "step": 16690 }, { "distill_loss": 0.12246132642030716, "epoch": 5.567044696464309, "step": 16690 }, { "epoch": 5.567044696464309, "ref_ce_loss": 0.0622439831495285, "step": 16690 }, { "epoch": 5.567044696464309, "loss": 0.40078848600387573, "step": 16690 }, { "ce_loss": 0.08150546252727509, "epoch": 5.567044696464309, "step": 16690 }, { "distill_loss": 0.21842795610427856, "epoch": 5.567044696464309, "step": 16690 }, { "epoch": 5.567044696464309, "ref_ce_loss": 0.08130494505167007, "step": 16690 }, { "epoch": 5.567044696464309, "loss": 0.4324227273464203, "step": 16690 }, { "ce_loss": 0.11700266599655151, "epoch": 5.567044696464309, "step": 16690 }, { "distill_loss": 0.13477078080177307, "epoch": 5.567044696464309, "step": 16690 }, { "epoch": 5.567044696464309, "ref_ce_loss": 0.10167690366506577, "step": 16690 }, { "epoch": 5.567044696464309, "loss": 0.2886402904987335, "step": 16690 }, { "ce_loss": 0.054880108684301376, "epoch": 5.567044696464309, "step": 16690 }, { "distill_loss": 0.10805704444646835, "epoch": 5.567044696464309, "step": 16690 }, { "epoch": 5.567044696464309, "ref_ce_loss": 0.08718326687812805, "step": 16690 }, { "epoch": 5.570380253502335, "loss": 0.3885, "step": 16700 }, { "epoch": 5.570380253502335, "grad_norm": 3.0666604042053223, "step": 16700 }, { "epoch": 5.570380253502335, "learning_rate": 6.690040697506896e-05, "step": 16700 }, { "epoch": 5.570380253502335, "loss": 0.32792338728904724, "step": 16700 }, { "ce_loss": 0.07738234847784042, "epoch": 5.570380253502335, "step": 16700 }, { "distill_loss": 0.11659729480743408, "epoch": 5.570380253502335, "step": 16700 }, { "epoch": 5.570380253502335, "ref_ce_loss": 0.06254538148641586, "step": 16700 }, { "epoch": 5.570380253502335, "loss": 0.3328701853752136, "step": 16700 }, { "ce_loss": 0.11399929225444794, "epoch": 5.570380253502335, "step": 16700 }, { "distill_loss": 0.10265770554542542, "epoch": 5.570380253502335, "step": 16700 }, { "epoch": 5.570380253502335, "ref_ce_loss": 0.09002488851547241, "step": 16700 }, { "epoch": 5.570380253502335, "loss": 0.3892509341239929, "step": 16700 }, { "ce_loss": 0.06272214651107788, "epoch": 5.570380253502335, "step": 16700 }, { "distill_loss": 0.12995900213718414, "epoch": 5.570380253502335, "step": 16700 }, { "epoch": 5.570380253502335, "ref_ce_loss": 0.07331440597772598, "step": 16700 }, { "epoch": 5.570380253502335, "loss": 0.6979650259017944, "step": 16700 }, { "ce_loss": 0.06049128994345665, "epoch": 5.570380253502335, "step": 16700 }, { "distill_loss": 0.37069612741470337, "epoch": 5.570380253502335, "step": 16700 }, { "epoch": 5.570380253502335, "ref_ce_loss": 0.11396878212690353, "step": 16700 }, { "epoch": 5.57371581054036, "loss": 0.3821, "step": 16710 }, { "epoch": 5.57371581054036, "grad_norm": 2.9326987266540527, "step": 16710 }, { "epoch": 5.57371581054036, "learning_rate": 6.673184666912672e-05, "step": 16710 }, { "epoch": 5.57371581054036, "loss": 0.4939331114292145, "step": 16710 }, { "ce_loss": 0.11444830149412155, "epoch": 5.57371581054036, "step": 16710 }, { "distill_loss": 0.21305085718631744, "epoch": 5.57371581054036, "step": 16710 }, { "epoch": 5.57371581054036, "ref_ce_loss": 0.12908244132995605, "step": 16710 }, { "epoch": 5.57371581054036, "loss": 0.26652368903160095, "step": 16710 }, { "ce_loss": 0.07186118513345718, "epoch": 5.57371581054036, "step": 16710 }, { "distill_loss": 0.11902876943349838, "epoch": 5.57371581054036, "step": 16710 }, { "epoch": 5.57371581054036, "ref_ce_loss": 0.04031604155898094, "step": 16710 }, { "epoch": 5.57371581054036, "loss": 0.43540751934051514, "step": 16710 }, { "ce_loss": 0.09456142783164978, "epoch": 5.57371581054036, "step": 16710 }, { "distill_loss": 0.1499258279800415, "epoch": 5.57371581054036, "step": 16710 }, { "epoch": 5.57371581054036, "ref_ce_loss": 0.09525250643491745, "step": 16710 }, { "epoch": 5.57371581054036, "loss": 0.19972451031208038, "step": 16710 }, { "ce_loss": 0.02131311222910881, "epoch": 5.57371581054036, "step": 16710 }, { "distill_loss": 0.09276192635297775, "epoch": 5.57371581054036, "step": 16710 }, { "epoch": 5.57371581054036, "ref_ce_loss": 0.05993827059864998, "step": 16710 }, { "epoch": 5.5770513675783855, "loss": 0.3731, "step": 16720 }, { "epoch": 5.5770513675783855, "grad_norm": 2.389400005340576, "step": 16720 }, { "epoch": 5.5770513675783855, "learning_rate": 6.656343821144956e-05, "step": 16720 }, { "epoch": 5.5770513675783855, "loss": 0.38022905588150024, "step": 16720 }, { "ce_loss": 0.10960319638252258, "epoch": 5.5770513675783855, "step": 16720 }, { "distill_loss": 0.22096657752990723, "epoch": 5.5770513675783855, "step": 16720 }, { "epoch": 5.5770513675783855, "ref_ce_loss": 0.033225253224372864, "step": 16720 }, { "epoch": 5.5770513675783855, "loss": 0.35766640305519104, "step": 16720 }, { "ce_loss": 0.09015137702226639, "epoch": 5.5770513675783855, "step": 16720 }, { "distill_loss": 0.1479002833366394, "epoch": 5.5770513675783855, "step": 16720 }, { "epoch": 5.5770513675783855, "ref_ce_loss": 0.09554192423820496, "step": 16720 }, { "epoch": 5.5770513675783855, "loss": 0.3077741265296936, "step": 16720 }, { "ce_loss": 0.06773196160793304, "epoch": 5.5770513675783855, "step": 16720 }, { "distill_loss": 0.11214911192655563, "epoch": 5.5770513675783855, "step": 16720 }, { "epoch": 5.5770513675783855, "ref_ce_loss": 0.07687600702047348, "step": 16720 }, { "epoch": 5.5770513675783855, "loss": 0.5804701447486877, "step": 16720 }, { "ce_loss": 0.07880949229001999, "epoch": 5.5770513675783855, "step": 16720 }, { "distill_loss": 0.2357291579246521, "epoch": 5.5770513675783855, "step": 16720 }, { "epoch": 5.5770513675783855, "ref_ce_loss": 0.08584926277399063, "step": 16720 }, { "epoch": 5.580386924616411, "loss": 0.3451, "step": 16730 }, { "epoch": 5.580386924616411, "grad_norm": 1.8142964839935303, "step": 16730 }, { "epoch": 5.580386924616411, "learning_rate": 6.639518190914808e-05, "step": 16730 }, { "epoch": 5.580386924616411, "loss": 0.3560985326766968, "step": 16730 }, { "ce_loss": 0.06115071102976799, "epoch": 5.580386924616411, "step": 16730 }, { "distill_loss": 0.14061114192008972, "epoch": 5.580386924616411, "step": 16730 }, { "epoch": 5.580386924616411, "ref_ce_loss": 0.1131257712841034, "step": 16730 }, { "epoch": 5.580386924616411, "loss": 0.2548341155052185, "step": 16730 }, { "ce_loss": 0.08115708082914352, "epoch": 5.580386924616411, "step": 16730 }, { "distill_loss": 0.09377031028270721, "epoch": 5.580386924616411, "step": 16730 }, { "epoch": 5.580386924616411, "ref_ce_loss": 0.07978391647338867, "step": 16730 }, { "epoch": 5.580386924616411, "loss": 0.2637752294540405, "step": 16730 }, { "ce_loss": 0.04890935868024826, "epoch": 5.580386924616411, "step": 16730 }, { "distill_loss": 0.12748581171035767, "epoch": 5.580386924616411, "step": 16730 }, { "epoch": 5.580386924616411, "ref_ce_loss": 0.06210982799530029, "step": 16730 }, { "epoch": 5.580386924616411, "loss": 0.308973103761673, "step": 16730 }, { "ce_loss": 0.06964000314474106, "epoch": 5.580386924616411, "step": 16730 }, { "distill_loss": 0.09779088199138641, "epoch": 5.580386924616411, "step": 16730 }, { "epoch": 5.580386924616411, "ref_ce_loss": 0.0733722448348999, "step": 16730 }, { "epoch": 5.583722481654436, "loss": 0.3738, "step": 16740 }, { "epoch": 5.583722481654436, "grad_norm": 5.687265396118164, "step": 16740 }, { "epoch": 5.583722481654436, "learning_rate": 6.622707806905536e-05, "step": 16740 }, { "epoch": 5.583722481654436, "loss": 0.502987802028656, "step": 16740 }, { "ce_loss": 0.1237458735704422, "epoch": 5.583722481654436, "step": 16740 }, { "distill_loss": 0.16146957874298096, "epoch": 5.583722481654436, "step": 16740 }, { "epoch": 5.583722481654436, "ref_ce_loss": 0.08447730541229248, "step": 16740 }, { "epoch": 5.583722481654436, "loss": 0.22744502127170563, "step": 16740 }, { "ce_loss": 0.05142305791378021, "epoch": 5.583722481654436, "step": 16740 }, { "distill_loss": 0.10567010194063187, "epoch": 5.583722481654436, "step": 16740 }, { "epoch": 5.583722481654436, "ref_ce_loss": 0.051125843077898026, "step": 16740 }, { "epoch": 5.583722481654436, "loss": 0.2630317211151123, "step": 16740 }, { "ce_loss": 0.017098814249038696, "epoch": 5.583722481654436, "step": 16740 }, { "distill_loss": 0.16734573245048523, "epoch": 5.583722481654436, "step": 16740 }, { "epoch": 5.583722481654436, "ref_ce_loss": 0.05436302348971367, "step": 16740 }, { "epoch": 5.583722481654436, "loss": 0.22591489553451538, "step": 16740 }, { "ce_loss": 0.02939753420650959, "epoch": 5.583722481654436, "step": 16740 }, { "distill_loss": 0.08773542940616608, "epoch": 5.583722481654436, "step": 16740 }, { "epoch": 5.583722481654436, "ref_ce_loss": 0.047614991664886475, "step": 16740 }, { "epoch": 5.5870580386924615, "loss": 0.3427, "step": 16750 }, { "epoch": 5.5870580386924615, "grad_norm": 3.073615074157715, "step": 16750 }, { "epoch": 5.5870580386924615, "learning_rate": 6.605912699772657e-05, "step": 16750 }, { "epoch": 5.5870580386924615, "loss": 0.49910616874694824, "step": 16750 }, { "ce_loss": 0.11704373359680176, "epoch": 5.5870580386924615, "step": 16750 }, { "distill_loss": 0.16967318952083588, "epoch": 5.5870580386924615, "step": 16750 }, { "epoch": 5.5870580386924615, "ref_ce_loss": 0.08763284236192703, "step": 16750 }, { "epoch": 5.5870580386924615, "loss": 0.39135798811912537, "step": 16750 }, { "ce_loss": 0.12653915584087372, "epoch": 5.5870580386924615, "step": 16750 }, { "distill_loss": 0.1373661309480667, "epoch": 5.5870580386924615, "step": 16750 }, { "epoch": 5.5870580386924615, "ref_ce_loss": 0.11389608681201935, "step": 16750 }, { "epoch": 5.5870580386924615, "loss": 0.22781170904636383, "step": 16750 }, { "ce_loss": 0.044658780097961426, "epoch": 5.5870580386924615, "step": 16750 }, { "distill_loss": 0.09752068668603897, "epoch": 5.5870580386924615, "step": 16750 }, { "epoch": 5.5870580386924615, "ref_ce_loss": 0.07326694577932358, "step": 16750 }, { "epoch": 5.5870580386924615, "loss": 0.3132156729698181, "step": 16750 }, { "ce_loss": 0.0768897607922554, "epoch": 5.5870580386924615, "step": 16750 }, { "distill_loss": 0.15624229609966278, "epoch": 5.5870580386924615, "step": 16750 }, { "epoch": 5.5870580386924615, "ref_ce_loss": 0.06274968385696411, "step": 16750 }, { "epoch": 5.590393595730487, "loss": 0.4097, "step": 16760 }, { "epoch": 5.590393595730487, "grad_norm": 2.408324956893921, "step": 16760 }, { "epoch": 5.590393595730487, "learning_rate": 6.589132900143807e-05, "step": 16760 }, { "epoch": 5.590393595730487, "loss": 0.26510536670684814, "step": 16760 }, { "ce_loss": 0.0586848221719265, "epoch": 5.590393595730487, "step": 16760 }, { "distill_loss": 0.09962846338748932, "epoch": 5.590393595730487, "step": 16760 }, { "epoch": 5.590393595730487, "ref_ce_loss": 0.07298450917005539, "step": 16760 }, { "epoch": 5.590393595730487, "loss": 0.4885880947113037, "step": 16760 }, { "ce_loss": 0.1015976145863533, "epoch": 5.590393595730487, "step": 16760 }, { "distill_loss": 0.21036067605018616, "epoch": 5.590393595730487, "step": 16760 }, { "epoch": 5.590393595730487, "ref_ce_loss": 0.058434370905160904, "step": 16760 }, { "epoch": 5.590393595730487, "loss": 0.2753802537918091, "step": 16760 }, { "ce_loss": 0.05114470794796944, "epoch": 5.590393595730487, "step": 16760 }, { "distill_loss": 0.12594880163669586, "epoch": 5.590393595730487, "step": 16760 }, { "epoch": 5.590393595730487, "ref_ce_loss": 0.061987679451704025, "step": 16760 }, { "epoch": 5.590393595730487, "loss": 0.17079053819179535, "step": 16760 }, { "ce_loss": 0.004621799103915691, "epoch": 5.590393595730487, "step": 16760 }, { "distill_loss": 0.09937524795532227, "epoch": 5.590393595730487, "step": 16760 }, { "epoch": 5.590393595730487, "ref_ce_loss": 0.046160973608493805, "step": 16760 }, { "epoch": 5.593729152768512, "loss": 0.3805, "step": 16770 }, { "epoch": 5.593729152768512, "grad_norm": 2.079479694366455, "step": 16770 }, { "epoch": 5.593729152768512, "learning_rate": 6.572368438618734e-05, "step": 16770 }, { "epoch": 5.593729152768512, "loss": 0.5323101878166199, "step": 16770 }, { "ce_loss": 0.12786847352981567, "epoch": 5.593729152768512, "step": 16770 }, { "distill_loss": 0.16538603603839874, "epoch": 5.593729152768512, "step": 16770 }, { "epoch": 5.593729152768512, "ref_ce_loss": 0.09879563003778458, "step": 16770 }, { "epoch": 5.593729152768512, "loss": 0.5461047291755676, "step": 16770 }, { "ce_loss": 0.12018280476331711, "epoch": 5.593729152768512, "step": 16770 }, { "distill_loss": 0.13303323090076447, "epoch": 5.593729152768512, "step": 16770 }, { "epoch": 5.593729152768512, "ref_ce_loss": 0.08113959431648254, "step": 16770 }, { "epoch": 5.593729152768512, "loss": 0.26084309816360474, "step": 16770 }, { "ce_loss": 0.04693206027150154, "epoch": 5.593729152768512, "step": 16770 }, { "distill_loss": 0.12489647418260574, "epoch": 5.593729152768512, "step": 16770 }, { "epoch": 5.593729152768512, "ref_ce_loss": 0.0498616062104702, "step": 16770 }, { "epoch": 5.593729152768512, "loss": 0.6751585602760315, "step": 16770 }, { "ce_loss": 0.09317155182361603, "epoch": 5.593729152768512, "step": 16770 }, { "distill_loss": 0.13803917169570923, "epoch": 5.593729152768512, "step": 16770 }, { "epoch": 5.593729152768512, "ref_ce_loss": 0.10896999388933182, "step": 16770 }, { "epoch": 5.597064709806538, "loss": 0.3811, "step": 16780 }, { "epoch": 5.597064709806538, "grad_norm": 3.617305278778076, "step": 16780 }, { "epoch": 5.597064709806538, "learning_rate": 6.555619345769205e-05, "step": 16780 }, { "epoch": 5.597064709806538, "loss": 0.27140775322914124, "step": 16780 }, { "ce_loss": 0.09383071959018707, "epoch": 5.597064709806538, "step": 16780 }, { "distill_loss": 0.08340008556842804, "epoch": 5.597064709806538, "step": 16780 }, { "epoch": 5.597064709806538, "ref_ce_loss": 0.07747329771518707, "step": 16780 }, { "epoch": 5.597064709806538, "loss": 0.31209468841552734, "step": 16780 }, { "ce_loss": 0.04330824688076973, "epoch": 5.597064709806538, "step": 16780 }, { "distill_loss": 0.13244518637657166, "epoch": 5.597064709806538, "step": 16780 }, { "epoch": 5.597064709806538, "ref_ce_loss": 0.0734424814581871, "step": 16780 }, { "epoch": 5.597064709806538, "loss": 0.451698362827301, "step": 16780 }, { "ce_loss": 0.024857228621840477, "epoch": 5.597064709806538, "step": 16780 }, { "distill_loss": 0.16980913281440735, "epoch": 5.597064709806538, "step": 16780 }, { "epoch": 5.597064709806538, "ref_ce_loss": 0.08439881354570389, "step": 16780 }, { "epoch": 5.597064709806538, "loss": 0.3579801321029663, "step": 16780 }, { "ce_loss": 0.09500003606081009, "epoch": 5.597064709806538, "step": 16780 }, { "distill_loss": 0.1582847535610199, "epoch": 5.597064709806538, "step": 16780 }, { "epoch": 5.597064709806538, "ref_ce_loss": 0.08331548422574997, "step": 16780 }, { "epoch": 5.600400266844563, "loss": 0.374, "step": 16790 }, { "epoch": 5.600400266844563, "grad_norm": 2.767927646636963, "step": 16790 }, { "epoch": 5.600400266844563, "learning_rate": 6.53888565213895e-05, "step": 16790 }, { "epoch": 5.600400266844563, "loss": 0.5215246677398682, "step": 16790 }, { "ce_loss": 0.09143102169036865, "epoch": 5.600400266844563, "step": 16790 }, { "distill_loss": 0.16542060673236847, "epoch": 5.600400266844563, "step": 16790 }, { "epoch": 5.600400266844563, "ref_ce_loss": 0.07400554418563843, "step": 16790 }, { "epoch": 5.600400266844563, "loss": 0.38477057218551636, "step": 16790 }, { "ce_loss": 0.07713790237903595, "epoch": 5.600400266844563, "step": 16790 }, { "distill_loss": 0.1572268009185791, "epoch": 5.600400266844563, "step": 16790 }, { "epoch": 5.600400266844563, "ref_ce_loss": 0.07512886077165604, "step": 16790 }, { "epoch": 5.600400266844563, "loss": 0.8082589507102966, "step": 16790 }, { "ce_loss": 0.10924425721168518, "epoch": 5.600400266844563, "step": 16790 }, { "distill_loss": 0.21931283175945282, "epoch": 5.600400266844563, "step": 16790 }, { "epoch": 5.600400266844563, "ref_ce_loss": 0.08828362822532654, "step": 16790 }, { "epoch": 5.600400266844563, "loss": 0.48302650451660156, "step": 16790 }, { "ce_loss": 0.08925081789493561, "epoch": 5.600400266844563, "step": 16790 }, { "distill_loss": 0.14999093115329742, "epoch": 5.600400266844563, "step": 16790 }, { "epoch": 5.600400266844563, "ref_ce_loss": 0.12079279124736786, "step": 16790 }, { "epoch": 5.603735823882588, "loss": 0.3433, "step": 16800 }, { "epoch": 5.603735823882588, "grad_norm": 2.1402974128723145, "step": 16800 }, { "epoch": 5.603735823882588, "learning_rate": 6.522167388243632e-05, "step": 16800 }, { "epoch": 5.603735823882588, "loss": 0.22286638617515564, "step": 16800 }, { "ce_loss": 0.005005288403481245, "epoch": 5.603735823882588, "step": 16800 }, { "distill_loss": 0.12163923680782318, "epoch": 5.603735823882588, "step": 16800 }, { "epoch": 5.603735823882588, "ref_ce_loss": 0.06334011256694794, "step": 16800 }, { "epoch": 5.603735823882588, "loss": 0.4707980751991272, "step": 16800 }, { "ce_loss": 0.08313055336475372, "epoch": 5.603735823882588, "step": 16800 }, { "distill_loss": 0.12003469467163086, "epoch": 5.603735823882588, "step": 16800 }, { "epoch": 5.603735823882588, "ref_ce_loss": 0.07313154637813568, "step": 16800 }, { "epoch": 5.603735823882588, "loss": 0.2341451197862625, "step": 16800 }, { "ce_loss": 0.0286604855209589, "epoch": 5.603735823882588, "step": 16800 }, { "distill_loss": 0.14671389758586884, "epoch": 5.603735823882588, "step": 16800 }, { "epoch": 5.603735823882588, "ref_ce_loss": 0.0586368553340435, "step": 16800 }, { "epoch": 5.603735823882588, "loss": 0.2169308066368103, "step": 16800 }, { "ce_loss": 0.055694498121738434, "epoch": 5.603735823882588, "step": 16800 }, { "distill_loss": 0.09109349548816681, "epoch": 5.603735823882588, "step": 16800 }, { "epoch": 5.603735823882588, "ref_ce_loss": 0.05517904832959175, "step": 16800 }, { "epoch": 5.607071380920614, "loss": 0.3733, "step": 16810 }, { "epoch": 5.607071380920614, "grad_norm": 4.999337673187256, "step": 16810 }, { "epoch": 5.607071380920614, "learning_rate": 6.50546458457076e-05, "step": 16810 }, { "epoch": 5.607071380920614, "loss": 0.3535381257534027, "step": 16810 }, { "ce_loss": 0.08812938630580902, "epoch": 5.607071380920614, "step": 16810 }, { "distill_loss": 0.11784398555755615, "epoch": 5.607071380920614, "step": 16810 }, { "epoch": 5.607071380920614, "ref_ce_loss": 0.08096872270107269, "step": 16810 }, { "epoch": 5.607071380920614, "loss": 0.45187708735466003, "step": 16810 }, { "ce_loss": 0.029312031343579292, "epoch": 5.607071380920614, "step": 16810 }, { "distill_loss": 0.10933209210634232, "epoch": 5.607071380920614, "step": 16810 }, { "epoch": 5.607071380920614, "ref_ce_loss": 0.08249694108963013, "step": 16810 }, { "epoch": 5.607071380920614, "loss": 0.32924607396125793, "step": 16810 }, { "ce_loss": 0.06451299041509628, "epoch": 5.607071380920614, "step": 16810 }, { "distill_loss": 0.1427079141139984, "epoch": 5.607071380920614, "step": 16810 }, { "epoch": 5.607071380920614, "ref_ce_loss": 0.08778007328510284, "step": 16810 }, { "epoch": 5.607071380920614, "loss": 0.42343294620513916, "step": 16810 }, { "ce_loss": 0.14571364223957062, "epoch": 5.607071380920614, "step": 16810 }, { "distill_loss": 0.2002367526292801, "epoch": 5.607071380920614, "step": 16810 }, { "epoch": 5.607071380920614, "ref_ce_loss": 0.07713088393211365, "step": 16810 }, { "epoch": 5.610406937958639, "loss": 0.3416, "step": 16820 }, { "epoch": 5.610406937958639, "grad_norm": 1.9727840423583984, "step": 16820 }, { "epoch": 5.610406937958639, "learning_rate": 6.48877727157966e-05, "step": 16820 }, { "epoch": 5.610406937958639, "loss": 0.19755208492279053, "step": 16820 }, { "ce_loss": 0.021219512447714806, "epoch": 5.610406937958639, "step": 16820 }, { "distill_loss": 0.11275633424520493, "epoch": 5.610406937958639, "step": 16820 }, { "epoch": 5.610406937958639, "ref_ce_loss": 0.06317698955535889, "step": 16820 }, { "epoch": 5.610406937958639, "loss": 0.5142295360565186, "step": 16820 }, { "ce_loss": 0.06793922185897827, "epoch": 5.610406937958639, "step": 16820 }, { "distill_loss": 0.2093472182750702, "epoch": 5.610406937958639, "step": 16820 }, { "epoch": 5.610406937958639, "ref_ce_loss": 0.0721978098154068, "step": 16820 }, { "epoch": 5.610406937958639, "loss": 0.45053979754447937, "step": 16820 }, { "ce_loss": 0.07777457684278488, "epoch": 5.610406937958639, "step": 16820 }, { "distill_loss": 0.1466236710548401, "epoch": 5.610406937958639, "step": 16820 }, { "epoch": 5.610406937958639, "ref_ce_loss": 0.08509445935487747, "step": 16820 }, { "epoch": 5.610406937958639, "loss": 0.2358899563550949, "step": 16820 }, { "ce_loss": 0.04887387901544571, "epoch": 5.610406937958639, "step": 16820 }, { "distill_loss": 0.12546847760677338, "epoch": 5.610406937958639, "step": 16820 }, { "epoch": 5.610406937958639, "ref_ce_loss": 0.06151670962572098, "step": 16820 }, { "epoch": 5.613742494996664, "loss": 0.3501, "step": 16830 }, { "epoch": 5.613742494996664, "grad_norm": 2.5225648880004883, "step": 16830 }, { "epoch": 5.613742494996664, "learning_rate": 6.472105479701425e-05, "step": 16830 }, { "epoch": 5.613742494996664, "loss": 0.31352949142456055, "step": 16830 }, { "ce_loss": 0.10307907313108444, "epoch": 5.613742494996664, "step": 16830 }, { "distill_loss": 0.13297227025032043, "epoch": 5.613742494996664, "step": 16830 }, { "epoch": 5.613742494996664, "ref_ce_loss": 0.07737308740615845, "step": 16830 }, { "epoch": 5.613742494996664, "loss": 0.2373771369457245, "step": 16830 }, { "ce_loss": 0.03195342794060707, "epoch": 5.613742494996664, "step": 16830 }, { "distill_loss": 0.12204240262508392, "epoch": 5.613742494996664, "step": 16830 }, { "epoch": 5.613742494996664, "ref_ce_loss": 0.0517796166241169, "step": 16830 }, { "epoch": 5.613742494996664, "loss": 0.38204991817474365, "step": 16830 }, { "ce_loss": 0.05168701708316803, "epoch": 5.613742494996664, "step": 16830 }, { "distill_loss": 0.18729457259178162, "epoch": 5.613742494996664, "step": 16830 }, { "epoch": 5.613742494996664, "ref_ce_loss": 0.0903494581580162, "step": 16830 }, { "epoch": 5.613742494996664, "loss": 0.5538869500160217, "step": 16830 }, { "ce_loss": 0.10379138588905334, "epoch": 5.613742494996664, "step": 16830 }, { "distill_loss": 0.24181246757507324, "epoch": 5.613742494996664, "step": 16830 }, { "epoch": 5.613742494996664, "ref_ce_loss": 0.05665380507707596, "step": 16830 }, { "epoch": 5.61707805203469, "loss": 0.3983, "step": 16840 }, { "epoch": 5.61707805203469, "grad_norm": 6.060856819152832, "step": 16840 }, { "epoch": 5.61707805203469, "learning_rate": 6.455449239338809e-05, "step": 16840 }, { "epoch": 5.61707805203469, "loss": 0.6989403963088989, "step": 16840 }, { "ce_loss": 0.10969628393650055, "epoch": 5.61707805203469, "step": 16840 }, { "distill_loss": 0.29851293563842773, "epoch": 5.61707805203469, "step": 16840 }, { "epoch": 5.61707805203469, "ref_ce_loss": 0.06750264763832092, "step": 16840 }, { "epoch": 5.61707805203469, "loss": 0.31590399146080017, "step": 16840 }, { "ce_loss": 0.12308245897293091, "epoch": 5.61707805203469, "step": 16840 }, { "distill_loss": 0.11415868997573853, "epoch": 5.61707805203469, "step": 16840 }, { "epoch": 5.61707805203469, "ref_ce_loss": 0.05798859894275665, "step": 16840 }, { "epoch": 5.61707805203469, "loss": 0.27054455876350403, "step": 16840 }, { "ce_loss": 0.050939805805683136, "epoch": 5.61707805203469, "step": 16840 }, { "distill_loss": 0.16063706576824188, "epoch": 5.61707805203469, "step": 16840 }, { "epoch": 5.61707805203469, "ref_ce_loss": 0.05888758972287178, "step": 16840 }, { "epoch": 5.61707805203469, "loss": 0.2523441016674042, "step": 16840 }, { "ce_loss": 0.038503505289554596, "epoch": 5.61707805203469, "step": 16840 }, { "distill_loss": 0.1113259419798851, "epoch": 5.61707805203469, "step": 16840 }, { "epoch": 5.61707805203469, "ref_ce_loss": 0.08058229088783264, "step": 16840 }, { "epoch": 5.620413609072715, "loss": 0.3615, "step": 16850 }, { "epoch": 5.620413609072715, "grad_norm": 3.9339704513549805, "step": 16850 }, { "epoch": 5.620413609072715, "learning_rate": 6.43880858086623e-05, "step": 16850 }, { "epoch": 5.620413609072715, "loss": 0.31341302394866943, "step": 16850 }, { "ce_loss": 0.06522848457098007, "epoch": 5.620413609072715, "step": 16850 }, { "distill_loss": 0.10614801943302155, "epoch": 5.620413609072715, "step": 16850 }, { "epoch": 5.620413609072715, "ref_ce_loss": 0.07481067627668381, "step": 16850 }, { "epoch": 5.620413609072715, "loss": 0.37363332509994507, "step": 16850 }, { "ce_loss": 0.0904751867055893, "epoch": 5.620413609072715, "step": 16850 }, { "distill_loss": 0.20257927477359772, "epoch": 5.620413609072715, "step": 16850 }, { "epoch": 5.620413609072715, "ref_ce_loss": 0.0803053081035614, "step": 16850 }, { "epoch": 5.620413609072715, "loss": 0.9973900318145752, "step": 16850 }, { "ce_loss": 0.1294299215078354, "epoch": 5.620413609072715, "step": 16850 }, { "distill_loss": 0.2549104690551758, "epoch": 5.620413609072715, "step": 16850 }, { "epoch": 5.620413609072715, "ref_ce_loss": 0.1427856683731079, "step": 16850 }, { "epoch": 5.620413609072715, "loss": 0.27538955211639404, "step": 16850 }, { "ce_loss": 0.03127444162964821, "epoch": 5.620413609072715, "step": 16850 }, { "distill_loss": 0.11600720882415771, "epoch": 5.620413609072715, "step": 16850 }, { "epoch": 5.620413609072715, "ref_ce_loss": 0.053942952305078506, "step": 16850 }, { "epoch": 5.62374916611074, "loss": 0.376, "step": 16860 }, { "epoch": 5.62374916611074, "grad_norm": 1.79304039478302, "step": 16860 }, { "epoch": 5.62374916611074, "learning_rate": 6.42218353462969e-05, "step": 16860 }, { "epoch": 5.62374916611074, "loss": 0.5045366287231445, "step": 16860 }, { "ce_loss": 0.07572513073682785, "epoch": 5.62374916611074, "step": 16860 }, { "distill_loss": 0.14832082390785217, "epoch": 5.62374916611074, "step": 16860 }, { "epoch": 5.62374916611074, "ref_ce_loss": 0.0814862921833992, "step": 16860 }, { "epoch": 5.62374916611074, "loss": 0.3642059564590454, "step": 16860 }, { "ce_loss": 0.05447271838784218, "epoch": 5.62374916611074, "step": 16860 }, { "distill_loss": 0.15132412314414978, "epoch": 5.62374916611074, "step": 16860 }, { "epoch": 5.62374916611074, "ref_ce_loss": 0.07785908877849579, "step": 16860 }, { "epoch": 5.62374916611074, "loss": 0.1737252026796341, "step": 16860 }, { "ce_loss": 0.006255322601646185, "epoch": 5.62374916611074, "step": 16860 }, { "distill_loss": 0.07501223683357239, "epoch": 5.62374916611074, "step": 16860 }, { "epoch": 5.62374916611074, "ref_ce_loss": 0.05216903239488602, "step": 16860 }, { "epoch": 5.62374916611074, "loss": 0.4993632137775421, "step": 16860 }, { "ce_loss": 0.09058297425508499, "epoch": 5.62374916611074, "step": 16860 }, { "distill_loss": 0.10913488268852234, "epoch": 5.62374916611074, "step": 16860 }, { "epoch": 5.62374916611074, "ref_ce_loss": 0.07340264320373535, "step": 16860 }, { "epoch": 5.627084723148766, "loss": 0.3919, "step": 16870 }, { "epoch": 5.627084723148766, "grad_norm": 1.8264517784118652, "step": 16870 }, { "epoch": 5.627084723148766, "learning_rate": 6.405574130946707e-05, "step": 16870 }, { "epoch": 5.627084723148766, "loss": 0.30188092589378357, "step": 16870 }, { "ce_loss": 0.08277290314435959, "epoch": 5.627084723148766, "step": 16870 }, { "distill_loss": 0.17446166276931763, "epoch": 5.627084723148766, "step": 16870 }, { "epoch": 5.627084723148766, "ref_ce_loss": 0.04445138946175575, "step": 16870 }, { "epoch": 5.627084723148766, "loss": 0.32595294713974, "step": 16870 }, { "ce_loss": 0.07430987805128098, "epoch": 5.627084723148766, "step": 16870 }, { "distill_loss": 0.15796136856079102, "epoch": 5.627084723148766, "step": 16870 }, { "epoch": 5.627084723148766, "ref_ce_loss": 0.07466117292642593, "step": 16870 }, { "epoch": 5.627084723148766, "loss": 0.3349049687385559, "step": 16870 }, { "ce_loss": 0.0747389942407608, "epoch": 5.627084723148766, "step": 16870 }, { "distill_loss": 0.1390739381313324, "epoch": 5.627084723148766, "step": 16870 }, { "epoch": 5.627084723148766, "ref_ce_loss": 0.12086381763219833, "step": 16870 }, { "epoch": 5.627084723148766, "loss": 0.3198974132537842, "step": 16870 }, { "ce_loss": 0.0835222452878952, "epoch": 5.627084723148766, "step": 16870 }, { "distill_loss": 0.14694853127002716, "epoch": 5.627084723148766, "step": 16870 }, { "epoch": 5.627084723148766, "ref_ce_loss": 0.06452935934066772, "step": 16870 }, { "epoch": 5.630420280186791, "loss": 0.3441, "step": 16880 }, { "epoch": 5.630420280186791, "grad_norm": 5.504924774169922, "step": 16880 }, { "epoch": 5.630420280186791, "learning_rate": 6.388980400106283e-05, "step": 16880 }, { "epoch": 5.630420280186791, "loss": 0.4352351129055023, "step": 16880 }, { "ce_loss": 0.07789971679449081, "epoch": 5.630420280186791, "step": 16880 }, { "distill_loss": 0.13848719000816345, "epoch": 5.630420280186791, "step": 16880 }, { "epoch": 5.630420280186791, "ref_ce_loss": 0.12263132631778717, "step": 16880 }, { "epoch": 5.630420280186791, "loss": 0.43858495354652405, "step": 16880 }, { "ce_loss": 0.11291355639696121, "epoch": 5.630420280186791, "step": 16880 }, { "distill_loss": 0.14818082749843597, "epoch": 5.630420280186791, "step": 16880 }, { "epoch": 5.630420280186791, "ref_ce_loss": 0.06953940540552139, "step": 16880 }, { "epoch": 5.630420280186791, "loss": 0.2217913717031479, "step": 16880 }, { "ce_loss": 0.040353529155254364, "epoch": 5.630420280186791, "step": 16880 }, { "distill_loss": 0.12617231905460358, "epoch": 5.630420280186791, "step": 16880 }, { "epoch": 5.630420280186791, "ref_ce_loss": 0.05498276278376579, "step": 16880 }, { "epoch": 5.630420280186791, "loss": 0.28420495986938477, "step": 16880 }, { "ce_loss": 0.06727323681116104, "epoch": 5.630420280186791, "step": 16880 }, { "distill_loss": 0.11587470769882202, "epoch": 5.630420280186791, "step": 16880 }, { "epoch": 5.630420280186791, "ref_ce_loss": 0.05712687596678734, "step": 16880 }, { "epoch": 5.633755837224816, "loss": 0.3626, "step": 16890 }, { "epoch": 5.633755837224816, "grad_norm": 3.8169634342193604, "step": 16890 }, { "epoch": 5.633755837224816, "learning_rate": 6.37240237236884e-05, "step": 16890 }, { "epoch": 5.633755837224816, "loss": 0.34174394607543945, "step": 16890 }, { "ce_loss": 0.06275150179862976, "epoch": 5.633755837224816, "step": 16890 }, { "distill_loss": 0.15911883115768433, "epoch": 5.633755837224816, "step": 16890 }, { "epoch": 5.633755837224816, "ref_ce_loss": 0.08189556747674942, "step": 16890 }, { "epoch": 5.633755837224816, "loss": 0.2643011510372162, "step": 16890 }, { "ce_loss": 0.06280254572629929, "epoch": 5.633755837224816, "step": 16890 }, { "distill_loss": 0.11299487203359604, "epoch": 5.633755837224816, "step": 16890 }, { "epoch": 5.633755837224816, "ref_ce_loss": 0.05963292345404625, "step": 16890 }, { "epoch": 5.633755837224816, "loss": 0.3587222397327423, "step": 16890 }, { "ce_loss": 0.11320937424898148, "epoch": 5.633755837224816, "step": 16890 }, { "distill_loss": 0.16261257231235504, "epoch": 5.633755837224816, "step": 16890 }, { "epoch": 5.633755837224816, "ref_ce_loss": 0.08281068503856659, "step": 16890 }, { "epoch": 5.633755837224816, "loss": 0.23671752214431763, "step": 16890 }, { "ce_loss": 0.03498915210366249, "epoch": 5.633755837224816, "step": 16890 }, { "distill_loss": 0.10112226009368896, "epoch": 5.633755837224816, "step": 16890 }, { "epoch": 5.633755837224816, "ref_ce_loss": 0.06465116888284683, "step": 16890 }, { "epoch": 5.637091394262842, "loss": 0.3735, "step": 16900 }, { "epoch": 5.637091394262842, "grad_norm": 2.1169912815093994, "step": 16900 }, { "epoch": 5.637091394262842, "learning_rate": 6.355840077966158e-05, "step": 16900 }, { "epoch": 5.637091394262842, "loss": 0.23858742415905, "step": 16900 }, { "ce_loss": 0.041046276688575745, "epoch": 5.637091394262842, "step": 16900 }, { "distill_loss": 0.07950280606746674, "epoch": 5.637091394262842, "step": 16900 }, { "epoch": 5.637091394262842, "ref_ce_loss": 0.072368323802948, "step": 16900 }, { "epoch": 5.637091394262842, "loss": 0.28376126289367676, "step": 16900 }, { "ce_loss": 0.02801305614411831, "epoch": 5.637091394262842, "step": 16900 }, { "distill_loss": 0.12259427458047867, "epoch": 5.637091394262842, "step": 16900 }, { "epoch": 5.637091394262842, "ref_ce_loss": 0.07551668584346771, "step": 16900 }, { "epoch": 5.637091394262842, "loss": 0.18908122181892395, "step": 16900 }, { "ce_loss": 0.032690152525901794, "epoch": 5.637091394262842, "step": 16900 }, { "distill_loss": 0.0932382196187973, "epoch": 5.637091394262842, "step": 16900 }, { "epoch": 5.637091394262842, "ref_ce_loss": 0.06297314912080765, "step": 16900 }, { "epoch": 5.637091394262842, "loss": 0.3903302252292633, "step": 16900 }, { "ce_loss": 0.09799889475107193, "epoch": 5.637091394262842, "step": 16900 }, { "distill_loss": 0.1463698148727417, "epoch": 5.637091394262842, "step": 16900 }, { "epoch": 5.637091394262842, "ref_ce_loss": 0.07008067518472672, "step": 16900 }, { "epoch": 5.640426951300867, "loss": 0.389, "step": 16910 }, { "epoch": 5.640426951300867, "grad_norm": 3.601564884185791, "step": 16910 }, { "epoch": 5.640426951300867, "learning_rate": 6.339293547101339e-05, "step": 16910 }, { "epoch": 5.640426951300867, "loss": 0.6289790272712708, "step": 16910 }, { "ce_loss": 0.05350485444068909, "epoch": 5.640426951300867, "step": 16910 }, { "distill_loss": 0.3808191120624542, "epoch": 5.640426951300867, "step": 16910 }, { "epoch": 5.640426951300867, "ref_ce_loss": 0.09556426852941513, "step": 16910 }, { "epoch": 5.640426951300867, "loss": 0.3073519468307495, "step": 16910 }, { "ce_loss": 0.012231869623064995, "epoch": 5.640426951300867, "step": 16910 }, { "distill_loss": 0.10754086822271347, "epoch": 5.640426951300867, "step": 16910 }, { "epoch": 5.640426951300867, "ref_ce_loss": 0.07653357833623886, "step": 16910 }, { "epoch": 5.640426951300867, "loss": 0.4514399766921997, "step": 16910 }, { "ce_loss": 0.10036049783229828, "epoch": 5.640426951300867, "step": 16910 }, { "distill_loss": 0.14256389439105988, "epoch": 5.640426951300867, "step": 16910 }, { "epoch": 5.640426951300867, "ref_ce_loss": 0.09835103154182434, "step": 16910 }, { "epoch": 5.640426951300867, "loss": 0.3855137228965759, "step": 16910 }, { "ce_loss": 0.12630429863929749, "epoch": 5.640426951300867, "step": 16910 }, { "distill_loss": 0.15595805644989014, "epoch": 5.640426951300867, "step": 16910 }, { "epoch": 5.640426951300867, "ref_ce_loss": 0.0788608267903328, "step": 16910 }, { "epoch": 5.6437625083388925, "loss": 0.3837, "step": 16920 }, { "epoch": 5.6437625083388925, "grad_norm": 1.9282772541046143, "step": 16920 }, { "epoch": 5.6437625083388925, "learning_rate": 6.322762809948714e-05, "step": 16920 }, { "epoch": 5.6437625083388925, "loss": 0.3533139228820801, "step": 16920 }, { "ce_loss": 0.12137754261493683, "epoch": 5.6437625083388925, "step": 16920 }, { "distill_loss": 0.14143449068069458, "epoch": 5.6437625083388925, "step": 16920 }, { "epoch": 5.6437625083388925, "ref_ce_loss": 0.07143258303403854, "step": 16920 }, { "epoch": 5.6437625083388925, "loss": 0.31891804933547974, "step": 16920 }, { "ce_loss": 0.07180596888065338, "epoch": 5.6437625083388925, "step": 16920 }, { "distill_loss": 0.18838739395141602, "epoch": 5.6437625083388925, "step": 16920 }, { "epoch": 5.6437625083388925, "ref_ce_loss": 0.05863086134195328, "step": 16920 }, { "epoch": 5.6437625083388925, "loss": 0.4280124306678772, "step": 16920 }, { "ce_loss": 0.06295457482337952, "epoch": 5.6437625083388925, "step": 16920 }, { "distill_loss": 0.1645834743976593, "epoch": 5.6437625083388925, "step": 16920 }, { "epoch": 5.6437625083388925, "ref_ce_loss": 0.06780682504177094, "step": 16920 }, { "epoch": 5.6437625083388925, "loss": 0.31792157888412476, "step": 16920 }, { "ce_loss": 0.08119270205497742, "epoch": 5.6437625083388925, "step": 16920 }, { "distill_loss": 0.10686053335666656, "epoch": 5.6437625083388925, "step": 16920 }, { "epoch": 5.6437625083388925, "ref_ce_loss": 0.08020225912332535, "step": 16920 }, { "epoch": 5.647098065376918, "loss": 0.4228, "step": 16930 }, { "epoch": 5.647098065376918, "grad_norm": 2.3339974880218506, "step": 16930 }, { "epoch": 5.647098065376918, "learning_rate": 6.306247896653833e-05, "step": 16930 }, { "epoch": 5.647098065376918, "loss": 0.3029731512069702, "step": 16930 }, { "ce_loss": 0.024777529761195183, "epoch": 5.647098065376918, "step": 16930 }, { "distill_loss": 0.16923952102661133, "epoch": 5.647098065376918, "step": 16930 }, { "epoch": 5.647098065376918, "ref_ce_loss": 0.04961796849966049, "step": 16930 }, { "epoch": 5.647098065376918, "loss": 0.29297712445259094, "step": 16930 }, { "ce_loss": 0.08377854526042938, "epoch": 5.647098065376918, "step": 16930 }, { "distill_loss": 0.12032924592494965, "epoch": 5.647098065376918, "step": 16930 }, { "epoch": 5.647098065376918, "ref_ce_loss": 0.06323269009590149, "step": 16930 }, { "epoch": 5.647098065376918, "loss": 0.40155941247940063, "step": 16930 }, { "ce_loss": 0.08552870154380798, "epoch": 5.647098065376918, "step": 16930 }, { "distill_loss": 0.1203310415148735, "epoch": 5.647098065376918, "step": 16930 }, { "epoch": 5.647098065376918, "ref_ce_loss": 0.1074351966381073, "step": 16930 }, { "epoch": 5.647098065376918, "loss": 0.3517897129058838, "step": 16930 }, { "ce_loss": 0.05377979949116707, "epoch": 5.647098065376918, "step": 16930 }, { "distill_loss": 0.1443416327238083, "epoch": 5.647098065376918, "step": 16930 }, { "epoch": 5.647098065376918, "ref_ce_loss": 0.08523765951395035, "step": 16930 }, { "epoch": 5.650433622414943, "loss": 0.3688, "step": 16940 }, { "epoch": 5.650433622414943, "grad_norm": 2.552321434020996, "step": 16940 }, { "epoch": 5.650433622414943, "learning_rate": 6.289748837333383e-05, "step": 16940 }, { "epoch": 5.650433622414943, "loss": 0.43256378173828125, "step": 16940 }, { "ce_loss": 0.07758132368326187, "epoch": 5.650433622414943, "step": 16940 }, { "distill_loss": 0.11776245385408401, "epoch": 5.650433622414943, "step": 16940 }, { "epoch": 5.650433622414943, "ref_ce_loss": 0.08114629238843918, "step": 16940 }, { "epoch": 5.650433622414943, "loss": 0.4100405275821686, "step": 16940 }, { "ce_loss": 0.11772602796554565, "epoch": 5.650433622414943, "step": 16940 }, { "distill_loss": 0.1447276473045349, "epoch": 5.650433622414943, "step": 16940 }, { "epoch": 5.650433622414943, "ref_ce_loss": 0.11454854905605316, "step": 16940 }, { "epoch": 5.650433622414943, "loss": 0.4397696256637573, "step": 16940 }, { "ce_loss": 0.1683981567621231, "epoch": 5.650433622414943, "step": 16940 }, { "distill_loss": 0.157260924577713, "epoch": 5.650433622414943, "step": 16940 }, { "epoch": 5.650433622414943, "ref_ce_loss": 0.08736620843410492, "step": 16940 }, { "epoch": 5.650433622414943, "loss": 0.38817211985588074, "step": 16940 }, { "ce_loss": 0.0835593119263649, "epoch": 5.650433622414943, "step": 16940 }, { "distill_loss": 0.13545547425746918, "epoch": 5.650433622414943, "step": 16940 }, { "epoch": 5.650433622414943, "ref_ce_loss": 0.08349437266588211, "step": 16940 }, { "epoch": 5.6537691794529685, "loss": 0.385, "step": 16950 }, { "epoch": 5.6537691794529685, "grad_norm": 3.4251179695129395, "step": 16950 }, { "epoch": 5.6537691794529685, "learning_rate": 6.273265662075142e-05, "step": 16950 }, { "epoch": 5.6537691794529685, "loss": 0.23143433034420013, "step": 16950 }, { "ce_loss": 0.05177433043718338, "epoch": 5.6537691794529685, "step": 16950 }, { "distill_loss": 0.08931882679462433, "epoch": 5.6537691794529685, "step": 16950 }, { "epoch": 5.6537691794529685, "ref_ce_loss": 0.06682384759187698, "step": 16950 }, { "epoch": 5.6537691794529685, "loss": 0.4629412293434143, "step": 16950 }, { "ce_loss": 0.11971709877252579, "epoch": 5.6537691794529685, "step": 16950 }, { "distill_loss": 0.1258133500814438, "epoch": 5.6537691794529685, "step": 16950 }, { "epoch": 5.6537691794529685, "ref_ce_loss": 0.07613131403923035, "step": 16950 }, { "epoch": 5.6537691794529685, "loss": 0.2683376669883728, "step": 16950 }, { "ce_loss": 0.05023961886763573, "epoch": 5.6537691794529685, "step": 16950 }, { "distill_loss": 0.13034687936306, "epoch": 5.6537691794529685, "step": 16950 }, { "epoch": 5.6537691794529685, "ref_ce_loss": 0.05619325861334801, "step": 16950 }, { "epoch": 5.6537691794529685, "loss": 0.2849885821342468, "step": 16950 }, { "ce_loss": 0.07378362864255905, "epoch": 5.6537691794529685, "step": 16950 }, { "distill_loss": 0.1290549486875534, "epoch": 5.6537691794529685, "step": 16950 }, { "epoch": 5.6537691794529685, "ref_ce_loss": 0.081927590072155, "step": 16950 }, { "epoch": 5.657104736490994, "loss": 0.3898, "step": 16960 }, { "epoch": 5.657104736490994, "grad_norm": 5.112725734710693, "step": 16960 }, { "epoch": 5.657104736490994, "learning_rate": 6.256798400937919e-05, "step": 16960 }, { "epoch": 5.657104736490994, "loss": 0.306372731924057, "step": 16960 }, { "ce_loss": 0.05546395480632782, "epoch": 5.657104736490994, "step": 16960 }, { "distill_loss": 0.11730388551950455, "epoch": 5.657104736490994, "step": 16960 }, { "epoch": 5.657104736490994, "ref_ce_loss": 0.08945717662572861, "step": 16960 }, { "epoch": 5.657104736490994, "loss": 0.35184144973754883, "step": 16960 }, { "ce_loss": 0.12297777831554413, "epoch": 5.657104736490994, "step": 16960 }, { "distill_loss": 0.13996382057666779, "epoch": 5.657104736490994, "step": 16960 }, { "epoch": 5.657104736490994, "ref_ce_loss": 0.07640225440263748, "step": 16960 }, { "epoch": 5.657104736490994, "loss": 0.4762941002845764, "step": 16960 }, { "ce_loss": 0.1554713249206543, "epoch": 5.657104736490994, "step": 16960 }, { "distill_loss": 0.1667623519897461, "epoch": 5.657104736490994, "step": 16960 }, { "epoch": 5.657104736490994, "ref_ce_loss": 0.07829466462135315, "step": 16960 }, { "epoch": 5.657104736490994, "loss": 0.557092547416687, "step": 16960 }, { "ce_loss": 0.07773231714963913, "epoch": 5.657104736490994, "step": 16960 }, { "distill_loss": 0.09895443171262741, "epoch": 5.657104736490994, "step": 16960 }, { "epoch": 5.657104736490994, "ref_ce_loss": 0.05536096170544624, "step": 16960 }, { "epoch": 5.660440293529019, "loss": 0.4094, "step": 16970 }, { "epoch": 5.660440293529019, "grad_norm": 3.074315071105957, "step": 16970 }, { "epoch": 5.660440293529019, "learning_rate": 6.240347083951498e-05, "step": 16970 }, { "epoch": 5.660440293529019, "loss": 0.3062307834625244, "step": 16970 }, { "ce_loss": 0.04686376824975014, "epoch": 5.660440293529019, "step": 16970 }, { "distill_loss": 0.12231691926717758, "epoch": 5.660440293529019, "step": 16970 }, { "epoch": 5.660440293529019, "ref_ce_loss": 0.06324928253889084, "step": 16970 }, { "epoch": 5.660440293529019, "loss": 0.38145822286605835, "step": 16970 }, { "ce_loss": 0.07497313618659973, "epoch": 5.660440293529019, "step": 16970 }, { "distill_loss": 0.10484956949949265, "epoch": 5.660440293529019, "step": 16970 }, { "epoch": 5.660440293529019, "ref_ce_loss": 0.0924435704946518, "step": 16970 }, { "epoch": 5.660440293529019, "loss": 0.25619828701019287, "step": 16970 }, { "ce_loss": 0.03664708137512207, "epoch": 5.660440293529019, "step": 16970 }, { "distill_loss": 0.10653015226125717, "epoch": 5.660440293529019, "step": 16970 }, { "epoch": 5.660440293529019, "ref_ce_loss": 0.06614420562982559, "step": 16970 }, { "epoch": 5.660440293529019, "loss": 0.21806739270687103, "step": 16970 }, { "ce_loss": 0.04098616912961006, "epoch": 5.660440293529019, "step": 16970 }, { "distill_loss": 0.10077305883169174, "epoch": 5.660440293529019, "step": 16970 }, { "epoch": 5.660440293529019, "ref_ce_loss": 0.05704665184020996, "step": 16970 }, { "epoch": 5.663775850567045, "loss": 0.3637, "step": 16980 }, { "epoch": 5.663775850567045, "grad_norm": 1.8063230514526367, "step": 16980 }, { "epoch": 5.663775850567045, "learning_rate": 6.223911741116595e-05, "step": 16980 }, { "epoch": 5.663775850567045, "loss": 0.6903234720230103, "step": 16980 }, { "ce_loss": 0.11006470024585724, "epoch": 5.663775850567045, "step": 16980 }, { "distill_loss": 0.14529787003993988, "epoch": 5.663775850567045, "step": 16980 }, { "epoch": 5.663775850567045, "ref_ce_loss": 0.10468210279941559, "step": 16980 }, { "epoch": 5.663775850567045, "loss": 0.349357932806015, "step": 16980 }, { "ce_loss": 0.056875284761190414, "epoch": 5.663775850567045, "step": 16980 }, { "distill_loss": 0.17770813405513763, "epoch": 5.663775850567045, "step": 16980 }, { "epoch": 5.663775850567045, "ref_ce_loss": 0.0949898213148117, "step": 16980 }, { "epoch": 5.663775850567045, "loss": 0.42283138632774353, "step": 16980 }, { "ce_loss": 0.12381055951118469, "epoch": 5.663775850567045, "step": 16980 }, { "distill_loss": 0.18214209377765656, "epoch": 5.663775850567045, "step": 16980 }, { "epoch": 5.663775850567045, "ref_ce_loss": 0.1167239099740982, "step": 16980 }, { "epoch": 5.663775850567045, "loss": 0.23594720661640167, "step": 16980 }, { "ce_loss": 0.04936056211590767, "epoch": 5.663775850567045, "step": 16980 }, { "distill_loss": 0.12174536287784576, "epoch": 5.663775850567045, "step": 16980 }, { "epoch": 5.663775850567045, "ref_ce_loss": 0.0527084581553936, "step": 16980 }, { "epoch": 5.66711140760507, "loss": 0.3661, "step": 16990 }, { "epoch": 5.66711140760507, "grad_norm": 3.128441572189331, "step": 16990 }, { "epoch": 5.66711140760507, "learning_rate": 6.207492402404793e-05, "step": 16990 }, { "epoch": 5.66711140760507, "loss": 0.24315060675144196, "step": 16990 }, { "ce_loss": 0.026851223781704903, "epoch": 5.66711140760507, "step": 16990 }, { "distill_loss": 0.09447737038135529, "epoch": 5.66711140760507, "step": 16990 }, { "epoch": 5.66711140760507, "ref_ce_loss": 0.0893881618976593, "step": 16990 }, { "epoch": 5.66711140760507, "loss": 0.3972874581813812, "step": 16990 }, { "ce_loss": 0.09030399471521378, "epoch": 5.66711140760507, "step": 16990 }, { "distill_loss": 0.16162776947021484, "epoch": 5.66711140760507, "step": 16990 }, { "epoch": 5.66711140760507, "ref_ce_loss": 0.08049967885017395, "step": 16990 }, { "epoch": 5.66711140760507, "loss": 0.33712124824523926, "step": 16990 }, { "ce_loss": 0.1013912633061409, "epoch": 5.66711140760507, "step": 16990 }, { "distill_loss": 0.13497503101825714, "epoch": 5.66711140760507, "step": 16990 }, { "epoch": 5.66711140760507, "ref_ce_loss": 0.08126402646303177, "step": 16990 }, { "epoch": 5.66711140760507, "loss": 0.28021666407585144, "step": 16990 }, { "ce_loss": 0.1139599084854126, "epoch": 5.66711140760507, "step": 16990 }, { "distill_loss": 0.10009388625621796, "epoch": 5.66711140760507, "step": 16990 }, { "epoch": 5.66711140760507, "ref_ce_loss": 0.06535064429044724, "step": 16990 }, { "epoch": 5.670446964643095, "loss": 0.3448, "step": 17000 }, { "epoch": 5.670446964643095, "grad_norm": 2.8193626403808594, "step": 17000 }, { "epoch": 5.670446964643095, "learning_rate": 6.191089097758485e-05, "step": 17000 }, { "epoch": 5.670446964643095, "loss": 0.3630388379096985, "step": 17000 }, { "ce_loss": 0.04292258620262146, "epoch": 5.670446964643095, "step": 17000 }, { "distill_loss": 0.17938470840454102, "epoch": 5.670446964643095, "step": 17000 }, { "epoch": 5.670446964643095, "ref_ce_loss": 0.06609540432691574, "step": 17000 }, { "epoch": 5.670446964643095, "loss": 0.26592352986335754, "step": 17000 }, { "ce_loss": 0.05733026936650276, "epoch": 5.670446964643095, "step": 17000 }, { "distill_loss": 0.1041751280426979, "epoch": 5.670446964643095, "step": 17000 }, { "epoch": 5.670446964643095, "ref_ce_loss": 0.08246053755283356, "step": 17000 }, { "epoch": 5.670446964643095, "loss": 0.30712056159973145, "step": 17000 }, { "ce_loss": 0.08909639716148376, "epoch": 5.670446964643095, "step": 17000 }, { "distill_loss": 0.15928807854652405, "epoch": 5.670446964643095, "step": 17000 }, { "epoch": 5.670446964643095, "ref_ce_loss": 0.058509256690740585, "step": 17000 }, { "epoch": 5.670446964643095, "loss": 0.3685610294342041, "step": 17000 }, { "ce_loss": 0.0719846785068512, "epoch": 5.670446964643095, "step": 17000 }, { "distill_loss": 0.19032107293605804, "epoch": 5.670446964643095, "step": 17000 }, { "epoch": 5.670446964643095, "ref_ce_loss": 0.06409589946269989, "step": 17000 }, { "epoch": 5.673782521681121, "loss": 0.3253, "step": 17010 }, { "epoch": 5.673782521681121, "grad_norm": 3.6958439350128174, "step": 17010 }, { "epoch": 5.673782521681121, "learning_rate": 6.174701857090838e-05, "step": 17010 }, { "epoch": 5.673782521681121, "loss": 0.3091316521167755, "step": 17010 }, { "ce_loss": 0.05044399946928024, "epoch": 5.673782521681121, "step": 17010 }, { "distill_loss": 0.11642865091562271, "epoch": 5.673782521681121, "step": 17010 }, { "epoch": 5.673782521681121, "ref_ce_loss": 0.10243427008390427, "step": 17010 }, { "epoch": 5.673782521681121, "loss": 0.3319244384765625, "step": 17010 }, { "ce_loss": 0.05720686912536621, "epoch": 5.673782521681121, "step": 17010 }, { "distill_loss": 0.1569894701242447, "epoch": 5.673782521681121, "step": 17010 }, { "epoch": 5.673782521681121, "ref_ce_loss": 0.08333617448806763, "step": 17010 }, { "epoch": 5.673782521681121, "loss": 0.3339140713214874, "step": 17010 }, { "ce_loss": 0.05138056352734566, "epoch": 5.673782521681121, "step": 17010 }, { "distill_loss": 0.1706089973449707, "epoch": 5.673782521681121, "step": 17010 }, { "epoch": 5.673782521681121, "ref_ce_loss": 0.06299076974391937, "step": 17010 }, { "epoch": 5.673782521681121, "loss": 0.7556455135345459, "step": 17010 }, { "ce_loss": 0.10137326270341873, "epoch": 5.673782521681121, "step": 17010 }, { "distill_loss": 0.14703679084777832, "epoch": 5.673782521681121, "step": 17010 }, { "epoch": 5.673782521681121, "ref_ce_loss": 0.08514633029699326, "step": 17010 }, { "epoch": 5.677118078719146, "loss": 0.3976, "step": 17020 }, { "epoch": 5.677118078719146, "grad_norm": 5.015145778656006, "step": 17020 }, { "epoch": 5.677118078719146, "learning_rate": 6.158330710285702e-05, "step": 17020 }, { "epoch": 5.677118078719146, "loss": 0.2708377242088318, "step": 17020 }, { "ce_loss": 0.03134353458881378, "epoch": 5.677118078719146, "step": 17020 }, { "distill_loss": 0.09554773569107056, "epoch": 5.677118078719146, "step": 17020 }, { "epoch": 5.677118078719146, "ref_ce_loss": 0.07234134525060654, "step": 17020 }, { "epoch": 5.677118078719146, "loss": 0.22705619037151337, "step": 17020 }, { "ce_loss": 0.024353666231036186, "epoch": 5.677118078719146, "step": 17020 }, { "distill_loss": 0.1192890852689743, "epoch": 5.677118078719146, "step": 17020 }, { "epoch": 5.677118078719146, "ref_ce_loss": 0.08326216042041779, "step": 17020 }, { "epoch": 5.677118078719146, "loss": 0.302390992641449, "step": 17020 }, { "ce_loss": 0.06740739196538925, "epoch": 5.677118078719146, "step": 17020 }, { "distill_loss": 0.10257144272327423, "epoch": 5.677118078719146, "step": 17020 }, { "epoch": 5.677118078719146, "ref_ce_loss": 0.10527370125055313, "step": 17020 }, { "epoch": 5.677118078719146, "loss": 0.4007715880870819, "step": 17020 }, { "ce_loss": 0.10588482767343521, "epoch": 5.677118078719146, "step": 17020 }, { "distill_loss": 0.22049292922019958, "epoch": 5.677118078719146, "step": 17020 }, { "epoch": 5.677118078719146, "ref_ce_loss": 0.07412932068109512, "step": 17020 }, { "epoch": 5.680453635757171, "loss": 0.3595, "step": 17030 }, { "epoch": 5.680453635757171, "grad_norm": 2.110219717025757, "step": 17030 }, { "epoch": 5.680453635757171, "learning_rate": 6.141975687197596e-05, "step": 17030 }, { "epoch": 5.680453635757171, "loss": 0.9901852607727051, "step": 17030 }, { "ce_loss": 0.06740203499794006, "epoch": 5.680453635757171, "step": 17030 }, { "distill_loss": 0.14835187792778015, "epoch": 5.680453635757171, "step": 17030 }, { "epoch": 5.680453635757171, "ref_ce_loss": 0.048046503216028214, "step": 17030 }, { "epoch": 5.680453635757171, "loss": 1.149698257446289, "step": 17030 }, { "ce_loss": 0.09891572594642639, "epoch": 5.680453635757171, "step": 17030 }, { "distill_loss": 0.12695176899433136, "epoch": 5.680453635757171, "step": 17030 }, { "epoch": 5.680453635757171, "ref_ce_loss": 0.07718627154827118, "step": 17030 }, { "epoch": 5.680453635757171, "loss": 0.485831618309021, "step": 17030 }, { "ce_loss": 0.1584722250699997, "epoch": 5.680453635757171, "step": 17030 }, { "distill_loss": 0.1864461898803711, "epoch": 5.680453635757171, "step": 17030 }, { "epoch": 5.680453635757171, "ref_ce_loss": 0.11007975041866302, "step": 17030 }, { "epoch": 5.680453635757171, "loss": 0.3302626311779022, "step": 17030 }, { "ce_loss": 0.10280175507068634, "epoch": 5.680453635757171, "step": 17030 }, { "distill_loss": 0.124929279088974, "epoch": 5.680453635757171, "step": 17030 }, { "epoch": 5.680453635757171, "ref_ce_loss": 0.07476763427257538, "step": 17030 }, { "epoch": 5.683789192795197, "loss": 0.3904, "step": 17040 }, { "epoch": 5.683789192795197, "grad_norm": 2.1810994148254395, "step": 17040 }, { "epoch": 5.683789192795197, "learning_rate": 6.125636817651632e-05, "step": 17040 }, { "epoch": 5.683789192795197, "loss": 0.22979770600795746, "step": 17040 }, { "ce_loss": 0.04653044044971466, "epoch": 5.683789192795197, "step": 17040 }, { "distill_loss": 0.13956502079963684, "epoch": 5.683789192795197, "step": 17040 }, { "epoch": 5.683789192795197, "ref_ce_loss": 0.04300351068377495, "step": 17040 }, { "epoch": 5.683789192795197, "loss": 0.2592867314815521, "step": 17040 }, { "ce_loss": 0.06638824939727783, "epoch": 5.683789192795197, "step": 17040 }, { "distill_loss": 0.12841610610485077, "epoch": 5.683789192795197, "step": 17040 }, { "epoch": 5.683789192795197, "ref_ce_loss": 0.06432758271694183, "step": 17040 }, { "epoch": 5.683789192795197, "loss": 0.5075886249542236, "step": 17040 }, { "ce_loss": 0.11375562846660614, "epoch": 5.683789192795197, "step": 17040 }, { "distill_loss": 0.16561579704284668, "epoch": 5.683789192795197, "step": 17040 }, { "epoch": 5.683789192795197, "ref_ce_loss": 0.08213319629430771, "step": 17040 }, { "epoch": 5.683789192795197, "loss": 0.5379311442375183, "step": 17040 }, { "ce_loss": 0.1698356419801712, "epoch": 5.683789192795197, "step": 17040 }, { "distill_loss": 0.1929055154323578, "epoch": 5.683789192795197, "step": 17040 }, { "epoch": 5.683789192795197, "ref_ce_loss": 0.09995917230844498, "step": 17040 }, { "epoch": 5.687124749833222, "loss": 0.3788, "step": 17050 }, { "epoch": 5.687124749833222, "grad_norm": 3.6914074420928955, "step": 17050 }, { "epoch": 5.687124749833222, "learning_rate": 6.109314131443462e-05, "step": 17050 }, { "epoch": 5.687124749833222, "loss": 0.30641597509384155, "step": 17050 }, { "ce_loss": 0.06863492727279663, "epoch": 5.687124749833222, "step": 17050 }, { "distill_loss": 0.12574729323387146, "epoch": 5.687124749833222, "step": 17050 }, { "epoch": 5.687124749833222, "ref_ce_loss": 0.04029463976621628, "step": 17050 }, { "epoch": 5.687124749833222, "loss": 0.2634451389312744, "step": 17050 }, { "ce_loss": 0.0588807575404644, "epoch": 5.687124749833222, "step": 17050 }, { "distill_loss": 0.11593613773584366, "epoch": 5.687124749833222, "step": 17050 }, { "epoch": 5.687124749833222, "ref_ce_loss": 0.08846764266490936, "step": 17050 }, { "epoch": 5.687124749833222, "loss": 0.3977257311344147, "step": 17050 }, { "ce_loss": 0.14133258163928986, "epoch": 5.687124749833222, "step": 17050 }, { "distill_loss": 0.14880916476249695, "epoch": 5.687124749833222, "step": 17050 }, { "epoch": 5.687124749833222, "ref_ce_loss": 0.09101521223783493, "step": 17050 }, { "epoch": 5.687124749833222, "loss": 0.49362966418266296, "step": 17050 }, { "ce_loss": 0.05905555561184883, "epoch": 5.687124749833222, "step": 17050 }, { "distill_loss": 0.14283868670463562, "epoch": 5.687124749833222, "step": 17050 }, { "epoch": 5.687124749833222, "ref_ce_loss": 0.06556601077318192, "step": 17050 }, { "epoch": 5.690460306871247, "loss": 0.3621, "step": 17060 }, { "epoch": 5.690460306871247, "grad_norm": 2.039789915084839, "step": 17060 }, { "epoch": 5.690460306871247, "learning_rate": 6.0930076583392305e-05, "step": 17060 }, { "epoch": 5.690460306871247, "loss": 0.3623387813568115, "step": 17060 }, { "ce_loss": 0.06750385463237762, "epoch": 5.690460306871247, "step": 17060 }, { "distill_loss": 0.10620886087417603, "epoch": 5.690460306871247, "step": 17060 }, { "epoch": 5.690460306871247, "ref_ce_loss": 0.06084444001317024, "step": 17060 }, { "epoch": 5.690460306871247, "loss": 0.2275630533695221, "step": 17060 }, { "ce_loss": 0.04591386765241623, "epoch": 5.690460306871247, "step": 17060 }, { "distill_loss": 0.10855705291032791, "epoch": 5.690460306871247, "step": 17060 }, { "epoch": 5.690460306871247, "ref_ce_loss": 0.0493580587208271, "step": 17060 }, { "epoch": 5.690460306871247, "loss": 0.2691376805305481, "step": 17060 }, { "ce_loss": 0.03460320085287094, "epoch": 5.690460306871247, "step": 17060 }, { "distill_loss": 0.10596905648708344, "epoch": 5.690460306871247, "step": 17060 }, { "epoch": 5.690460306871247, "ref_ce_loss": 0.06285975873470306, "step": 17060 }, { "epoch": 5.690460306871247, "loss": 0.32650092244148254, "step": 17060 }, { "ce_loss": 0.06436863541603088, "epoch": 5.690460306871247, "step": 17060 }, { "distill_loss": 0.1277579665184021, "epoch": 5.690460306871247, "step": 17060 }, { "epoch": 5.690460306871247, "ref_ce_loss": 0.09064862132072449, "step": 17060 }, { "epoch": 5.693795863909273, "loss": 0.3159, "step": 17070 }, { "epoch": 5.693795863909273, "grad_norm": 2.3171262741088867, "step": 17070 }, { "epoch": 5.693795863909273, "learning_rate": 6.076717428075505e-05, "step": 17070 }, { "epoch": 5.693795863909273, "loss": 0.2595212161540985, "step": 17070 }, { "ce_loss": 0.06113705039024353, "epoch": 5.693795863909273, "step": 17070 }, { "distill_loss": 0.11590202152729034, "epoch": 5.693795863909273, "step": 17070 }, { "epoch": 5.693795863909273, "ref_ce_loss": 0.0593080148100853, "step": 17070 }, { "epoch": 5.693795863909273, "loss": 0.5655686855316162, "step": 17070 }, { "ce_loss": 0.05499016121029854, "epoch": 5.693795863909273, "step": 17070 }, { "distill_loss": 0.17960451543331146, "epoch": 5.693795863909273, "step": 17070 }, { "epoch": 5.693795863909273, "ref_ce_loss": 0.06927433609962463, "step": 17070 }, { "epoch": 5.693795863909273, "loss": 0.4495111107826233, "step": 17070 }, { "ce_loss": 0.12065106630325317, "epoch": 5.693795863909273, "step": 17070 }, { "distill_loss": 0.1322806179523468, "epoch": 5.693795863909273, "step": 17070 }, { "epoch": 5.693795863909273, "ref_ce_loss": 0.12023723125457764, "step": 17070 }, { "epoch": 5.693795863909273, "loss": 0.31495094299316406, "step": 17070 }, { "ce_loss": 0.07359825074672699, "epoch": 5.693795863909273, "step": 17070 }, { "distill_loss": 0.11552683264017105, "epoch": 5.693795863909273, "step": 17070 }, { "epoch": 5.693795863909273, "ref_ce_loss": 0.09320148080587387, "step": 17070 }, { "epoch": 5.697131420947298, "loss": 0.3576, "step": 17080 }, { "epoch": 5.697131420947298, "grad_norm": 2.8441379070281982, "step": 17080 }, { "epoch": 5.697131420947298, "learning_rate": 6.060443470359243e-05, "step": 17080 }, { "epoch": 5.697131420947298, "loss": 0.18535450100898743, "step": 17080 }, { "ce_loss": 0.014140546321868896, "epoch": 5.697131420947298, "step": 17080 }, { "distill_loss": 0.10060036182403564, "epoch": 5.697131420947298, "step": 17080 }, { "epoch": 5.697131420947298, "ref_ce_loss": 0.05045443773269653, "step": 17080 }, { "epoch": 5.697131420947298, "loss": 0.23556338250637054, "step": 17080 }, { "ce_loss": 0.0404030904173851, "epoch": 5.697131420947298, "step": 17080 }, { "distill_loss": 0.11873391270637512, "epoch": 5.697131420947298, "step": 17080 }, { "epoch": 5.697131420947298, "ref_ce_loss": 0.052613597363233566, "step": 17080 }, { "epoch": 5.697131420947298, "loss": 0.3786413371562958, "step": 17080 }, { "ce_loss": 0.05384358391165733, "epoch": 5.697131420947298, "step": 17080 }, { "distill_loss": 0.08683878928422928, "epoch": 5.697131420947298, "step": 17080 }, { "epoch": 5.697131420947298, "ref_ce_loss": 0.07605697959661484, "step": 17080 }, { "epoch": 5.697131420947298, "loss": 0.3993130326271057, "step": 17080 }, { "ce_loss": 0.05315512791275978, "epoch": 5.697131420947298, "step": 17080 }, { "distill_loss": 0.1281694769859314, "epoch": 5.697131420947298, "step": 17080 }, { "epoch": 5.697131420947298, "ref_ce_loss": 0.07677015662193298, "step": 17080 }, { "epoch": 5.700466977985323, "loss": 0.3881, "step": 17090 }, { "epoch": 5.700466977985323, "grad_norm": 6.708038806915283, "step": 17090 }, { "epoch": 5.700466977985323, "learning_rate": 6.0441858148677274e-05, "step": 17090 }, { "epoch": 5.700466977985323, "loss": 0.4696926176548004, "step": 17090 }, { "ce_loss": 0.1001918688416481, "epoch": 5.700466977985323, "step": 17090 }, { "distill_loss": 0.21125702559947968, "epoch": 5.700466977985323, "step": 17090 }, { "epoch": 5.700466977985323, "ref_ce_loss": 0.05627693980932236, "step": 17090 }, { "epoch": 5.700466977985323, "loss": 0.7283337116241455, "step": 17090 }, { "ce_loss": 0.1716901659965515, "epoch": 5.700466977985323, "step": 17090 }, { "distill_loss": 0.171518474817276, "epoch": 5.700466977985323, "step": 17090 }, { "epoch": 5.700466977985323, "ref_ce_loss": 0.05830475315451622, "step": 17090 }, { "epoch": 5.700466977985323, "loss": 0.2751966714859009, "step": 17090 }, { "ce_loss": 0.039496466517448425, "epoch": 5.700466977985323, "step": 17090 }, { "distill_loss": 0.14405354857444763, "epoch": 5.700466977985323, "step": 17090 }, { "epoch": 5.700466977985323, "ref_ce_loss": 0.059192027896642685, "step": 17090 }, { "epoch": 5.700466977985323, "loss": 0.28596171736717224, "step": 17090 }, { "ce_loss": 0.0590527206659317, "epoch": 5.700466977985323, "step": 17090 }, { "distill_loss": 0.10830798745155334, "epoch": 5.700466977985323, "step": 17090 }, { "epoch": 5.700466977985323, "ref_ce_loss": 0.06655246019363403, "step": 17090 }, { "epoch": 5.703802535023349, "loss": 0.3464, "step": 17100 }, { "epoch": 5.703802535023349, "grad_norm": 3.195554256439209, "step": 17100 }, { "epoch": 5.703802535023349, "learning_rate": 6.027944491248502e-05, "step": 17100 }, { "epoch": 5.703802535023349, "loss": 0.4235726594924927, "step": 17100 }, { "ce_loss": 0.07542421668767929, "epoch": 5.703802535023349, "step": 17100 }, { "distill_loss": 0.11276789754629135, "epoch": 5.703802535023349, "step": 17100 }, { "epoch": 5.703802535023349, "ref_ce_loss": 0.05181581526994705, "step": 17100 }, { "epoch": 5.703802535023349, "loss": 0.26524561643600464, "step": 17100 }, { "ce_loss": 0.021398169919848442, "epoch": 5.703802535023349, "step": 17100 }, { "distill_loss": 0.12772291898727417, "epoch": 5.703802535023349, "step": 17100 }, { "epoch": 5.703802535023349, "ref_ce_loss": 0.0794157013297081, "step": 17100 }, { "epoch": 5.703802535023349, "loss": 0.5993280410766602, "step": 17100 }, { "ce_loss": 0.1594715267419815, "epoch": 5.703802535023349, "step": 17100 }, { "distill_loss": 0.15677529573440552, "epoch": 5.703802535023349, "step": 17100 }, { "epoch": 5.703802535023349, "ref_ce_loss": 0.11099915206432343, "step": 17100 }, { "epoch": 5.703802535023349, "loss": 0.2803640365600586, "step": 17100 }, { "ce_loss": 0.039205145090818405, "epoch": 5.703802535023349, "step": 17100 }, { "distill_loss": 0.11149227619171143, "epoch": 5.703802535023349, "step": 17100 }, { "epoch": 5.703802535023349, "ref_ce_loss": 0.09155328571796417, "step": 17100 }, { "epoch": 5.707138092061374, "loss": 0.3497, "step": 17110 }, { "epoch": 5.707138092061374, "grad_norm": 1.5642236471176147, "step": 17110 }, { "epoch": 5.707138092061374, "learning_rate": 6.011719529119337e-05, "step": 17110 }, { "epoch": 5.707138092061374, "loss": 0.3479704260826111, "step": 17110 }, { "ce_loss": 0.06622432172298431, "epoch": 5.707138092061374, "step": 17110 }, { "distill_loss": 0.11067965626716614, "epoch": 5.707138092061374, "step": 17110 }, { "epoch": 5.707138092061374, "ref_ce_loss": 0.09841424971818924, "step": 17110 }, { "epoch": 5.707138092061374, "loss": 0.30340105295181274, "step": 17110 }, { "ce_loss": 0.0672907829284668, "epoch": 5.707138092061374, "step": 17110 }, { "distill_loss": 0.12754839658737183, "epoch": 5.707138092061374, "step": 17110 }, { "epoch": 5.707138092061374, "ref_ce_loss": 0.0743776187300682, "step": 17110 }, { "epoch": 5.707138092061374, "loss": 0.4672259986400604, "step": 17110 }, { "ce_loss": 0.11600927263498306, "epoch": 5.707138092061374, "step": 17110 }, { "distill_loss": 0.16114374995231628, "epoch": 5.707138092061374, "step": 17110 }, { "epoch": 5.707138092061374, "ref_ce_loss": 0.08801127225160599, "step": 17110 }, { "epoch": 5.707138092061374, "loss": 0.4922545254230499, "step": 17110 }, { "ce_loss": 0.1250251680612564, "epoch": 5.707138092061374, "step": 17110 }, { "distill_loss": 0.16420485079288483, "epoch": 5.707138092061374, "step": 17110 }, { "epoch": 5.707138092061374, "ref_ce_loss": 0.09376829862594604, "step": 17110 }, { "epoch": 5.7104736490993995, "loss": 0.37, "step": 17120 }, { "epoch": 5.7104736490993995, "grad_norm": 2.751373052597046, "step": 17120 }, { "epoch": 5.7104736490993995, "learning_rate": 5.995510958068162e-05, "step": 17120 }, { "epoch": 5.7104736490993995, "loss": 0.3050478398799896, "step": 17120 }, { "ce_loss": 0.07524151355028152, "epoch": 5.7104736490993995, "step": 17120 }, { "distill_loss": 0.14798586070537567, "epoch": 5.7104736490993995, "step": 17120 }, { "epoch": 5.7104736490993995, "ref_ce_loss": 0.06977467238903046, "step": 17120 }, { "epoch": 5.7104736490993995, "loss": 0.2592675983905792, "step": 17120 }, { "ce_loss": 0.043805621564388275, "epoch": 5.7104736490993995, "step": 17120 }, { "distill_loss": 0.11673689633607864, "epoch": 5.7104736490993995, "step": 17120 }, { "epoch": 5.7104736490993995, "ref_ce_loss": 0.07951228320598602, "step": 17120 }, { "epoch": 5.7104736490993995, "loss": 0.27545806765556335, "step": 17120 }, { "ce_loss": 0.02748233452439308, "epoch": 5.7104736490993995, "step": 17120 }, { "distill_loss": 0.09332962334156036, "epoch": 5.7104736490993995, "step": 17120 }, { "epoch": 5.7104736490993995, "ref_ce_loss": 0.05149763822555542, "step": 17120 }, { "epoch": 5.7104736490993995, "loss": 0.22703437507152557, "step": 17120 }, { "ce_loss": 0.02626241184771061, "epoch": 5.7104736490993995, "step": 17120 }, { "distill_loss": 0.1262034773826599, "epoch": 5.7104736490993995, "step": 17120 }, { "epoch": 5.7104736490993995, "ref_ce_loss": 0.07425781339406967, "step": 17120 }, { "epoch": 5.713809206137425, "loss": 0.311, "step": 17130 }, { "epoch": 5.713809206137425, "grad_norm": 3.511167526245117, "step": 17130 }, { "epoch": 5.713809206137425, "learning_rate": 5.979318807653019e-05, "step": 17130 }, { "epoch": 5.713809206137425, "loss": 0.1948380023241043, "step": 17130 }, { "ce_loss": 0.04529014602303505, "epoch": 5.713809206137425, "step": 17130 }, { "distill_loss": 0.1010599136352539, "epoch": 5.713809206137425, "step": 17130 }, { "epoch": 5.713809206137425, "ref_ce_loss": 0.0482073649764061, "step": 17130 }, { "epoch": 5.713809206137425, "loss": 0.23362573981285095, "step": 17130 }, { "ce_loss": 0.043237727135419846, "epoch": 5.713809206137425, "step": 17130 }, { "distill_loss": 0.1262238621711731, "epoch": 5.713809206137425, "step": 17130 }, { "epoch": 5.713809206137425, "ref_ce_loss": 0.0638972595334053, "step": 17130 }, { "epoch": 5.713809206137425, "loss": 0.2678848206996918, "step": 17130 }, { "ce_loss": 0.03875793516635895, "epoch": 5.713809206137425, "step": 17130 }, { "distill_loss": 0.1362183392047882, "epoch": 5.713809206137425, "step": 17130 }, { "epoch": 5.713809206137425, "ref_ce_loss": 0.09272968024015427, "step": 17130 }, { "epoch": 5.713809206137425, "loss": 0.28923311829566956, "step": 17130 }, { "ce_loss": 0.07208915799856186, "epoch": 5.713809206137425, "step": 17130 }, { "distill_loss": 0.13289576768875122, "epoch": 5.713809206137425, "step": 17130 }, { "epoch": 5.713809206137425, "ref_ce_loss": 0.06253129988908768, "step": 17130 }, { "epoch": 5.71714476317545, "loss": 0.33, "step": 17140 }, { "epoch": 5.71714476317545, "grad_norm": 2.24771785736084, "step": 17140 }, { "epoch": 5.71714476317545, "learning_rate": 5.963143107402007e-05, "step": 17140 }, { "epoch": 5.71714476317545, "loss": 0.2893480956554413, "step": 17140 }, { "ce_loss": 0.06652390211820602, "epoch": 5.71714476317545, "step": 17140 }, { "distill_loss": 0.14231842756271362, "epoch": 5.71714476317545, "step": 17140 }, { "epoch": 5.71714476317545, "ref_ce_loss": 0.08033139258623123, "step": 17140 }, { "epoch": 5.71714476317545, "loss": 0.40271690487861633, "step": 17140 }, { "ce_loss": 0.040531329810619354, "epoch": 5.71714476317545, "step": 17140 }, { "distill_loss": 0.1344706416130066, "epoch": 5.71714476317545, "step": 17140 }, { "epoch": 5.71714476317545, "ref_ce_loss": 0.10532485693693161, "step": 17140 }, { "epoch": 5.71714476317545, "loss": 0.2142089456319809, "step": 17140 }, { "ce_loss": 0.03037523478269577, "epoch": 5.71714476317545, "step": 17140 }, { "distill_loss": 0.09983956813812256, "epoch": 5.71714476317545, "step": 17140 }, { "epoch": 5.71714476317545, "ref_ce_loss": 0.05833209678530693, "step": 17140 }, { "epoch": 5.71714476317545, "loss": 0.4400641918182373, "step": 17140 }, { "ce_loss": 0.06064482778310776, "epoch": 5.71714476317545, "step": 17140 }, { "distill_loss": 0.12813520431518555, "epoch": 5.71714476317545, "step": 17140 }, { "epoch": 5.71714476317545, "ref_ce_loss": 0.044088393449783325, "step": 17140 }, { "epoch": 5.7204803202134755, "loss": 0.3816, "step": 17150 }, { "epoch": 5.7204803202134755, "grad_norm": 2.1068670749664307, "step": 17150 }, { "epoch": 5.7204803202134755, "learning_rate": 5.946983886813216e-05, "step": 17150 }, { "epoch": 5.7204803202134755, "loss": 0.3585885763168335, "step": 17150 }, { "ce_loss": 0.0682455524802208, "epoch": 5.7204803202134755, "step": 17150 }, { "distill_loss": 0.17169511318206787, "epoch": 5.7204803202134755, "step": 17150 }, { "epoch": 5.7204803202134755, "ref_ce_loss": 0.08379658311605453, "step": 17150 }, { "epoch": 5.7204803202134755, "loss": 0.342392235994339, "step": 17150 }, { "ce_loss": 0.03174396976828575, "epoch": 5.7204803202134755, "step": 17150 }, { "distill_loss": 0.14371605217456818, "epoch": 5.7204803202134755, "step": 17150 }, { "epoch": 5.7204803202134755, "ref_ce_loss": 0.09976787120103836, "step": 17150 }, { "epoch": 5.7204803202134755, "loss": 0.31579867005348206, "step": 17150 }, { "ce_loss": 0.06596563011407852, "epoch": 5.7204803202134755, "step": 17150 }, { "distill_loss": 0.12314214557409286, "epoch": 5.7204803202134755, "step": 17150 }, { "epoch": 5.7204803202134755, "ref_ce_loss": 0.06553999334573746, "step": 17150 }, { "epoch": 5.7204803202134755, "loss": 0.615726113319397, "step": 17150 }, { "ce_loss": 0.1368461549282074, "epoch": 5.7204803202134755, "step": 17150 }, { "distill_loss": 0.14078648388385773, "epoch": 5.7204803202134755, "step": 17150 }, { "epoch": 5.7204803202134755, "ref_ce_loss": 0.07783728837966919, "step": 17150 }, { "epoch": 5.723815877251501, "loss": 0.349, "step": 17160 }, { "epoch": 5.723815877251501, "grad_norm": 1.7467983961105347, "step": 17160 }, { "epoch": 5.723815877251501, "learning_rate": 5.930841175354689e-05, "step": 17160 }, { "epoch": 5.723815877251501, "loss": 0.2381708174943924, "step": 17160 }, { "ce_loss": 0.041369061917066574, "epoch": 5.723815877251501, "step": 17160 }, { "distill_loss": 0.1097210943698883, "epoch": 5.723815877251501, "step": 17160 }, { "epoch": 5.723815877251501, "ref_ce_loss": 0.046333197504282, "step": 17160 }, { "epoch": 5.723815877251501, "loss": 0.2818985879421234, "step": 17160 }, { "ce_loss": 0.05317312851548195, "epoch": 5.723815877251501, "step": 17160 }, { "distill_loss": 0.13140282034873962, "epoch": 5.723815877251501, "step": 17160 }, { "epoch": 5.723815877251501, "ref_ce_loss": 0.07650244235992432, "step": 17160 }, { "epoch": 5.723815877251501, "loss": 0.2387668788433075, "step": 17160 }, { "ce_loss": 0.036102037876844406, "epoch": 5.723815877251501, "step": 17160 }, { "distill_loss": 0.10024573653936386, "epoch": 5.723815877251501, "step": 17160 }, { "epoch": 5.723815877251501, "ref_ce_loss": 0.06697791069746017, "step": 17160 }, { "epoch": 5.723815877251501, "loss": 0.35411015152931213, "step": 17160 }, { "ce_loss": 0.12947840988636017, "epoch": 5.723815877251501, "step": 17160 }, { "distill_loss": 0.11806891113519669, "epoch": 5.723815877251501, "step": 17160 }, { "epoch": 5.723815877251501, "ref_ce_loss": 0.09559057652950287, "step": 17160 }, { "epoch": 5.727151434289526, "loss": 0.3435, "step": 17170 }, { "epoch": 5.727151434289526, "grad_norm": 2.8510990142822266, "step": 17170 }, { "epoch": 5.727151434289526, "learning_rate": 5.914715002464368e-05, "step": 17170 }, { "epoch": 5.727151434289526, "loss": 0.376874178647995, "step": 17170 }, { "ce_loss": 0.10144776105880737, "epoch": 5.727151434289526, "step": 17170 }, { "distill_loss": 0.10674238950014114, "epoch": 5.727151434289526, "step": 17170 }, { "epoch": 5.727151434289526, "ref_ce_loss": 0.08582198619842529, "step": 17170 }, { "epoch": 5.727151434289526, "loss": 0.4589444100856781, "step": 17170 }, { "ce_loss": 0.07079615443944931, "epoch": 5.727151434289526, "step": 17170 }, { "distill_loss": 0.14036592841148376, "epoch": 5.727151434289526, "step": 17170 }, { "epoch": 5.727151434289526, "ref_ce_loss": 0.09123189002275467, "step": 17170 }, { "epoch": 5.727151434289526, "loss": 0.30794355273246765, "step": 17170 }, { "ce_loss": 0.05897359549999237, "epoch": 5.727151434289526, "step": 17170 }, { "distill_loss": 0.10716037452220917, "epoch": 5.727151434289526, "step": 17170 }, { "epoch": 5.727151434289526, "ref_ce_loss": 0.07225260138511658, "step": 17170 }, { "epoch": 5.727151434289526, "loss": 0.24795758724212646, "step": 17170 }, { "ce_loss": 0.06725875288248062, "epoch": 5.727151434289526, "step": 17170 }, { "distill_loss": 0.1040724664926529, "epoch": 5.727151434289526, "step": 17170 }, { "epoch": 5.727151434289526, "ref_ce_loss": 0.07633961737155914, "step": 17170 }, { "epoch": 5.730486991327552, "loss": 0.3443, "step": 17180 }, { "epoch": 5.730486991327552, "grad_norm": 3.2798383235931396, "step": 17180 }, { "epoch": 5.730486991327552, "learning_rate": 5.8986053975500306e-05, "step": 17180 }, { "epoch": 5.730486991327552, "loss": 0.30777508020401, "step": 17180 }, { "ce_loss": 0.06996277719736099, "epoch": 5.730486991327552, "step": 17180 }, { "distill_loss": 0.13840298354625702, "epoch": 5.730486991327552, "step": 17180 }, { "epoch": 5.730486991327552, "ref_ce_loss": 0.056659650057554245, "step": 17180 }, { "epoch": 5.730486991327552, "loss": 0.29943349957466125, "step": 17180 }, { "ce_loss": 0.019586697220802307, "epoch": 5.730486991327552, "step": 17180 }, { "distill_loss": 0.09618957340717316, "epoch": 5.730486991327552, "step": 17180 }, { "epoch": 5.730486991327552, "ref_ce_loss": 0.05273442342877388, "step": 17180 }, { "epoch": 5.730486991327552, "loss": 0.3717377781867981, "step": 17180 }, { "ce_loss": 0.14365200698375702, "epoch": 5.730486991327552, "step": 17180 }, { "distill_loss": 0.15098531544208527, "epoch": 5.730486991327552, "step": 17180 }, { "epoch": 5.730486991327552, "ref_ce_loss": 0.0676068514585495, "step": 17180 }, { "epoch": 5.730486991327552, "loss": 0.3774576485157013, "step": 17180 }, { "ce_loss": 0.12183333188295364, "epoch": 5.730486991327552, "step": 17180 }, { "distill_loss": 0.15439927577972412, "epoch": 5.730486991327552, "step": 17180 }, { "epoch": 5.730486991327552, "ref_ce_loss": 0.07511351257562637, "step": 17180 }, { "epoch": 5.733822548365577, "loss": 0.3472, "step": 17190 }, { "epoch": 5.733822548365577, "grad_norm": 3.112159490585327, "step": 17190 }, { "epoch": 5.733822548365577, "learning_rate": 5.882512389989244e-05, "step": 17190 }, { "epoch": 5.733822548365577, "loss": 0.4421817660331726, "step": 17190 }, { "ce_loss": 0.08099653571844101, "epoch": 5.733822548365577, "step": 17190 }, { "distill_loss": 0.20904618501663208, "epoch": 5.733822548365577, "step": 17190 }, { "epoch": 5.733822548365577, "ref_ce_loss": 0.05432404205203056, "step": 17190 }, { "epoch": 5.733822548365577, "loss": 0.5608204007148743, "step": 17190 }, { "ce_loss": 0.09235455095767975, "epoch": 5.733822548365577, "step": 17190 }, { "distill_loss": 0.20927207171916962, "epoch": 5.733822548365577, "step": 17190 }, { "epoch": 5.733822548365577, "ref_ce_loss": 0.10277006030082703, "step": 17190 }, { "epoch": 5.733822548365577, "loss": 0.2641429305076599, "step": 17190 }, { "ce_loss": 0.08527445793151855, "epoch": 5.733822548365577, "step": 17190 }, { "distill_loss": 0.12051382660865784, "epoch": 5.733822548365577, "step": 17190 }, { "epoch": 5.733822548365577, "ref_ce_loss": 0.044529445469379425, "step": 17190 }, { "epoch": 5.733822548365577, "loss": 0.7164769172668457, "step": 17190 }, { "ce_loss": 0.11412998288869858, "epoch": 5.733822548365577, "step": 17190 }, { "distill_loss": 0.16576185822486877, "epoch": 5.733822548365577, "step": 17190 }, { "epoch": 5.733822548365577, "ref_ce_loss": 0.1029839813709259, "step": 17190 }, { "epoch": 5.737158105403602, "loss": 0.3407, "step": 17200 }, { "epoch": 5.737158105403602, "grad_norm": 2.1736953258514404, "step": 17200 }, { "epoch": 5.737158105403602, "learning_rate": 5.866436009129299e-05, "step": 17200 }, { "epoch": 5.737158105403602, "loss": 0.40977779030799866, "step": 17200 }, { "ce_loss": 0.0977315753698349, "epoch": 5.737158105403602, "step": 17200 }, { "distill_loss": 0.14270806312561035, "epoch": 5.737158105403602, "step": 17200 }, { "epoch": 5.737158105403602, "ref_ce_loss": 0.04262208193540573, "step": 17200 }, { "epoch": 5.737158105403602, "loss": 0.176670104265213, "step": 17200 }, { "ce_loss": 0.04305120185017586, "epoch": 5.737158105403602, "step": 17200 }, { "distill_loss": 0.09481915831565857, "epoch": 5.737158105403602, "step": 17200 }, { "epoch": 5.737158105403602, "ref_ce_loss": 0.038723744451999664, "step": 17200 }, { "epoch": 5.737158105403602, "loss": 0.36046162247657776, "step": 17200 }, { "ce_loss": 0.05129627510905266, "epoch": 5.737158105403602, "step": 17200 }, { "distill_loss": 0.1609860509634018, "epoch": 5.737158105403602, "step": 17200 }, { "epoch": 5.737158105403602, "ref_ce_loss": 0.06556078791618347, "step": 17200 }, { "epoch": 5.737158105403602, "loss": 0.329052209854126, "step": 17200 }, { "ce_loss": 0.023697247728705406, "epoch": 5.737158105403602, "step": 17200 }, { "distill_loss": 0.20539245009422302, "epoch": 5.737158105403602, "step": 17200 }, { "epoch": 5.737158105403602, "ref_ce_loss": 0.06529682129621506, "step": 17200 }, { "epoch": 5.740493662441628, "loss": 0.3736, "step": 17210 }, { "epoch": 5.740493662441628, "grad_norm": 1.4739378690719604, "step": 17210 }, { "epoch": 5.740493662441628, "learning_rate": 5.850376284287177e-05, "step": 17210 }, { "epoch": 5.740493662441628, "loss": 0.3846653401851654, "step": 17210 }, { "ce_loss": 0.009561138227581978, "epoch": 5.740493662441628, "step": 17210 }, { "distill_loss": 0.11265572905540466, "epoch": 5.740493662441628, "step": 17210 }, { "epoch": 5.740493662441628, "ref_ce_loss": 0.07206114381551743, "step": 17210 }, { "epoch": 5.740493662441628, "loss": 0.27818945050239563, "step": 17210 }, { "ce_loss": 0.04575086012482643, "epoch": 5.740493662441628, "step": 17210 }, { "distill_loss": 0.12364290654659271, "epoch": 5.740493662441628, "step": 17210 }, { "epoch": 5.740493662441628, "ref_ce_loss": 0.046676069498062134, "step": 17210 }, { "epoch": 5.740493662441628, "loss": 0.31520622968673706, "step": 17210 }, { "ce_loss": 0.03989662602543831, "epoch": 5.740493662441628, "step": 17210 }, { "distill_loss": 0.184968501329422, "epoch": 5.740493662441628, "step": 17210 }, { "epoch": 5.740493662441628, "ref_ce_loss": 0.06694689393043518, "step": 17210 }, { "epoch": 5.740493662441628, "loss": 0.31031399965286255, "step": 17210 }, { "ce_loss": 0.07887408137321472, "epoch": 5.740493662441628, "step": 17210 }, { "distill_loss": 0.16484485566616058, "epoch": 5.740493662441628, "step": 17210 }, { "epoch": 5.740493662441628, "ref_ce_loss": 0.06637794524431229, "step": 17210 }, { "epoch": 5.743829219479653, "loss": 0.3617, "step": 17220 }, { "epoch": 5.743829219479653, "grad_norm": 2.416130781173706, "step": 17220 }, { "epoch": 5.743829219479653, "learning_rate": 5.8343332447494786e-05, "step": 17220 }, { "epoch": 5.743829219479653, "loss": 0.3554649353027344, "step": 17220 }, { "ce_loss": 0.08824531733989716, "epoch": 5.743829219479653, "step": 17220 }, { "distill_loss": 0.0907503142952919, "epoch": 5.743829219479653, "step": 17220 }, { "epoch": 5.743829219479653, "ref_ce_loss": 0.07792174071073532, "step": 17220 }, { "epoch": 5.743829219479653, "loss": 0.36913350224494934, "step": 17220 }, { "ce_loss": 0.11172915250062943, "epoch": 5.743829219479653, "step": 17220 }, { "distill_loss": 0.14446821808815002, "epoch": 5.743829219479653, "step": 17220 }, { "epoch": 5.743829219479653, "ref_ce_loss": 0.08583048731088638, "step": 17220 }, { "epoch": 5.743829219479653, "loss": 0.290985643863678, "step": 17220 }, { "ce_loss": 0.054969705641269684, "epoch": 5.743829219479653, "step": 17220 }, { "distill_loss": 0.1072135642170906, "epoch": 5.743829219479653, "step": 17220 }, { "epoch": 5.743829219479653, "ref_ce_loss": 0.07966025918722153, "step": 17220 }, { "epoch": 5.743829219479653, "loss": 0.3522094488143921, "step": 17220 }, { "ce_loss": 0.02910817228257656, "epoch": 5.743829219479653, "step": 17220 }, { "distill_loss": 0.1539473682641983, "epoch": 5.743829219479653, "step": 17220 }, { "epoch": 5.743829219479653, "ref_ce_loss": 0.06737984716892242, "step": 17220 }, { "epoch": 5.747164776517678, "loss": 0.3585, "step": 17230 }, { "epoch": 5.747164776517678, "grad_norm": 2.5325045585632324, "step": 17230 }, { "epoch": 5.747164776517678, "learning_rate": 5.818306919772382e-05, "step": 17230 }, { "epoch": 5.747164776517678, "loss": 0.3878948986530304, "step": 17230 }, { "ce_loss": 0.13507422804832458, "epoch": 5.747164776517678, "step": 17230 }, { "distill_loss": 0.13404646515846252, "epoch": 5.747164776517678, "step": 17230 }, { "epoch": 5.747164776517678, "ref_ce_loss": 0.07569249719381332, "step": 17230 }, { "epoch": 5.747164776517678, "loss": 0.4121205806732178, "step": 17230 }, { "ce_loss": 0.14450865983963013, "epoch": 5.747164776517678, "step": 17230 }, { "distill_loss": 0.16785818338394165, "epoch": 5.747164776517678, "step": 17230 }, { "epoch": 5.747164776517678, "ref_ce_loss": 0.07864559441804886, "step": 17230 }, { "epoch": 5.747164776517678, "loss": 0.25712525844573975, "step": 17230 }, { "ce_loss": 0.027721816673874855, "epoch": 5.747164776517678, "step": 17230 }, { "distill_loss": 0.14876680076122284, "epoch": 5.747164776517678, "step": 17230 }, { "epoch": 5.747164776517678, "ref_ce_loss": 0.05708444118499756, "step": 17230 }, { "epoch": 5.747164776517678, "loss": 0.35496917366981506, "step": 17230 }, { "ce_loss": 0.09509100019931793, "epoch": 5.747164776517678, "step": 17230 }, { "distill_loss": 0.13708890974521637, "epoch": 5.747164776517678, "step": 17230 }, { "epoch": 5.747164776517678, "ref_ce_loss": 0.08199819922447205, "step": 17230 }, { "epoch": 5.750500333555704, "loss": 0.3272, "step": 17240 }, { "epoch": 5.750500333555704, "grad_norm": 2.6290535926818848, "step": 17240 }, { "epoch": 5.750500333555704, "learning_rate": 5.802297338581588e-05, "step": 17240 }, { "epoch": 5.750500333555704, "loss": 0.29643863439559937, "step": 17240 }, { "ce_loss": 0.04670005664229393, "epoch": 5.750500333555704, "step": 17240 }, { "distill_loss": 0.16171149909496307, "epoch": 5.750500333555704, "step": 17240 }, { "epoch": 5.750500333555704, "ref_ce_loss": 0.08766219764947891, "step": 17240 }, { "epoch": 5.750500333555704, "loss": 0.3715093433856964, "step": 17240 }, { "ce_loss": 0.05629168450832367, "epoch": 5.750500333555704, "step": 17240 }, { "distill_loss": 0.12367063015699387, "epoch": 5.750500333555704, "step": 17240 }, { "epoch": 5.750500333555704, "ref_ce_loss": 0.08564885705709457, "step": 17240 }, { "epoch": 5.750500333555704, "loss": 0.3038267493247986, "step": 17240 }, { "ce_loss": 0.06020534783601761, "epoch": 5.750500333555704, "step": 17240 }, { "distill_loss": 0.10410208255052567, "epoch": 5.750500333555704, "step": 17240 }, { "epoch": 5.750500333555704, "ref_ce_loss": 0.05112629383802414, "step": 17240 }, { "epoch": 5.750500333555704, "loss": 0.4775630235671997, "step": 17240 }, { "ce_loss": 0.0565582811832428, "epoch": 5.750500333555704, "step": 17240 }, { "distill_loss": 0.23624451458454132, "epoch": 5.750500333555704, "step": 17240 }, { "epoch": 5.750500333555704, "ref_ce_loss": 0.06023179367184639, "step": 17240 }, { "epoch": 5.753835890593729, "loss": 0.3725, "step": 17250 }, { "epoch": 5.753835890593729, "grad_norm": 3.8670239448547363, "step": 17250 }, { "epoch": 5.753835890593729, "learning_rate": 5.786304530372244e-05, "step": 17250 }, { "epoch": 5.753835890593729, "loss": 0.2475840002298355, "step": 17250 }, { "ce_loss": 0.04578208178281784, "epoch": 5.753835890593729, "step": 17250 }, { "distill_loss": 0.12310880422592163, "epoch": 5.753835890593729, "step": 17250 }, { "epoch": 5.753835890593729, "ref_ce_loss": 0.07860726118087769, "step": 17250 }, { "epoch": 5.753835890593729, "loss": 0.3713269531726837, "step": 17250 }, { "ce_loss": 0.08053571730852127, "epoch": 5.753835890593729, "step": 17250 }, { "distill_loss": 0.12296618521213531, "epoch": 5.753835890593729, "step": 17250 }, { "epoch": 5.753835890593729, "ref_ce_loss": 0.08550073206424713, "step": 17250 }, { "epoch": 5.753835890593729, "loss": 0.26044490933418274, "step": 17250 }, { "ce_loss": 0.012742365710437298, "epoch": 5.753835890593729, "step": 17250 }, { "distill_loss": 0.09784765541553497, "epoch": 5.753835890593729, "step": 17250 }, { "epoch": 5.753835890593729, "ref_ce_loss": 0.07161511480808258, "step": 17250 }, { "epoch": 5.753835890593729, "loss": 0.4403138756752014, "step": 17250 }, { "ce_loss": 0.07575368136167526, "epoch": 5.753835890593729, "step": 17250 }, { "distill_loss": 0.13287517428398132, "epoch": 5.753835890593729, "step": 17250 }, { "epoch": 5.753835890593729, "ref_ce_loss": 0.062234990298748016, "step": 17250 }, { "epoch": 5.757171447631754, "loss": 0.3399, "step": 17260 }, { "epoch": 5.757171447631754, "grad_norm": 2.8273019790649414, "step": 17260 }, { "epoch": 5.757171447631754, "learning_rate": 5.770328524308932e-05, "step": 17260 }, { "epoch": 5.757171447631754, "loss": 0.3089262545108795, "step": 17260 }, { "ce_loss": 0.03648150712251663, "epoch": 5.757171447631754, "step": 17260 }, { "distill_loss": 0.1352384388446808, "epoch": 5.757171447631754, "step": 17260 }, { "epoch": 5.757171447631754, "ref_ce_loss": 0.0866464227437973, "step": 17260 }, { "epoch": 5.757171447631754, "loss": 0.43987128138542175, "step": 17260 }, { "ce_loss": 0.14717067778110504, "epoch": 5.757171447631754, "step": 17260 }, { "distill_loss": 0.16674935817718506, "epoch": 5.757171447631754, "step": 17260 }, { "epoch": 5.757171447631754, "ref_ce_loss": 0.06602030247449875, "step": 17260 }, { "epoch": 5.757171447631754, "loss": 0.32174310088157654, "step": 17260 }, { "ce_loss": 0.11366469413042068, "epoch": 5.757171447631754, "step": 17260 }, { "distill_loss": 0.1256704330444336, "epoch": 5.757171447631754, "step": 17260 }, { "epoch": 5.757171447631754, "ref_ce_loss": 0.06479135155677795, "step": 17260 }, { "epoch": 5.757171447631754, "loss": 0.2875996530056, "step": 17260 }, { "ce_loss": 0.05902746692299843, "epoch": 5.757171447631754, "step": 17260 }, { "distill_loss": 0.11191102862358093, "epoch": 5.757171447631754, "step": 17260 }, { "epoch": 5.757171447631754, "ref_ce_loss": 0.08170730620622635, "step": 17260 }, { "epoch": 5.76050700466978, "loss": 0.3647, "step": 17270 }, { "epoch": 5.76050700466978, "grad_norm": 2.8032124042510986, "step": 17270 }, { "epoch": 5.76050700466978, "learning_rate": 5.754369349525581e-05, "step": 17270 }, { "epoch": 5.76050700466978, "loss": 0.48637938499450684, "step": 17270 }, { "ce_loss": 0.10797074437141418, "epoch": 5.76050700466978, "step": 17270 }, { "distill_loss": 0.14722202718257904, "epoch": 5.76050700466978, "step": 17270 }, { "epoch": 5.76050700466978, "ref_ce_loss": 0.08467122912406921, "step": 17270 }, { "epoch": 5.76050700466978, "loss": 0.4154958724975586, "step": 17270 }, { "ce_loss": 0.05239958316087723, "epoch": 5.76050700466978, "step": 17270 }, { "distill_loss": 0.09364564716815948, "epoch": 5.76050700466978, "step": 17270 }, { "epoch": 5.76050700466978, "ref_ce_loss": 0.044636160135269165, "step": 17270 }, { "epoch": 5.76050700466978, "loss": 0.2659384310245514, "step": 17270 }, { "ce_loss": 0.03654512017965317, "epoch": 5.76050700466978, "step": 17270 }, { "distill_loss": 0.0760873556137085, "epoch": 5.76050700466978, "step": 17270 }, { "epoch": 5.76050700466978, "ref_ce_loss": 0.06807711720466614, "step": 17270 }, { "epoch": 5.76050700466978, "loss": 0.46144965291023254, "step": 17270 }, { "ce_loss": 0.03658605366945267, "epoch": 5.76050700466978, "step": 17270 }, { "distill_loss": 0.1725316047668457, "epoch": 5.76050700466978, "step": 17270 }, { "epoch": 5.76050700466978, "ref_ce_loss": 0.07646312564611435, "step": 17270 }, { "epoch": 5.763842561707805, "loss": 0.3639, "step": 17280 }, { "epoch": 5.763842561707805, "grad_norm": 3.59045147895813, "step": 17280 }, { "epoch": 5.763842561707805, "learning_rate": 5.738427035125435e-05, "step": 17280 }, { "epoch": 5.763842561707805, "loss": 0.22502562403678894, "step": 17280 }, { "ce_loss": 0.038079943507909775, "epoch": 5.763842561707805, "step": 17280 }, { "distill_loss": 0.10309861600399017, "epoch": 5.763842561707805, "step": 17280 }, { "epoch": 5.763842561707805, "ref_ce_loss": 0.05601663887500763, "step": 17280 }, { "epoch": 5.763842561707805, "loss": 0.4142107665538788, "step": 17280 }, { "ce_loss": 0.11307626217603683, "epoch": 5.763842561707805, "step": 17280 }, { "distill_loss": 0.12099690735340118, "epoch": 5.763842561707805, "step": 17280 }, { "epoch": 5.763842561707805, "ref_ce_loss": 0.10019832104444504, "step": 17280 }, { "epoch": 5.763842561707805, "loss": 0.3148443102836609, "step": 17280 }, { "ce_loss": 0.08613236248493195, "epoch": 5.763842561707805, "step": 17280 }, { "distill_loss": 0.12056463956832886, "epoch": 5.763842561707805, "step": 17280 }, { "epoch": 5.763842561707805, "ref_ce_loss": 0.06543400883674622, "step": 17280 }, { "epoch": 5.763842561707805, "loss": 0.17869870364665985, "step": 17280 }, { "ce_loss": 0.029490550979971886, "epoch": 5.763842561707805, "step": 17280 }, { "distill_loss": 0.08921138197183609, "epoch": 5.763842561707805, "step": 17280 }, { "epoch": 5.763842561707805, "ref_ce_loss": 0.03786267712712288, "step": 17280 }, { "epoch": 5.76717811874583, "loss": 0.3259, "step": 17290 }, { "epoch": 5.76717811874583, "grad_norm": 2.233980894088745, "step": 17290 }, { "epoch": 5.76717811874583, "learning_rate": 5.722501610180984e-05, "step": 17290 }, { "epoch": 5.76717811874583, "loss": 0.27729642391204834, "step": 17290 }, { "ce_loss": 0.05460566282272339, "epoch": 5.76717811874583, "step": 17290 }, { "distill_loss": 0.12321322411298752, "epoch": 5.76717811874583, "step": 17290 }, { "epoch": 5.76717811874583, "ref_ce_loss": 0.07130646705627441, "step": 17290 }, { "epoch": 5.76717811874583, "loss": 0.516023576259613, "step": 17290 }, { "ce_loss": 0.05338827893137932, "epoch": 5.76717811874583, "step": 17290 }, { "distill_loss": 0.11183921992778778, "epoch": 5.76717811874583, "step": 17290 }, { "epoch": 5.76717811874583, "ref_ce_loss": 0.057988960295915604, "step": 17290 }, { "epoch": 5.76717811874583, "loss": 0.48194584250450134, "step": 17290 }, { "ce_loss": 0.06910925358533859, "epoch": 5.76717811874583, "step": 17290 }, { "distill_loss": 0.221344456076622, "epoch": 5.76717811874583, "step": 17290 }, { "epoch": 5.76717811874583, "ref_ce_loss": 0.08294906467199326, "step": 17290 }, { "epoch": 5.76717811874583, "loss": 0.2483086884021759, "step": 17290 }, { "ce_loss": 0.05446473881602287, "epoch": 5.76717811874583, "step": 17290 }, { "distill_loss": 0.1190895289182663, "epoch": 5.76717811874583, "step": 17290 }, { "epoch": 5.76717811874583, "ref_ce_loss": 0.07463015615940094, "step": 17290 }, { "epoch": 5.770513675783856, "loss": 0.3169, "step": 17300 }, { "epoch": 5.770513675783856, "grad_norm": 1.6905715465545654, "step": 17300 }, { "epoch": 5.770513675783856, "learning_rate": 5.706593103733926e-05, "step": 17300 }, { "epoch": 5.770513675783856, "loss": 0.36666467785835266, "step": 17300 }, { "ce_loss": 0.1196926012635231, "epoch": 5.770513675783856, "step": 17300 }, { "distill_loss": 0.12403534352779388, "epoch": 5.770513675783856, "step": 17300 }, { "epoch": 5.770513675783856, "ref_ce_loss": 0.09720993787050247, "step": 17300 }, { "epoch": 5.770513675783856, "loss": 0.23492708802223206, "step": 17300 }, { "ce_loss": 0.047469962388277054, "epoch": 5.770513675783856, "step": 17300 }, { "distill_loss": 0.13368260860443115, "epoch": 5.770513675783856, "step": 17300 }, { "epoch": 5.770513675783856, "ref_ce_loss": 0.05329669639468193, "step": 17300 }, { "epoch": 5.770513675783856, "loss": 0.24395941197872162, "step": 17300 }, { "ce_loss": 0.03127888962626457, "epoch": 5.770513675783856, "step": 17300 }, { "distill_loss": 0.1213611513376236, "epoch": 5.770513675783856, "step": 17300 }, { "epoch": 5.770513675783856, "ref_ce_loss": 0.09058709442615509, "step": 17300 }, { "epoch": 5.770513675783856, "loss": 0.39396345615386963, "step": 17300 }, { "ce_loss": 0.09582968801259995, "epoch": 5.770513675783856, "step": 17300 }, { "distill_loss": 0.19662624597549438, "epoch": 5.770513675783856, "step": 17300 }, { "epoch": 5.770513675783856, "ref_ce_loss": 0.10104110091924667, "step": 17300 }, { "epoch": 5.773849232821881, "loss": 0.3468, "step": 17310 }, { "epoch": 5.773849232821881, "grad_norm": 2.5617034435272217, "step": 17310 }, { "epoch": 5.773849232821881, "learning_rate": 5.690701544795092e-05, "step": 17310 }, { "epoch": 5.773849232821881, "loss": 0.193987637758255, "step": 17310 }, { "ce_loss": 0.0359039269387722, "epoch": 5.773849232821881, "step": 17310 }, { "distill_loss": 0.08433974534273148, "epoch": 5.773849232821881, "step": 17310 }, { "epoch": 5.773849232821881, "ref_ce_loss": 0.05012137070298195, "step": 17310 }, { "epoch": 5.773849232821881, "loss": 0.3732142746448517, "step": 17310 }, { "ce_loss": 0.07121843844652176, "epoch": 5.773849232821881, "step": 17310 }, { "distill_loss": 0.1710732877254486, "epoch": 5.773849232821881, "step": 17310 }, { "epoch": 5.773849232821881, "ref_ce_loss": 0.06881940364837646, "step": 17310 }, { "epoch": 5.773849232821881, "loss": 0.21098724007606506, "step": 17310 }, { "ce_loss": 0.024630699306726456, "epoch": 5.773849232821881, "step": 17310 }, { "distill_loss": 0.13975226879119873, "epoch": 5.773849232821881, "step": 17310 }, { "epoch": 5.773849232821881, "ref_ce_loss": 0.046379558742046356, "step": 17310 }, { "epoch": 5.773849232821881, "loss": 0.261945515871048, "step": 17310 }, { "ce_loss": 0.03864102438092232, "epoch": 5.773849232821881, "step": 17310 }, { "distill_loss": 0.11703591793775558, "epoch": 5.773849232821881, "step": 17310 }, { "epoch": 5.773849232821881, "ref_ce_loss": 0.055460765957832336, "step": 17310 }, { "epoch": 5.7771847898599065, "loss": 0.3371, "step": 17320 }, { "epoch": 5.7771847898599065, "grad_norm": 4.389605522155762, "step": 17320 }, { "epoch": 5.7771847898599065, "learning_rate": 5.6748269623444264e-05, "step": 17320 }, { "epoch": 5.7771847898599065, "loss": 0.21445885300636292, "step": 17320 }, { "ce_loss": 0.037108927965164185, "epoch": 5.7771847898599065, "step": 17320 }, { "distill_loss": 0.0989280715584755, "epoch": 5.7771847898599065, "step": 17320 }, { "epoch": 5.7771847898599065, "ref_ce_loss": 0.029425648972392082, "step": 17320 }, { "epoch": 5.7771847898599065, "loss": 0.3353267312049866, "step": 17320 }, { "ce_loss": 0.040188174694776535, "epoch": 5.7771847898599065, "step": 17320 }, { "distill_loss": 0.0841199979186058, "epoch": 5.7771847898599065, "step": 17320 }, { "epoch": 5.7771847898599065, "ref_ce_loss": 0.07783445715904236, "step": 17320 }, { "epoch": 5.7771847898599065, "loss": 0.2943035066127777, "step": 17320 }, { "ce_loss": 0.06536614894866943, "epoch": 5.7771847898599065, "step": 17320 }, { "distill_loss": 0.12621468305587769, "epoch": 5.7771847898599065, "step": 17320 }, { "epoch": 5.7771847898599065, "ref_ce_loss": 0.07469505071640015, "step": 17320 }, { "epoch": 5.7771847898599065, "loss": 0.2589566707611084, "step": 17320 }, { "ce_loss": 0.030605845153331757, "epoch": 5.7771847898599065, "step": 17320 }, { "distill_loss": 0.11211664229631424, "epoch": 5.7771847898599065, "step": 17320 }, { "epoch": 5.7771847898599065, "ref_ce_loss": 0.06319501250982285, "step": 17320 }, { "epoch": 5.780520346897932, "loss": 0.3417, "step": 17330 }, { "epoch": 5.780520346897932, "grad_norm": 2.9060959815979004, "step": 17330 }, { "epoch": 5.780520346897932, "learning_rate": 5.658969385330891e-05, "step": 17330 }, { "epoch": 5.780520346897932, "loss": 0.506881833076477, "step": 17330 }, { "ce_loss": 0.07976707071065903, "epoch": 5.780520346897932, "step": 17330 }, { "distill_loss": 0.09284459054470062, "epoch": 5.780520346897932, "step": 17330 }, { "epoch": 5.780520346897932, "ref_ce_loss": 0.07745490968227386, "step": 17330 }, { "epoch": 5.780520346897932, "loss": 0.4200765788555145, "step": 17330 }, { "ce_loss": 0.07465557754039764, "epoch": 5.780520346897932, "step": 17330 }, { "distill_loss": 0.14287123084068298, "epoch": 5.780520346897932, "step": 17330 }, { "epoch": 5.780520346897932, "ref_ce_loss": 0.06909674406051636, "step": 17330 }, { "epoch": 5.780520346897932, "loss": 0.2414490431547165, "step": 17330 }, { "ce_loss": 0.05098666995763779, "epoch": 5.780520346897932, "step": 17330 }, { "distill_loss": 0.1095428466796875, "epoch": 5.780520346897932, "step": 17330 }, { "epoch": 5.780520346897932, "ref_ce_loss": 0.05398302897810936, "step": 17330 }, { "epoch": 5.780520346897932, "loss": 0.33691316843032837, "step": 17330 }, { "ce_loss": 0.11903326958417892, "epoch": 5.780520346897932, "step": 17330 }, { "distill_loss": 0.11317283660173416, "epoch": 5.780520346897932, "step": 17330 }, { "epoch": 5.780520346897932, "ref_ce_loss": 0.08404218405485153, "step": 17330 }, { "epoch": 5.783855903935957, "loss": 0.3367, "step": 17340 }, { "epoch": 5.783855903935957, "grad_norm": 2.7188363075256348, "step": 17340 }, { "epoch": 5.783855903935957, "learning_rate": 5.643128842672467e-05, "step": 17340 }, { "epoch": 5.783855903935957, "loss": 0.47430357336997986, "step": 17340 }, { "ce_loss": 0.076082244515419, "epoch": 5.783855903935957, "step": 17340 }, { "distill_loss": 0.1695503294467926, "epoch": 5.783855903935957, "step": 17340 }, { "epoch": 5.783855903935957, "ref_ce_loss": 0.06374223530292511, "step": 17340 }, { "epoch": 5.783855903935957, "loss": 0.36884844303131104, "step": 17340 }, { "ce_loss": 0.0587586909532547, "epoch": 5.783855903935957, "step": 17340 }, { "distill_loss": 0.11374857276678085, "epoch": 5.783855903935957, "step": 17340 }, { "epoch": 5.783855903935957, "ref_ce_loss": 0.08658315241336823, "step": 17340 }, { "epoch": 5.783855903935957, "loss": 0.432818740606308, "step": 17340 }, { "ce_loss": 0.09153823554515839, "epoch": 5.783855903935957, "step": 17340 }, { "distill_loss": 0.2312244325876236, "epoch": 5.783855903935957, "step": 17340 }, { "epoch": 5.783855903935957, "ref_ce_loss": 0.08305595070123672, "step": 17340 }, { "epoch": 5.783855903935957, "loss": 0.4313298761844635, "step": 17340 }, { "ce_loss": 0.060097139328718185, "epoch": 5.783855903935957, "step": 17340 }, { "distill_loss": 0.13053864240646362, "epoch": 5.783855903935957, "step": 17340 }, { "epoch": 5.783855903935957, "ref_ce_loss": 0.08896686881780624, "step": 17340 }, { "epoch": 5.7871914609739825, "loss": 0.4102, "step": 17350 }, { "epoch": 5.7871914609739825, "grad_norm": 4.448433876037598, "step": 17350 }, { "epoch": 5.7871914609739825, "learning_rate": 5.627305363256054e-05, "step": 17350 }, { "epoch": 5.7871914609739825, "loss": 0.36065179109573364, "step": 17350 }, { "ce_loss": 0.07306523621082306, "epoch": 5.7871914609739825, "step": 17350 }, { "distill_loss": 0.17802460491657257, "epoch": 5.7871914609739825, "step": 17350 }, { "epoch": 5.7871914609739825, "ref_ce_loss": 0.08533138036727905, "step": 17350 }, { "epoch": 5.7871914609739825, "loss": 0.3039696216583252, "step": 17350 }, { "ce_loss": 0.06814741343259811, "epoch": 5.7871914609739825, "step": 17350 }, { "distill_loss": 0.13009023666381836, "epoch": 5.7871914609739825, "step": 17350 }, { "epoch": 5.7871914609739825, "ref_ce_loss": 0.07732559740543365, "step": 17350 }, { "epoch": 5.7871914609739825, "loss": 0.2832317054271698, "step": 17350 }, { "ce_loss": 0.0839838832616806, "epoch": 5.7871914609739825, "step": 17350 }, { "distill_loss": 0.09487424045801163, "epoch": 5.7871914609739825, "step": 17350 }, { "epoch": 5.7871914609739825, "ref_ce_loss": 0.06302163749933243, "step": 17350 }, { "epoch": 5.7871914609739825, "loss": 0.34753546118736267, "step": 17350 }, { "ce_loss": 0.0784483551979065, "epoch": 5.7871914609739825, "step": 17350 }, { "distill_loss": 0.17030097544193268, "epoch": 5.7871914609739825, "step": 17350 }, { "epoch": 5.7871914609739825, "ref_ce_loss": 0.09857328981161118, "step": 17350 }, { "epoch": 5.790527018012008, "loss": 0.3631, "step": 17360 }, { "epoch": 5.790527018012008, "grad_norm": 3.4604597091674805, "step": 17360 }, { "epoch": 5.790527018012008, "learning_rate": 5.6114989759374264e-05, "step": 17360 }, { "epoch": 5.790527018012008, "loss": 0.3250521421432495, "step": 17360 }, { "ce_loss": 0.06812068819999695, "epoch": 5.790527018012008, "step": 17360 }, { "distill_loss": 0.14425088465213776, "epoch": 5.790527018012008, "step": 17360 }, { "epoch": 5.790527018012008, "ref_ce_loss": 0.05242958292365074, "step": 17360 }, { "epoch": 5.790527018012008, "loss": 0.38723093271255493, "step": 17360 }, { "ce_loss": 0.04637950658798218, "epoch": 5.790527018012008, "step": 17360 }, { "distill_loss": 0.12929755449295044, "epoch": 5.790527018012008, "step": 17360 }, { "epoch": 5.790527018012008, "ref_ce_loss": 0.09616874158382416, "step": 17360 }, { "epoch": 5.790527018012008, "loss": 0.4408058822154999, "step": 17360 }, { "ce_loss": 0.10426130890846252, "epoch": 5.790527018012008, "step": 17360 }, { "distill_loss": 0.13043643534183502, "epoch": 5.790527018012008, "step": 17360 }, { "epoch": 5.790527018012008, "ref_ce_loss": 0.10126718878746033, "step": 17360 }, { "epoch": 5.790527018012008, "loss": 0.184561625123024, "step": 17360 }, { "ce_loss": 0.028995148837566376, "epoch": 5.790527018012008, "step": 17360 }, { "distill_loss": 0.10370610654354095, "epoch": 5.790527018012008, "step": 17360 }, { "epoch": 5.790527018012008, "ref_ce_loss": 0.05134010314941406, "step": 17360 }, { "epoch": 5.793862575050033, "loss": 0.3432, "step": 17370 }, { "epoch": 5.793862575050033, "grad_norm": 3.010852336883545, "step": 17370 }, { "epoch": 5.793862575050033, "learning_rate": 5.595709709541212e-05, "step": 17370 }, { "epoch": 5.793862575050033, "loss": 0.4072456359863281, "step": 17370 }, { "ce_loss": 0.06752721965312958, "epoch": 5.793862575050033, "step": 17370 }, { "distill_loss": 0.12647153437137604, "epoch": 5.793862575050033, "step": 17370 }, { "epoch": 5.793862575050033, "ref_ce_loss": 0.07747430354356766, "step": 17370 }, { "epoch": 5.793862575050033, "loss": 0.370725154876709, "step": 17370 }, { "ce_loss": 0.05985059216618538, "epoch": 5.793862575050033, "step": 17370 }, { "distill_loss": 0.14739835262298584, "epoch": 5.793862575050033, "step": 17370 }, { "epoch": 5.793862575050033, "ref_ce_loss": 0.059420146048069, "step": 17370 }, { "epoch": 5.793862575050033, "loss": 0.28383949398994446, "step": 17370 }, { "ce_loss": 0.06301853060722351, "epoch": 5.793862575050033, "step": 17370 }, { "distill_loss": 0.11598997563123703, "epoch": 5.793862575050033, "step": 17370 }, { "epoch": 5.793862575050033, "ref_ce_loss": 0.09168814867734909, "step": 17370 }, { "epoch": 5.793862575050033, "loss": 0.2755222022533417, "step": 17370 }, { "ce_loss": 0.04786836355924606, "epoch": 5.793862575050033, "step": 17370 }, { "distill_loss": 0.15717007219791412, "epoch": 5.793862575050033, "step": 17370 }, { "epoch": 5.793862575050033, "ref_ce_loss": 0.053128596395254135, "step": 17370 }, { "epoch": 5.797198132088059, "loss": 0.35, "step": 17380 }, { "epoch": 5.797198132088059, "grad_norm": 1.6434465646743774, "step": 17380 }, { "epoch": 5.797198132088059, "learning_rate": 5.5799375928607897e-05, "step": 17380 }, { "epoch": 5.797198132088059, "loss": 0.7022756338119507, "step": 17380 }, { "ce_loss": 0.09961540251970291, "epoch": 5.797198132088059, "step": 17380 }, { "distill_loss": 0.19559799134731293, "epoch": 5.797198132088059, "step": 17380 }, { "epoch": 5.797198132088059, "ref_ce_loss": 0.12969614565372467, "step": 17380 }, { "epoch": 5.797198132088059, "loss": 0.44806501269340515, "step": 17380 }, { "ce_loss": 0.12347184866666794, "epoch": 5.797198132088059, "step": 17380 }, { "distill_loss": 0.1917111724615097, "epoch": 5.797198132088059, "step": 17380 }, { "epoch": 5.797198132088059, "ref_ce_loss": 0.09800612181425095, "step": 17380 }, { "epoch": 5.797198132088059, "loss": 0.27114999294281006, "step": 17380 }, { "ce_loss": 0.04157739877700806, "epoch": 5.797198132088059, "step": 17380 }, { "distill_loss": 0.13314248621463776, "epoch": 5.797198132088059, "step": 17380 }, { "epoch": 5.797198132088059, "ref_ce_loss": 0.06825023144483566, "step": 17380 }, { "epoch": 5.797198132088059, "loss": 0.30957531929016113, "step": 17380 }, { "ce_loss": 0.05008748173713684, "epoch": 5.797198132088059, "step": 17380 }, { "distill_loss": 0.1175016239285469, "epoch": 5.797198132088059, "step": 17380 }, { "epoch": 5.797198132088059, "ref_ce_loss": 0.06017220392823219, "step": 17380 }, { "epoch": 5.800533689126084, "loss": 0.343, "step": 17390 }, { "epoch": 5.800533689126084, "grad_norm": 1.723335862159729, "step": 17390 }, { "epoch": 5.800533689126084, "learning_rate": 5.5641826546582844e-05, "step": 17390 }, { "epoch": 5.800533689126084, "loss": 0.22227926552295685, "step": 17390 }, { "ce_loss": 0.054145898669958115, "epoch": 5.800533689126084, "step": 17390 }, { "distill_loss": 0.12056168913841248, "epoch": 5.800533689126084, "step": 17390 }, { "epoch": 5.800533689126084, "ref_ce_loss": 0.046798594295978546, "step": 17390 }, { "epoch": 5.800533689126084, "loss": 0.27474644780158997, "step": 17390 }, { "ce_loss": 0.031725261360406876, "epoch": 5.800533689126084, "step": 17390 }, { "distill_loss": 0.1447744369506836, "epoch": 5.800533689126084, "step": 17390 }, { "epoch": 5.800533689126084, "ref_ce_loss": 0.06469041854143143, "step": 17390 }, { "epoch": 5.800533689126084, "loss": 0.42471349239349365, "step": 17390 }, { "ce_loss": 0.06565036624670029, "epoch": 5.800533689126084, "step": 17390 }, { "distill_loss": 0.15499937534332275, "epoch": 5.800533689126084, "step": 17390 }, { "epoch": 5.800533689126084, "ref_ce_loss": 0.0806792750954628, "step": 17390 }, { "epoch": 5.800533689126084, "loss": 0.43315747380256653, "step": 17390 }, { "ce_loss": 0.13766764104366302, "epoch": 5.800533689126084, "step": 17390 }, { "distill_loss": 0.16289286315441132, "epoch": 5.800533689126084, "step": 17390 }, { "epoch": 5.800533689126084, "ref_ce_loss": 0.06286300718784332, "step": 17390 }, { "epoch": 5.803869246164109, "loss": 0.352, "step": 17400 }, { "epoch": 5.803869246164109, "grad_norm": 2.084108352661133, "step": 17400 }, { "epoch": 5.803869246164109, "learning_rate": 5.548444923664499e-05, "step": 17400 }, { "epoch": 5.803869246164109, "loss": 0.4230390191078186, "step": 17400 }, { "ce_loss": 0.12096190452575684, "epoch": 5.803869246164109, "step": 17400 }, { "distill_loss": 0.1789836287498474, "epoch": 5.803869246164109, "step": 17400 }, { "epoch": 5.803869246164109, "ref_ce_loss": 0.12275371700525284, "step": 17400 }, { "epoch": 5.803869246164109, "loss": 0.24342182278633118, "step": 17400 }, { "ce_loss": 0.04757703095674515, "epoch": 5.803869246164109, "step": 17400 }, { "distill_loss": 0.09931950271129608, "epoch": 5.803869246164109, "step": 17400 }, { "epoch": 5.803869246164109, "ref_ce_loss": 0.06264876574277878, "step": 17400 }, { "epoch": 5.803869246164109, "loss": 0.37538662552833557, "step": 17400 }, { "ce_loss": 0.026338813826441765, "epoch": 5.803869246164109, "step": 17400 }, { "distill_loss": 0.21343332529067993, "epoch": 5.803869246164109, "step": 17400 }, { "epoch": 5.803869246164109, "ref_ce_loss": 0.06337901949882507, "step": 17400 }, { "epoch": 5.803869246164109, "loss": 0.3582431972026825, "step": 17400 }, { "ce_loss": 0.11757601052522659, "epoch": 5.803869246164109, "step": 17400 }, { "distill_loss": 0.14947518706321716, "epoch": 5.803869246164109, "step": 17400 }, { "epoch": 5.803869246164109, "ref_ce_loss": 0.09114629030227661, "step": 17400 }, { "epoch": 5.807204803202135, "loss": 0.3419, "step": 17410 }, { "epoch": 5.807204803202135, "grad_norm": 3.1050589084625244, "step": 17410 }, { "epoch": 5.807204803202135, "learning_rate": 5.532724428578834e-05, "step": 17410 }, { "epoch": 5.807204803202135, "loss": 0.2950383722782135, "step": 17410 }, { "ce_loss": 0.019280388951301575, "epoch": 5.807204803202135, "step": 17410 }, { "distill_loss": 0.14772720634937286, "epoch": 5.807204803202135, "step": 17410 }, { "epoch": 5.807204803202135, "ref_ce_loss": 0.08486374467611313, "step": 17410 }, { "epoch": 5.807204803202135, "loss": 0.664941668510437, "step": 17410 }, { "ce_loss": 0.12758208811283112, "epoch": 5.807204803202135, "step": 17410 }, { "distill_loss": 0.15463386476039886, "epoch": 5.807204803202135, "step": 17410 }, { "epoch": 5.807204803202135, "ref_ce_loss": 0.09197648614645004, "step": 17410 }, { "epoch": 5.807204803202135, "loss": 0.510972797870636, "step": 17410 }, { "ce_loss": 0.1270759105682373, "epoch": 5.807204803202135, "step": 17410 }, { "distill_loss": 0.14177629351615906, "epoch": 5.807204803202135, "step": 17410 }, { "epoch": 5.807204803202135, "ref_ce_loss": 0.09685118496417999, "step": 17410 }, { "epoch": 5.807204803202135, "loss": 0.34812983870506287, "step": 17410 }, { "ce_loss": 0.12147245556116104, "epoch": 5.807204803202135, "step": 17410 }, { "distill_loss": 0.1561581939458847, "epoch": 5.807204803202135, "step": 17410 }, { "epoch": 5.807204803202135, "ref_ce_loss": 0.07011862099170685, "step": 17410 }, { "epoch": 5.81054036024016, "loss": 0.3592, "step": 17420 }, { "epoch": 5.81054036024016, "grad_norm": 1.6063811779022217, "step": 17420 }, { "epoch": 5.81054036024016, "learning_rate": 5.517021198069276e-05, "step": 17420 }, { "epoch": 5.81054036024016, "loss": 0.292193204164505, "step": 17420 }, { "ce_loss": 0.05114936828613281, "epoch": 5.81054036024016, "step": 17420 }, { "distill_loss": 0.13860230147838593, "epoch": 5.81054036024016, "step": 17420 }, { "epoch": 5.81054036024016, "ref_ce_loss": 0.06149422004818916, "step": 17420 }, { "epoch": 5.81054036024016, "loss": 0.3692050278186798, "step": 17420 }, { "ce_loss": 0.07337262481451035, "epoch": 5.81054036024016, "step": 17420 }, { "distill_loss": 0.1356513649225235, "epoch": 5.81054036024016, "step": 17420 }, { "epoch": 5.81054036024016, "ref_ce_loss": 0.06378420442342758, "step": 17420 }, { "epoch": 5.81054036024016, "loss": 0.42367690801620483, "step": 17420 }, { "ce_loss": 0.0620979443192482, "epoch": 5.81054036024016, "step": 17420 }, { "distill_loss": 0.1174180656671524, "epoch": 5.81054036024016, "step": 17420 }, { "epoch": 5.81054036024016, "ref_ce_loss": 0.05651075392961502, "step": 17420 }, { "epoch": 5.81054036024016, "loss": 0.3047388792037964, "step": 17420 }, { "ce_loss": 0.09635266661643982, "epoch": 5.81054036024016, "step": 17420 }, { "distill_loss": 0.15084218978881836, "epoch": 5.81054036024016, "step": 17420 }, { "epoch": 5.81054036024016, "ref_ce_loss": 0.05727013573050499, "step": 17420 }, { "epoch": 5.813875917278185, "loss": 0.3478, "step": 17430 }, { "epoch": 5.813875917278185, "grad_norm": 2.7375216484069824, "step": 17430 }, { "epoch": 5.813875917278185, "learning_rate": 5.501335260772329e-05, "step": 17430 }, { "epoch": 5.813875917278185, "loss": 0.2118167281150818, "step": 17430 }, { "ce_loss": 0.04239267483353615, "epoch": 5.813875917278185, "step": 17430 }, { "distill_loss": 0.10191035270690918, "epoch": 5.813875917278185, "step": 17430 }, { "epoch": 5.813875917278185, "ref_ce_loss": 0.03936781361699104, "step": 17430 }, { "epoch": 5.813875917278185, "loss": 0.49797767400741577, "step": 17430 }, { "ce_loss": 0.045204028487205505, "epoch": 5.813875917278185, "step": 17430 }, { "distill_loss": 0.10383464395999908, "epoch": 5.813875917278185, "step": 17430 }, { "epoch": 5.813875917278185, "ref_ce_loss": 0.05662389472126961, "step": 17430 }, { "epoch": 5.813875917278185, "loss": 0.3084079623222351, "step": 17430 }, { "ce_loss": 0.07492610812187195, "epoch": 5.813875917278185, "step": 17430 }, { "distill_loss": 0.11228092014789581, "epoch": 5.813875917278185, "step": 17430 }, { "epoch": 5.813875917278185, "ref_ce_loss": 0.11168135702610016, "step": 17430 }, { "epoch": 5.813875917278185, "loss": 0.4190921187400818, "step": 17430 }, { "ce_loss": 0.08887006342411041, "epoch": 5.813875917278185, "step": 17430 }, { "distill_loss": 0.13845597207546234, "epoch": 5.813875917278185, "step": 17430 }, { "epoch": 5.813875917278185, "ref_ce_loss": 0.08437550812959671, "step": 17430 }, { "epoch": 5.817211474316211, "loss": 0.3496, "step": 17440 }, { "epoch": 5.817211474316211, "grad_norm": 3.1267213821411133, "step": 17440 }, { "epoch": 5.817211474316211, "learning_rate": 5.4856666452929435e-05, "step": 17440 }, { "epoch": 5.817211474316211, "loss": 0.35733237862586975, "step": 17440 }, { "ce_loss": 0.08284957706928253, "epoch": 5.817211474316211, "step": 17440 }, { "distill_loss": 0.16209322214126587, "epoch": 5.817211474316211, "step": 17440 }, { "epoch": 5.817211474316211, "ref_ce_loss": 0.07235975563526154, "step": 17440 }, { "epoch": 5.817211474316211, "loss": 0.3992215096950531, "step": 17440 }, { "ce_loss": 0.09641101956367493, "epoch": 5.817211474316211, "step": 17440 }, { "distill_loss": 0.1932550072669983, "epoch": 5.817211474316211, "step": 17440 }, { "epoch": 5.817211474316211, "ref_ce_loss": 0.08727872371673584, "step": 17440 }, { "epoch": 5.817211474316211, "loss": 0.23189151287078857, "step": 17440 }, { "ce_loss": 0.04314230754971504, "epoch": 5.817211474316211, "step": 17440 }, { "distill_loss": 0.10661713778972626, "epoch": 5.817211474316211, "step": 17440 }, { "epoch": 5.817211474316211, "ref_ce_loss": 0.05453113839030266, "step": 17440 }, { "epoch": 5.817211474316211, "loss": 0.4198133945465088, "step": 17440 }, { "ce_loss": 0.07765809446573257, "epoch": 5.817211474316211, "step": 17440 }, { "distill_loss": 0.23638515174388885, "epoch": 5.817211474316211, "step": 17440 }, { "epoch": 5.817211474316211, "ref_ce_loss": 0.08266009390354156, "step": 17440 }, { "epoch": 5.820547031354236, "loss": 0.3357, "step": 17450 }, { "epoch": 5.820547031354236, "grad_norm": 1.607082486152649, "step": 17450 }, { "epoch": 5.820547031354236, "learning_rate": 5.470015380204498e-05, "step": 17450 }, { "epoch": 5.820547031354236, "loss": 0.4796401858329773, "step": 17450 }, { "ce_loss": 0.1269017606973648, "epoch": 5.820547031354236, "step": 17450 }, { "distill_loss": 0.15161138772964478, "epoch": 5.820547031354236, "step": 17450 }, { "epoch": 5.820547031354236, "ref_ce_loss": 0.13180014491081238, "step": 17450 }, { "epoch": 5.820547031354236, "loss": 0.3101658225059509, "step": 17450 }, { "ce_loss": 0.06727251410484314, "epoch": 5.820547031354236, "step": 17450 }, { "distill_loss": 0.14738528430461884, "epoch": 5.820547031354236, "step": 17450 }, { "epoch": 5.820547031354236, "ref_ce_loss": 0.062444526702165604, "step": 17450 }, { "epoch": 5.820547031354236, "loss": 0.3178739845752716, "step": 17450 }, { "ce_loss": 0.06257525831460953, "epoch": 5.820547031354236, "step": 17450 }, { "distill_loss": 0.0920637771487236, "epoch": 5.820547031354236, "step": 17450 }, { "epoch": 5.820547031354236, "ref_ce_loss": 0.07175959646701813, "step": 17450 }, { "epoch": 5.820547031354236, "loss": 0.30642449855804443, "step": 17450 }, { "ce_loss": 0.050568338483572006, "epoch": 5.820547031354236, "step": 17450 }, { "distill_loss": 0.09929658472537994, "epoch": 5.820547031354236, "step": 17450 }, { "epoch": 5.820547031354236, "ref_ce_loss": 0.05786219611763954, "step": 17450 }, { "epoch": 5.823882588392261, "loss": 0.3484, "step": 17460 }, { "epoch": 5.823882588392261, "grad_norm": 2.263108730316162, "step": 17460 }, { "epoch": 5.823882588392261, "learning_rate": 5.454381494048726e-05, "step": 17460 }, { "epoch": 5.823882588392261, "loss": 0.24826695024967194, "step": 17460 }, { "ce_loss": 0.06059538573026657, "epoch": 5.823882588392261, "step": 17460 }, { "distill_loss": 0.1367242932319641, "epoch": 5.823882588392261, "step": 17460 }, { "epoch": 5.823882588392261, "ref_ce_loss": 0.0508265420794487, "step": 17460 }, { "epoch": 5.823882588392261, "loss": 0.2540883719921112, "step": 17460 }, { "ce_loss": 0.02674366906285286, "epoch": 5.823882588392261, "step": 17460 }, { "distill_loss": 0.08802241832017899, "epoch": 5.823882588392261, "step": 17460 }, { "epoch": 5.823882588392261, "ref_ce_loss": 0.06723759323358536, "step": 17460 }, { "epoch": 5.823882588392261, "loss": 0.3104031980037689, "step": 17460 }, { "ce_loss": 0.06915073096752167, "epoch": 5.823882588392261, "step": 17460 }, { "distill_loss": 0.12898601591587067, "epoch": 5.823882588392261, "step": 17460 }, { "epoch": 5.823882588392261, "ref_ce_loss": 0.05971602350473404, "step": 17460 }, { "epoch": 5.823882588392261, "loss": 0.28427770733833313, "step": 17460 }, { "ce_loss": 0.037224024534225464, "epoch": 5.823882588392261, "step": 17460 }, { "distill_loss": 0.11982965469360352, "epoch": 5.823882588392261, "step": 17460 }, { "epoch": 5.823882588392261, "ref_ce_loss": 0.07563572376966476, "step": 17460 }, { "epoch": 5.827218145430287, "loss": 0.3525, "step": 17470 }, { "epoch": 5.827218145430287, "grad_norm": 3.3239011764526367, "step": 17470 }, { "epoch": 5.827218145430287, "learning_rate": 5.4387650153356715e-05, "step": 17470 }, { "epoch": 5.827218145430287, "loss": 0.37470170855522156, "step": 17470 }, { "ce_loss": 0.08236923068761826, "epoch": 5.827218145430287, "step": 17470 }, { "distill_loss": 0.17556804418563843, "epoch": 5.827218145430287, "step": 17470 }, { "epoch": 5.827218145430287, "ref_ce_loss": 0.057811666280031204, "step": 17470 }, { "epoch": 5.827218145430287, "loss": 0.25655317306518555, "step": 17470 }, { "ce_loss": 0.06295975297689438, "epoch": 5.827218145430287, "step": 17470 }, { "distill_loss": 0.10989764332771301, "epoch": 5.827218145430287, "step": 17470 }, { "epoch": 5.827218145430287, "ref_ce_loss": 0.0832274779677391, "step": 17470 }, { "epoch": 5.827218145430287, "loss": 0.2870965600013733, "step": 17470 }, { "ce_loss": 0.04364733025431633, "epoch": 5.827218145430287, "step": 17470 }, { "distill_loss": 0.1371404379606247, "epoch": 5.827218145430287, "step": 17470 }, { "epoch": 5.827218145430287, "ref_ce_loss": 0.07653002440929413, "step": 17470 }, { "epoch": 5.827218145430287, "loss": 0.30099189281463623, "step": 17470 }, { "ce_loss": 0.07090586423873901, "epoch": 5.827218145430287, "step": 17470 }, { "distill_loss": 0.11548236757516861, "epoch": 5.827218145430287, "step": 17470 }, { "epoch": 5.827218145430287, "ref_ce_loss": 0.07585054636001587, "step": 17470 }, { "epoch": 5.830553702468312, "loss": 0.3701, "step": 17480 }, { "epoch": 5.830553702468312, "grad_norm": 3.6772897243499756, "step": 17480 }, { "epoch": 5.830553702468312, "learning_rate": 5.423165972543634e-05, "step": 17480 }, { "epoch": 5.830553702468312, "loss": 0.49872735142707825, "step": 17480 }, { "ce_loss": 0.11087694019079208, "epoch": 5.830553702468312, "step": 17480 }, { "distill_loss": 0.1530749797821045, "epoch": 5.830553702468312, "step": 17480 }, { "epoch": 5.830553702468312, "ref_ce_loss": 0.10035675019025803, "step": 17480 }, { "epoch": 5.830553702468312, "loss": 0.6843752861022949, "step": 17480 }, { "ce_loss": 0.09181319177150726, "epoch": 5.830553702468312, "step": 17480 }, { "distill_loss": 0.12344271689653397, "epoch": 5.830553702468312, "step": 17480 }, { "epoch": 5.830553702468312, "ref_ce_loss": 0.10776031762361526, "step": 17480 }, { "epoch": 5.830553702468312, "loss": 0.2865215241909027, "step": 17480 }, { "ce_loss": 0.037878841161727905, "epoch": 5.830553702468312, "step": 17480 }, { "distill_loss": 0.12363813072443008, "epoch": 5.830553702468312, "step": 17480 }, { "epoch": 5.830553702468312, "ref_ce_loss": 0.07112205773591995, "step": 17480 }, { "epoch": 5.830553702468312, "loss": 0.3730589747428894, "step": 17480 }, { "ce_loss": 0.030367298051714897, "epoch": 5.830553702468312, "step": 17480 }, { "distill_loss": 0.15017357468605042, "epoch": 5.830553702468312, "step": 17480 }, { "epoch": 5.830553702468312, "ref_ce_loss": 0.05540158227086067, "step": 17480 }, { "epoch": 5.833889259506337, "loss": 0.3894, "step": 17490 }, { "epoch": 5.833889259506337, "grad_norm": 2.250351905822754, "step": 17490 }, { "epoch": 5.833889259506337, "learning_rate": 5.4075843941191046e-05, "step": 17490 }, { "epoch": 5.833889259506337, "loss": 0.2717742919921875, "step": 17490 }, { "ce_loss": 0.02255961485207081, "epoch": 5.833889259506337, "step": 17490 }, { "distill_loss": 0.17978093028068542, "epoch": 5.833889259506337, "step": 17490 }, { "epoch": 5.833889259506337, "ref_ce_loss": 0.06858284026384354, "step": 17490 }, { "epoch": 5.833889259506337, "loss": 0.3072040379047394, "step": 17490 }, { "ce_loss": 0.056123632937669754, "epoch": 5.833889259506337, "step": 17490 }, { "distill_loss": 0.1385165899991989, "epoch": 5.833889259506337, "step": 17490 }, { "epoch": 5.833889259506337, "ref_ce_loss": 0.0832224190235138, "step": 17490 }, { "epoch": 5.833889259506337, "loss": 0.2829488515853882, "step": 17490 }, { "ce_loss": 0.06095289811491966, "epoch": 5.833889259506337, "step": 17490 }, { "distill_loss": 0.10436280816793442, "epoch": 5.833889259506337, "step": 17490 }, { "epoch": 5.833889259506337, "ref_ce_loss": 0.05648965388536453, "step": 17490 }, { "epoch": 5.833889259506337, "loss": 0.42669305205345154, "step": 17490 }, { "ce_loss": 0.121896892786026, "epoch": 5.833889259506337, "step": 17490 }, { "distill_loss": 0.19189319014549255, "epoch": 5.833889259506337, "step": 17490 }, { "epoch": 5.833889259506337, "ref_ce_loss": 0.08206455409526825, "step": 17490 }, { "epoch": 5.837224816544363, "loss": 0.3718, "step": 17500 }, { "epoch": 5.837224816544363, "grad_norm": 1.9109488725662231, "step": 17500 }, { "epoch": 5.837224816544363, "learning_rate": 5.3920203084767406e-05, "step": 17500 }, { "epoch": 5.837224816544363, "loss": 0.24839021265506744, "step": 17500 }, { "ce_loss": 0.02857845090329647, "epoch": 5.837224816544363, "step": 17500 }, { "distill_loss": 0.11244728416204453, "epoch": 5.837224816544363, "step": 17500 }, { "epoch": 5.837224816544363, "ref_ce_loss": 0.04522886872291565, "step": 17500 }, { "epoch": 5.837224816544363, "loss": 0.39638009667396545, "step": 17500 }, { "ce_loss": 0.09089220315217972, "epoch": 5.837224816544363, "step": 17500 }, { "distill_loss": 0.167978435754776, "epoch": 5.837224816544363, "step": 17500 }, { "epoch": 5.837224816544363, "ref_ce_loss": 0.09620627760887146, "step": 17500 }, { "epoch": 5.837224816544363, "loss": 0.6351642608642578, "step": 17500 }, { "ce_loss": 0.08827564865350723, "epoch": 5.837224816544363, "step": 17500 }, { "distill_loss": 0.1794801503419876, "epoch": 5.837224816544363, "step": 17500 }, { "epoch": 5.837224816544363, "ref_ce_loss": 0.08518512547016144, "step": 17500 }, { "epoch": 5.837224816544363, "loss": 0.8153314590454102, "step": 17500 }, { "ce_loss": 0.12424998730421066, "epoch": 5.837224816544363, "step": 17500 }, { "distill_loss": 0.14850200712680817, "epoch": 5.837224816544363, "step": 17500 }, { "epoch": 5.837224816544363, "ref_ce_loss": 0.06928473711013794, "step": 17500 }, { "epoch": 5.840560373582388, "loss": 0.3703, "step": 17510 }, { "epoch": 5.840560373582388, "grad_norm": 4.421696662902832, "step": 17510 }, { "epoch": 5.840560373582388, "learning_rate": 5.3764737439992964e-05, "step": 17510 }, { "epoch": 5.840560373582388, "loss": 0.40214866399765015, "step": 17510 }, { "ce_loss": 0.13289524614810944, "epoch": 5.840560373582388, "step": 17510 }, { "distill_loss": 0.17023883759975433, "epoch": 5.840560373582388, "step": 17510 }, { "epoch": 5.840560373582388, "ref_ce_loss": 0.05464457720518112, "step": 17510 }, { "epoch": 5.840560373582388, "loss": 0.2773127257823944, "step": 17510 }, { "ce_loss": 0.04269995912909508, "epoch": 5.840560373582388, "step": 17510 }, { "distill_loss": 0.10322048515081406, "epoch": 5.840560373582388, "step": 17510 }, { "epoch": 5.840560373582388, "ref_ce_loss": 0.06018978729844093, "step": 17510 }, { "epoch": 5.840560373582388, "loss": 0.4813617765903473, "step": 17510 }, { "ce_loss": 0.0832713320851326, "epoch": 5.840560373582388, "step": 17510 }, { "distill_loss": 0.2649557590484619, "epoch": 5.840560373582388, "step": 17510 }, { "epoch": 5.840560373582388, "ref_ce_loss": 0.09463027119636536, "step": 17510 }, { "epoch": 5.840560373582388, "loss": 0.2821495234966278, "step": 17510 }, { "ce_loss": 0.04585276544094086, "epoch": 5.840560373582388, "step": 17510 }, { "distill_loss": 0.1394762545824051, "epoch": 5.840560373582388, "step": 17510 }, { "epoch": 5.840560373582388, "ref_ce_loss": 0.06220794469118118, "step": 17510 }, { "epoch": 5.8438959306204135, "loss": 0.3736, "step": 17520 }, { "epoch": 5.8438959306204135, "grad_norm": 4.105420112609863, "step": 17520 }, { "epoch": 5.8438959306204135, "learning_rate": 5.360944729037572e-05, "step": 17520 }, { "epoch": 5.8438959306204135, "loss": 0.3745241165161133, "step": 17520 }, { "ce_loss": 0.0848168134689331, "epoch": 5.8438959306204135, "step": 17520 }, { "distill_loss": 0.1598915159702301, "epoch": 5.8438959306204135, "step": 17520 }, { "epoch": 5.8438959306204135, "ref_ce_loss": 0.05103028565645218, "step": 17520 }, { "epoch": 5.8438959306204135, "loss": 0.7227115631103516, "step": 17520 }, { "ce_loss": 0.0658605769276619, "epoch": 5.8438959306204135, "step": 17520 }, { "distill_loss": 0.1462346315383911, "epoch": 5.8438959306204135, "step": 17520 }, { "epoch": 5.8438959306204135, "ref_ce_loss": 0.06259340047836304, "step": 17520 }, { "epoch": 5.8438959306204135, "loss": 0.5101954936981201, "step": 17520 }, { "ce_loss": 0.02249034121632576, "epoch": 5.8438959306204135, "step": 17520 }, { "distill_loss": 0.09204475581645966, "epoch": 5.8438959306204135, "step": 17520 }, { "epoch": 5.8438959306204135, "ref_ce_loss": 0.08358345180749893, "step": 17520 }, { "epoch": 5.8438959306204135, "loss": 0.25509950518608093, "step": 17520 }, { "ce_loss": 0.0710252895951271, "epoch": 5.8438959306204135, "step": 17520 }, { "distill_loss": 0.10144200921058655, "epoch": 5.8438959306204135, "step": 17520 }, { "epoch": 5.8438959306204135, "ref_ce_loss": 0.054177094250917435, "step": 17520 }, { "epoch": 5.847231487658439, "loss": 0.3925, "step": 17530 }, { "epoch": 5.847231487658439, "grad_norm": 3.8222227096557617, "step": 17530 }, { "epoch": 5.847231487658439, "learning_rate": 5.345433291910368e-05, "step": 17530 }, { "epoch": 5.847231487658439, "loss": 0.3156295418739319, "step": 17530 }, { "ce_loss": 0.04494838789105415, "epoch": 5.847231487658439, "step": 17530 }, { "distill_loss": 0.1247267797589302, "epoch": 5.847231487658439, "step": 17530 }, { "epoch": 5.847231487658439, "ref_ce_loss": 0.05149764195084572, "step": 17530 }, { "epoch": 5.847231487658439, "loss": 0.2993130385875702, "step": 17530 }, { "ce_loss": 0.024480029940605164, "epoch": 5.847231487658439, "step": 17530 }, { "distill_loss": 0.15972600877285004, "epoch": 5.847231487658439, "step": 17530 }, { "epoch": 5.847231487658439, "ref_ce_loss": 0.0775722861289978, "step": 17530 }, { "epoch": 5.847231487658439, "loss": 0.3961556553840637, "step": 17530 }, { "ce_loss": 0.08216272294521332, "epoch": 5.847231487658439, "step": 17530 }, { "distill_loss": 0.20880329608917236, "epoch": 5.847231487658439, "step": 17530 }, { "epoch": 5.847231487658439, "ref_ce_loss": 0.07281126081943512, "step": 17530 }, { "epoch": 5.847231487658439, "loss": 0.37639302015304565, "step": 17530 }, { "ce_loss": 0.04775812849402428, "epoch": 5.847231487658439, "step": 17530 }, { "distill_loss": 0.11027499288320541, "epoch": 5.847231487658439, "step": 17530 }, { "epoch": 5.847231487658439, "ref_ce_loss": 0.07947772741317749, "step": 17530 }, { "epoch": 5.850567044696464, "loss": 0.3653, "step": 17540 }, { "epoch": 5.850567044696464, "grad_norm": 2.4197661876678467, "step": 17540 }, { "epoch": 5.850567044696464, "learning_rate": 5.3299394609044204e-05, "step": 17540 }, { "epoch": 5.850567044696464, "loss": 0.3402426540851593, "step": 17540 }, { "ce_loss": 0.08751311153173447, "epoch": 5.850567044696464, "step": 17540 }, { "distill_loss": 0.09833948314189911, "epoch": 5.850567044696464, "step": 17540 }, { "epoch": 5.850567044696464, "ref_ce_loss": 0.08320891112089157, "step": 17540 }, { "epoch": 5.850567044696464, "loss": 0.3248119652271271, "step": 17540 }, { "ce_loss": 0.05598754063248634, "epoch": 5.850567044696464, "step": 17540 }, { "distill_loss": 0.13237228989601135, "epoch": 5.850567044696464, "step": 17540 }, { "epoch": 5.850567044696464, "ref_ce_loss": 0.05231640860438347, "step": 17540 }, { "epoch": 5.850567044696464, "loss": 0.21130509674549103, "step": 17540 }, { "ce_loss": 0.026743734255433083, "epoch": 5.850567044696464, "step": 17540 }, { "distill_loss": 0.10511060059070587, "epoch": 5.850567044696464, "step": 17540 }, { "epoch": 5.850567044696464, "ref_ce_loss": 0.05859754607081413, "step": 17540 }, { "epoch": 5.850567044696464, "loss": 0.4116530418395996, "step": 17540 }, { "ce_loss": 0.08053731918334961, "epoch": 5.850567044696464, "step": 17540 }, { "distill_loss": 0.21697655320167542, "epoch": 5.850567044696464, "step": 17540 }, { "epoch": 5.850567044696464, "ref_ce_loss": 0.08334068953990936, "step": 17540 }, { "epoch": 5.8539026017344895, "loss": 0.3875, "step": 17550 }, { "epoch": 5.8539026017344895, "grad_norm": 1.7867008447647095, "step": 17550 }, { "epoch": 5.8539026017344895, "learning_rate": 5.314463264274367e-05, "step": 17550 }, { "epoch": 5.8539026017344895, "loss": 0.32216352224349976, "step": 17550 }, { "ce_loss": 0.08698655664920807, "epoch": 5.8539026017344895, "step": 17550 }, { "distill_loss": 0.12306316196918488, "epoch": 5.8539026017344895, "step": 17550 }, { "epoch": 5.8539026017344895, "ref_ce_loss": 0.07335870712995529, "step": 17550 }, { "epoch": 5.8539026017344895, "loss": 0.3635510802268982, "step": 17550 }, { "ce_loss": 0.06758643686771393, "epoch": 5.8539026017344895, "step": 17550 }, { "distill_loss": 0.19247770309448242, "epoch": 5.8539026017344895, "step": 17550 }, { "epoch": 5.8539026017344895, "ref_ce_loss": 0.08190000057220459, "step": 17550 }, { "epoch": 5.8539026017344895, "loss": 0.4638984799385071, "step": 17550 }, { "ce_loss": 0.09081947803497314, "epoch": 5.8539026017344895, "step": 17550 }, { "distill_loss": 0.175666943192482, "epoch": 5.8539026017344895, "step": 17550 }, { "epoch": 5.8539026017344895, "ref_ce_loss": 0.09541428089141846, "step": 17550 }, { "epoch": 5.8539026017344895, "loss": 0.3402867913246155, "step": 17550 }, { "ce_loss": 0.08957655727863312, "epoch": 5.8539026017344895, "step": 17550 }, { "distill_loss": 0.1153869479894638, "epoch": 5.8539026017344895, "step": 17550 }, { "epoch": 5.8539026017344895, "ref_ce_loss": 0.09455670416355133, "step": 17550 }, { "epoch": 5.857238158772515, "loss": 0.3689, "step": 17560 }, { "epoch": 5.857238158772515, "grad_norm": 2.4782440662384033, "step": 17560 }, { "epoch": 5.857238158772515, "learning_rate": 5.2990047302426894e-05, "step": 17560 }, { "epoch": 5.857238158772515, "loss": 0.25175487995147705, "step": 17560 }, { "ce_loss": 0.018894441425800323, "epoch": 5.857238158772515, "step": 17560 }, { "distill_loss": 0.110230453312397, "epoch": 5.857238158772515, "step": 17560 }, { "epoch": 5.857238158772515, "ref_ce_loss": 0.04608863964676857, "step": 17560 }, { "epoch": 5.857238158772515, "loss": 0.4973832666873932, "step": 17560 }, { "ce_loss": 0.09207753837108612, "epoch": 5.857238158772515, "step": 17560 }, { "distill_loss": 0.14106489717960358, "epoch": 5.857238158772515, "step": 17560 }, { "epoch": 5.857238158772515, "ref_ce_loss": 0.05099315196275711, "step": 17560 }, { "epoch": 5.857238158772515, "loss": 0.27829593420028687, "step": 17560 }, { "ce_loss": 0.0815405324101448, "epoch": 5.857238158772515, "step": 17560 }, { "distill_loss": 0.11980370432138443, "epoch": 5.857238158772515, "step": 17560 }, { "epoch": 5.857238158772515, "ref_ce_loss": 0.053519003093242645, "step": 17560 }, { "epoch": 5.857238158772515, "loss": 0.5003367066383362, "step": 17560 }, { "ce_loss": 0.1114412397146225, "epoch": 5.857238158772515, "step": 17560 }, { "distill_loss": 0.17173542082309723, "epoch": 5.857238158772515, "step": 17560 }, { "epoch": 5.857238158772515, "ref_ce_loss": 0.09367650002241135, "step": 17560 }, { "epoch": 5.86057371581054, "loss": 0.3489, "step": 17570 }, { "epoch": 5.86057371581054, "grad_norm": 2.932281494140625, "step": 17570 }, { "epoch": 5.86057371581054, "learning_rate": 5.283563886999651e-05, "step": 17570 }, { "epoch": 5.86057371581054, "loss": 0.41849589347839355, "step": 17570 }, { "ce_loss": 0.07301030308008194, "epoch": 5.86057371581054, "step": 17570 }, { "distill_loss": 0.14979524910449982, "epoch": 5.86057371581054, "step": 17570 }, { "epoch": 5.86057371581054, "ref_ce_loss": 0.08146942406892776, "step": 17570 }, { "epoch": 5.86057371581054, "loss": 0.2620297074317932, "step": 17570 }, { "ce_loss": 0.060599468648433685, "epoch": 5.86057371581054, "step": 17570 }, { "distill_loss": 0.09925365447998047, "epoch": 5.86057371581054, "step": 17570 }, { "epoch": 5.86057371581054, "ref_ce_loss": 0.06362560391426086, "step": 17570 }, { "epoch": 5.86057371581054, "loss": 0.4664449691772461, "step": 17570 }, { "ce_loss": 0.04816756024956703, "epoch": 5.86057371581054, "step": 17570 }, { "distill_loss": 0.14493735134601593, "epoch": 5.86057371581054, "step": 17570 }, { "epoch": 5.86057371581054, "ref_ce_loss": 0.09040951728820801, "step": 17570 }, { "epoch": 5.86057371581054, "loss": 0.23289237916469574, "step": 17570 }, { "ce_loss": 0.015162667259573936, "epoch": 5.86057371581054, "step": 17570 }, { "distill_loss": 0.11945579946041107, "epoch": 5.86057371581054, "step": 17570 }, { "epoch": 5.86057371581054, "ref_ce_loss": 0.05992235243320465, "step": 17570 }, { "epoch": 5.863909272848566, "loss": 0.3386, "step": 17580 }, { "epoch": 5.863909272848566, "grad_norm": 1.6773673295974731, "step": 17580 }, { "epoch": 5.863909272848566, "learning_rate": 5.268140762703269e-05, "step": 17580 }, { "epoch": 5.863909272848566, "loss": 0.3034585118293762, "step": 17580 }, { "ce_loss": 0.04708658531308174, "epoch": 5.863909272848566, "step": 17580 }, { "distill_loss": 0.12628522515296936, "epoch": 5.863909272848566, "step": 17580 }, { "epoch": 5.863909272848566, "ref_ce_loss": 0.08748694509267807, "step": 17580 }, { "epoch": 5.863909272848566, "loss": 0.3233617842197418, "step": 17580 }, { "ce_loss": 0.10379615426063538, "epoch": 5.863909272848566, "step": 17580 }, { "distill_loss": 0.10835333913564682, "epoch": 5.863909272848566, "step": 17580 }, { "epoch": 5.863909272848566, "ref_ce_loss": 0.08786173909902573, "step": 17580 }, { "epoch": 5.863909272848566, "loss": 0.3202160596847534, "step": 17580 }, { "ce_loss": 0.042189840227365494, "epoch": 5.863909272848566, "step": 17580 }, { "distill_loss": 0.15545186400413513, "epoch": 5.863909272848566, "step": 17580 }, { "epoch": 5.863909272848566, "ref_ce_loss": 0.08296966552734375, "step": 17580 }, { "epoch": 5.863909272848566, "loss": 0.49785640835762024, "step": 17580 }, { "ce_loss": 0.09063033759593964, "epoch": 5.863909272848566, "step": 17580 }, { "distill_loss": 0.15018562972545624, "epoch": 5.863909272848566, "step": 17580 }, { "epoch": 5.863909272848566, "ref_ce_loss": 0.08623871207237244, "step": 17580 }, { "epoch": 5.867244829886591, "loss": 0.3401, "step": 17590 }, { "epoch": 5.867244829886591, "grad_norm": 4.520524501800537, "step": 17590 }, { "epoch": 5.867244829886591, "learning_rate": 5.2527353854792236e-05, "step": 17590 }, { "epoch": 5.867244829886591, "loss": 0.43522706627845764, "step": 17590 }, { "ce_loss": 0.0914255827665329, "epoch": 5.867244829886591, "step": 17590 }, { "distill_loss": 0.191763773560524, "epoch": 5.867244829886591, "step": 17590 }, { "epoch": 5.867244829886591, "ref_ce_loss": 0.07178188115358353, "step": 17590 }, { "epoch": 5.867244829886591, "loss": 0.31930312514305115, "step": 17590 }, { "ce_loss": 0.062497545033693314, "epoch": 5.867244829886591, "step": 17590 }, { "distill_loss": 0.186318039894104, "epoch": 5.867244829886591, "step": 17590 }, { "epoch": 5.867244829886591, "ref_ce_loss": 0.05268324166536331, "step": 17590 }, { "epoch": 5.867244829886591, "loss": 0.386909157037735, "step": 17590 }, { "ce_loss": 0.11041705310344696, "epoch": 5.867244829886591, "step": 17590 }, { "distill_loss": 0.1834145039319992, "epoch": 5.867244829886591, "step": 17590 }, { "epoch": 5.867244829886591, "ref_ce_loss": 0.09284200519323349, "step": 17590 }, { "epoch": 5.867244829886591, "loss": 0.2399304360151291, "step": 17590 }, { "ce_loss": 0.035906922072172165, "epoch": 5.867244829886591, "step": 17590 }, { "distill_loss": 0.08376122266054153, "epoch": 5.867244829886591, "step": 17590 }, { "epoch": 5.867244829886591, "ref_ce_loss": 0.03996507450938225, "step": 17590 }, { "epoch": 5.870580386924616, "loss": 0.3953, "step": 17600 }, { "epoch": 5.870580386924616, "grad_norm": 3.3450121879577637, "step": 17600 }, { "epoch": 5.870580386924616, "learning_rate": 5.237347783420854e-05, "step": 17600 }, { "epoch": 5.870580386924616, "loss": 0.6455330848693848, "step": 17600 }, { "ce_loss": 0.11936826258897781, "epoch": 5.870580386924616, "step": 17600 }, { "distill_loss": 0.1688607782125473, "epoch": 5.870580386924616, "step": 17600 }, { "epoch": 5.870580386924616, "ref_ce_loss": 0.08025792241096497, "step": 17600 }, { "epoch": 5.870580386924616, "loss": 0.41483816504478455, "step": 17600 }, { "ce_loss": 0.072411447763443, "epoch": 5.870580386924616, "step": 17600 }, { "distill_loss": 0.16342517733573914, "epoch": 5.870580386924616, "step": 17600 }, { "epoch": 5.870580386924616, "ref_ce_loss": 0.03735579922795296, "step": 17600 }, { "epoch": 5.870580386924616, "loss": 0.44591468572616577, "step": 17600 }, { "ce_loss": 0.036500539630651474, "epoch": 5.870580386924616, "step": 17600 }, { "distill_loss": 0.10168357193470001, "epoch": 5.870580386924616, "step": 17600 }, { "epoch": 5.870580386924616, "ref_ce_loss": 0.06447294354438782, "step": 17600 }, { "epoch": 5.870580386924616, "loss": 0.4203828275203705, "step": 17600 }, { "ce_loss": 0.12469840794801712, "epoch": 5.870580386924616, "step": 17600 }, { "distill_loss": 0.18332841992378235, "epoch": 5.870580386924616, "step": 17600 }, { "epoch": 5.870580386924616, "ref_ce_loss": 0.06972688436508179, "step": 17600 }, { "epoch": 5.873915943962642, "loss": 0.3433, "step": 17610 }, { "epoch": 5.873915943962642, "grad_norm": 2.724898099899292, "step": 17610 }, { "epoch": 5.873915943962642, "learning_rate": 5.221977984589075e-05, "step": 17610 }, { "epoch": 5.873915943962642, "loss": 0.5014619827270508, "step": 17610 }, { "ce_loss": 0.14343143999576569, "epoch": 5.873915943962642, "step": 17610 }, { "distill_loss": 0.16040818393230438, "epoch": 5.873915943962642, "step": 17610 }, { "epoch": 5.873915943962642, "ref_ce_loss": 0.09495589882135391, "step": 17610 }, { "epoch": 5.873915943962642, "loss": 0.2154809683561325, "step": 17610 }, { "ce_loss": 0.06345026940107346, "epoch": 5.873915943962642, "step": 17610 }, { "distill_loss": 0.10396941751241684, "epoch": 5.873915943962642, "step": 17610 }, { "epoch": 5.873915943962642, "ref_ce_loss": 0.047588132321834564, "step": 17610 }, { "epoch": 5.873915943962642, "loss": 0.28772875666618347, "step": 17610 }, { "ce_loss": 0.06648959964513779, "epoch": 5.873915943962642, "step": 17610 }, { "distill_loss": 0.1429111659526825, "epoch": 5.873915943962642, "step": 17610 }, { "epoch": 5.873915943962642, "ref_ce_loss": 0.05615722015500069, "step": 17610 }, { "epoch": 5.873915943962642, "loss": 0.19824743270874023, "step": 17610 }, { "ce_loss": 0.057200632989406586, "epoch": 5.873915943962642, "step": 17610 }, { "distill_loss": 0.10092482715845108, "epoch": 5.873915943962642, "step": 17610 }, { "epoch": 5.873915943962642, "ref_ce_loss": 0.04004830867052078, "step": 17610 }, { "epoch": 5.877251501000667, "loss": 0.3221, "step": 17620 }, { "epoch": 5.877251501000667, "grad_norm": 6.1942949295043945, "step": 17620 }, { "epoch": 5.877251501000667, "learning_rate": 5.206626017012337e-05, "step": 17620 }, { "epoch": 5.877251501000667, "loss": 0.5739012360572815, "step": 17620 }, { "ce_loss": 0.2185136377811432, "epoch": 5.877251501000667, "step": 17620 }, { "distill_loss": 0.15722893178462982, "epoch": 5.877251501000667, "step": 17620 }, { "epoch": 5.877251501000667, "ref_ce_loss": 0.11173129081726074, "step": 17620 }, { "epoch": 5.877251501000667, "loss": 0.3570323586463928, "step": 17620 }, { "ce_loss": 0.07485233247280121, "epoch": 5.877251501000667, "step": 17620 }, { "distill_loss": 0.12562787532806396, "epoch": 5.877251501000667, "step": 17620 }, { "epoch": 5.877251501000667, "ref_ce_loss": 0.08965486288070679, "step": 17620 }, { "epoch": 5.877251501000667, "loss": 0.5900654196739197, "step": 17620 }, { "ce_loss": 0.03555797412991524, "epoch": 5.877251501000667, "step": 17620 }, { "distill_loss": 0.10250119864940643, "epoch": 5.877251501000667, "step": 17620 }, { "epoch": 5.877251501000667, "ref_ce_loss": 0.05633779987692833, "step": 17620 }, { "epoch": 5.877251501000667, "loss": 0.23747488856315613, "step": 17620 }, { "ce_loss": 0.024216409772634506, "epoch": 5.877251501000667, "step": 17620 }, { "distill_loss": 0.11333998292684555, "epoch": 5.877251501000667, "step": 17620 }, { "epoch": 5.877251501000667, "ref_ce_loss": 0.05125695839524269, "step": 17620 }, { "epoch": 5.880587058038692, "loss": 0.3655, "step": 17630 }, { "epoch": 5.880587058038692, "grad_norm": 2.7308340072631836, "step": 17630 }, { "epoch": 5.880587058038692, "learning_rate": 5.1912919086865784e-05, "step": 17630 }, { "epoch": 5.880587058038692, "loss": 0.32668551802635193, "step": 17630 }, { "ce_loss": 0.07541286200284958, "epoch": 5.880587058038692, "step": 17630 }, { "distill_loss": 0.14657814800739288, "epoch": 5.880587058038692, "step": 17630 }, { "epoch": 5.880587058038692, "ref_ce_loss": 0.07333363592624664, "step": 17630 }, { "epoch": 5.880587058038692, "loss": 0.2846378684043884, "step": 17630 }, { "ce_loss": 0.04143878072500229, "epoch": 5.880587058038692, "step": 17630 }, { "distill_loss": 0.13445129990577698, "epoch": 5.880587058038692, "step": 17630 }, { "epoch": 5.880587058038692, "ref_ce_loss": 0.07378637790679932, "step": 17630 }, { "epoch": 5.880587058038692, "loss": 0.24098753929138184, "step": 17630 }, { "ce_loss": 0.029380042105913162, "epoch": 5.880587058038692, "step": 17630 }, { "distill_loss": 0.118621826171875, "epoch": 5.880587058038692, "step": 17630 }, { "epoch": 5.880587058038692, "ref_ce_loss": 0.06290895491838455, "step": 17630 }, { "epoch": 5.880587058038692, "loss": 0.3457111418247223, "step": 17630 }, { "ce_loss": 0.05599174648523331, "epoch": 5.880587058038692, "step": 17630 }, { "distill_loss": 0.13150471448898315, "epoch": 5.880587058038692, "step": 17630 }, { "epoch": 5.880587058038692, "ref_ce_loss": 0.08102560043334961, "step": 17630 }, { "epoch": 5.883922615076718, "loss": 0.3634, "step": 17640 }, { "epoch": 5.883922615076718, "grad_norm": 2.354527711868286, "step": 17640 }, { "epoch": 5.883922615076718, "learning_rate": 5.1759756875751543e-05, "step": 17640 }, { "epoch": 5.883922615076718, "loss": 0.4567744731903076, "step": 17640 }, { "ce_loss": 0.06487561762332916, "epoch": 5.883922615076718, "step": 17640 }, { "distill_loss": 0.14871536195278168, "epoch": 5.883922615076718, "step": 17640 }, { "epoch": 5.883922615076718, "ref_ce_loss": 0.1132284477353096, "step": 17640 }, { "epoch": 5.883922615076718, "loss": 0.3961792290210724, "step": 17640 }, { "ce_loss": 0.11353077739477158, "epoch": 5.883922615076718, "step": 17640 }, { "distill_loss": 0.12885455787181854, "epoch": 5.883922615076718, "step": 17640 }, { "epoch": 5.883922615076718, "ref_ce_loss": 0.09726857393980026, "step": 17640 }, { "epoch": 5.883922615076718, "loss": 0.32877179980278015, "step": 17640 }, { "ce_loss": 0.0983106791973114, "epoch": 5.883922615076718, "step": 17640 }, { "distill_loss": 0.15973711013793945, "epoch": 5.883922615076718, "step": 17640 }, { "epoch": 5.883922615076718, "ref_ce_loss": 0.07053600996732712, "step": 17640 }, { "epoch": 5.883922615076718, "loss": 0.19096627831459045, "step": 17640 }, { "ce_loss": 0.03290770947933197, "epoch": 5.883922615076718, "step": 17640 }, { "distill_loss": 0.11705254018306732, "epoch": 5.883922615076718, "step": 17640 }, { "epoch": 5.883922615076718, "ref_ce_loss": 0.04061080887913704, "step": 17640 }, { "epoch": 5.887258172114743, "loss": 0.3413, "step": 17650 }, { "epoch": 5.887258172114743, "grad_norm": 3.297376871109009, "step": 17650 }, { "epoch": 5.887258172114743, "learning_rate": 5.160677381608814e-05, "step": 17650 }, { "epoch": 5.887258172114743, "loss": 0.3749341070652008, "step": 17650 }, { "ce_loss": 0.05691100284457207, "epoch": 5.887258172114743, "step": 17650 }, { "distill_loss": 0.12351744621992111, "epoch": 5.887258172114743, "step": 17650 }, { "epoch": 5.887258172114743, "ref_ce_loss": 0.055408477783203125, "step": 17650 }, { "epoch": 5.887258172114743, "loss": 0.43304702639579773, "step": 17650 }, { "ce_loss": 0.07342436164617538, "epoch": 5.887258172114743, "step": 17650 }, { "distill_loss": 0.11726836860179901, "epoch": 5.887258172114743, "step": 17650 }, { "epoch": 5.887258172114743, "ref_ce_loss": 0.08709575235843658, "step": 17650 }, { "epoch": 5.887258172114743, "loss": 0.3348458409309387, "step": 17650 }, { "ce_loss": 0.019180668517947197, "epoch": 5.887258172114743, "step": 17650 }, { "distill_loss": 0.1126505583524704, "epoch": 5.887258172114743, "step": 17650 }, { "epoch": 5.887258172114743, "ref_ce_loss": 0.030465707182884216, "step": 17650 }, { "epoch": 5.887258172114743, "loss": 0.6081949472427368, "step": 17650 }, { "ce_loss": 0.104547418653965, "epoch": 5.887258172114743, "step": 17650 }, { "distill_loss": 0.17620466649532318, "epoch": 5.887258172114743, "step": 17650 }, { "epoch": 5.887258172114743, "ref_ce_loss": 0.11676531285047531, "step": 17650 }, { "epoch": 5.890593729152768, "loss": 0.3933, "step": 17660 }, { "epoch": 5.890593729152768, "grad_norm": 3.335609197616577, "step": 17660 }, { "epoch": 5.890593729152768, "learning_rate": 5.14539701868564e-05, "step": 17660 }, { "epoch": 5.890593729152768, "loss": 0.4062153100967407, "step": 17660 }, { "ce_loss": 0.07004745304584503, "epoch": 5.890593729152768, "step": 17660 }, { "distill_loss": 0.16154888272285461, "epoch": 5.890593729152768, "step": 17660 }, { "epoch": 5.890593729152768, "ref_ce_loss": 0.07381752133369446, "step": 17660 }, { "epoch": 5.890593729152768, "loss": 0.5447138547897339, "step": 17660 }, { "ce_loss": 0.10842008143663406, "epoch": 5.890593729152768, "step": 17660 }, { "distill_loss": 0.3149142861366272, "epoch": 5.890593729152768, "step": 17660 }, { "epoch": 5.890593729152768, "ref_ce_loss": 0.12105908244848251, "step": 17660 }, { "epoch": 5.890593729152768, "loss": 0.37604960799217224, "step": 17660 }, { "ce_loss": 0.07257735729217529, "epoch": 5.890593729152768, "step": 17660 }, { "distill_loss": 0.1173940971493721, "epoch": 5.890593729152768, "step": 17660 }, { "epoch": 5.890593729152768, "ref_ce_loss": 0.09473676234483719, "step": 17660 }, { "epoch": 5.890593729152768, "loss": 0.3164910078048706, "step": 17660 }, { "ce_loss": 0.03366474434733391, "epoch": 5.890593729152768, "step": 17660 }, { "distill_loss": 0.1109549030661583, "epoch": 5.890593729152768, "step": 17660 }, { "epoch": 5.890593729152768, "ref_ce_loss": 0.08360662311315536, "step": 17660 }, { "epoch": 5.893929286190794, "loss": 0.3591, "step": 17670 }, { "epoch": 5.893929286190794, "grad_norm": 2.3944671154022217, "step": 17670 }, { "epoch": 5.893929286190794, "learning_rate": 5.1301346266709684e-05, "step": 17670 }, { "epoch": 5.893929286190794, "loss": 0.21314899623394012, "step": 17670 }, { "ce_loss": 0.03496822342276573, "epoch": 5.893929286190794, "step": 17670 }, { "distill_loss": 0.11594592034816742, "epoch": 5.893929286190794, "step": 17670 }, { "epoch": 5.893929286190794, "ref_ce_loss": 0.047304894775152206, "step": 17670 }, { "epoch": 5.893929286190794, "loss": 0.467916876077652, "step": 17670 }, { "ce_loss": 0.15920695662498474, "epoch": 5.893929286190794, "step": 17670 }, { "distill_loss": 0.15867634117603302, "epoch": 5.893929286190794, "step": 17670 }, { "epoch": 5.893929286190794, "ref_ce_loss": 0.07394873350858688, "step": 17670 }, { "epoch": 5.893929286190794, "loss": 0.19804324209690094, "step": 17670 }, { "ce_loss": 0.018038207665085793, "epoch": 5.893929286190794, "step": 17670 }, { "distill_loss": 0.09465809911489487, "epoch": 5.893929286190794, "step": 17670 }, { "epoch": 5.893929286190794, "ref_ce_loss": 0.05995374172925949, "step": 17670 }, { "epoch": 5.893929286190794, "loss": 0.4456608295440674, "step": 17670 }, { "ce_loss": 0.13386403024196625, "epoch": 5.893929286190794, "step": 17670 }, { "distill_loss": 0.17635229229927063, "epoch": 5.893929286190794, "step": 17670 }, { "epoch": 5.893929286190794, "ref_ce_loss": 0.07214650511741638, "step": 17670 }, { "epoch": 5.897264843228819, "loss": 0.3387, "step": 17680 }, { "epoch": 5.897264843228819, "grad_norm": 3.302736759185791, "step": 17680 }, { "epoch": 5.897264843228819, "learning_rate": 5.114890233397405e-05, "step": 17680 }, { "epoch": 5.897264843228819, "loss": 0.3278179168701172, "step": 17680 }, { "ce_loss": 0.06378965824842453, "epoch": 5.897264843228819, "step": 17680 }, { "distill_loss": 0.10937260091304779, "epoch": 5.897264843228819, "step": 17680 }, { "epoch": 5.897264843228819, "ref_ce_loss": 0.08671337366104126, "step": 17680 }, { "epoch": 5.897264843228819, "loss": 1.0311903953552246, "step": 17680 }, { "ce_loss": 0.1758325695991516, "epoch": 5.897264843228819, "step": 17680 }, { "distill_loss": 0.17192047834396362, "epoch": 5.897264843228819, "step": 17680 }, { "epoch": 5.897264843228819, "ref_ce_loss": 0.07445625215768814, "step": 17680 }, { "epoch": 5.897264843228819, "loss": 0.46555769443511963, "step": 17680 }, { "ce_loss": 0.0498911514878273, "epoch": 5.897264843228819, "step": 17680 }, { "distill_loss": 0.1123102605342865, "epoch": 5.897264843228819, "step": 17680 }, { "epoch": 5.897264843228819, "ref_ce_loss": 0.047028716653585434, "step": 17680 }, { "epoch": 5.897264843228819, "loss": 0.6143039464950562, "step": 17680 }, { "ce_loss": 0.12528234720230103, "epoch": 5.897264843228819, "step": 17680 }, { "distill_loss": 0.15658093988895416, "epoch": 5.897264843228819, "step": 17680 }, { "epoch": 5.897264843228819, "ref_ce_loss": 0.0682942271232605, "step": 17680 }, { "epoch": 5.900600400266844, "loss": 0.3652, "step": 17690 }, { "epoch": 5.900600400266844, "grad_norm": 2.636820077896118, "step": 17690 }, { "epoch": 5.900600400266844, "learning_rate": 5.0996638666646916e-05, "step": 17690 }, { "epoch": 5.900600400266844, "loss": 0.4062114357948303, "step": 17690 }, { "ce_loss": 0.05009109526872635, "epoch": 5.900600400266844, "step": 17690 }, { "distill_loss": 0.15433265268802643, "epoch": 5.900600400266844, "step": 17690 }, { "epoch": 5.900600400266844, "ref_ce_loss": 0.09808304905891418, "step": 17690 }, { "epoch": 5.900600400266844, "loss": 0.6093151569366455, "step": 17690 }, { "ce_loss": 0.09959680587053299, "epoch": 5.900600400266844, "step": 17690 }, { "distill_loss": 0.1387074738740921, "epoch": 5.900600400266844, "step": 17690 }, { "epoch": 5.900600400266844, "ref_ce_loss": 0.05216062441468239, "step": 17690 }, { "epoch": 5.900600400266844, "loss": 0.3263792395591736, "step": 17690 }, { "ce_loss": 0.09031203389167786, "epoch": 5.900600400266844, "step": 17690 }, { "distill_loss": 0.13475021719932556, "epoch": 5.900600400266844, "step": 17690 }, { "epoch": 5.900600400266844, "ref_ce_loss": 0.06634215265512466, "step": 17690 }, { "epoch": 5.900600400266844, "loss": 0.35873445868492126, "step": 17690 }, { "ce_loss": 0.069797083735466, "epoch": 5.900600400266844, "step": 17690 }, { "distill_loss": 0.15537424385547638, "epoch": 5.900600400266844, "step": 17690 }, { "epoch": 5.900600400266844, "ref_ce_loss": 0.11273673176765442, "step": 17690 }, { "epoch": 5.90393595730487, "loss": 0.3705, "step": 17700 }, { "epoch": 5.90393595730487, "grad_norm": 1.9607138633728027, "step": 17700 }, { "epoch": 5.90393595730487, "learning_rate": 5.084455554239724e-05, "step": 17700 }, { "epoch": 5.90393595730487, "loss": 0.2984341084957123, "step": 17700 }, { "ce_loss": 0.04308363050222397, "epoch": 5.90393595730487, "step": 17700 }, { "distill_loss": 0.1266622692346573, "epoch": 5.90393595730487, "step": 17700 }, { "epoch": 5.90393595730487, "ref_ce_loss": 0.06170135736465454, "step": 17700 }, { "epoch": 5.90393595730487, "loss": 0.2519134283065796, "step": 17700 }, { "ce_loss": 0.03667460009455681, "epoch": 5.90393595730487, "step": 17700 }, { "distill_loss": 0.12148652225732803, "epoch": 5.90393595730487, "step": 17700 }, { "epoch": 5.90393595730487, "ref_ce_loss": 0.05401930958032608, "step": 17700 }, { "epoch": 5.90393595730487, "loss": 0.42079973220825195, "step": 17700 }, { "ce_loss": 0.07386600226163864, "epoch": 5.90393595730487, "step": 17700 }, { "distill_loss": 0.20743930339813232, "epoch": 5.90393595730487, "step": 17700 }, { "epoch": 5.90393595730487, "ref_ce_loss": 0.10644058138132095, "step": 17700 }, { "epoch": 5.90393595730487, "loss": 0.5361905694007874, "step": 17700 }, { "ce_loss": 0.08765469491481781, "epoch": 5.90393595730487, "step": 17700 }, { "distill_loss": 0.13080808520317078, "epoch": 5.90393595730487, "step": 17700 }, { "epoch": 5.90393595730487, "ref_ce_loss": 0.06986983120441437, "step": 17700 }, { "epoch": 5.907271514342895, "loss": 0.3395, "step": 17710 }, { "epoch": 5.907271514342895, "grad_norm": 2.605874538421631, "step": 17710 }, { "epoch": 5.907271514342895, "learning_rate": 5.069265323856464e-05, "step": 17710 }, { "epoch": 5.907271514342895, "loss": 0.3166840672492981, "step": 17710 }, { "ce_loss": 0.07511896640062332, "epoch": 5.907271514342895, "step": 17710 }, { "distill_loss": 0.11693719774484634, "epoch": 5.907271514342895, "step": 17710 }, { "epoch": 5.907271514342895, "ref_ce_loss": 0.04918194189667702, "step": 17710 }, { "epoch": 5.907271514342895, "loss": 0.2965814471244812, "step": 17710 }, { "ce_loss": 0.029125245288014412, "epoch": 5.907271514342895, "step": 17710 }, { "distill_loss": 0.1444043070077896, "epoch": 5.907271514342895, "step": 17710 }, { "epoch": 5.907271514342895, "ref_ce_loss": 0.05168358236551285, "step": 17710 }, { "epoch": 5.907271514342895, "loss": 0.3079814612865448, "step": 17710 }, { "ce_loss": 0.07331456243991852, "epoch": 5.907271514342895, "step": 17710 }, { "distill_loss": 0.16929419338703156, "epoch": 5.907271514342895, "step": 17710 }, { "epoch": 5.907271514342895, "ref_ce_loss": 0.0648675486445427, "step": 17710 }, { "epoch": 5.907271514342895, "loss": 0.2419438362121582, "step": 17710 }, { "ce_loss": 0.032821644097566605, "epoch": 5.907271514342895, "step": 17710 }, { "distill_loss": 0.16019362211227417, "epoch": 5.907271514342895, "step": 17710 }, { "epoch": 5.907271514342895, "ref_ce_loss": 0.04862841218709946, "step": 17710 }, { "epoch": 5.9106070713809205, "loss": 0.347, "step": 17720 }, { "epoch": 5.9106070713809205, "grad_norm": 2.5357227325439453, "step": 17720 }, { "epoch": 5.9106070713809205, "learning_rate": 5.054093203215896e-05, "step": 17720 }, { "epoch": 5.9106070713809205, "loss": 0.40148186683654785, "step": 17720 }, { "ce_loss": 0.11128007620573044, "epoch": 5.9106070713809205, "step": 17720 }, { "distill_loss": 0.18451912701129913, "epoch": 5.9106070713809205, "step": 17720 }, { "epoch": 5.9106070713809205, "ref_ce_loss": 0.10536003857851028, "step": 17720 }, { "epoch": 5.9106070713809205, "loss": 0.21454600989818573, "step": 17720 }, { "ce_loss": 0.060639139264822006, "epoch": 5.9106070713809205, "step": 17720 }, { "distill_loss": 0.08433546870946884, "epoch": 5.9106070713809205, "step": 17720 }, { "epoch": 5.9106070713809205, "ref_ce_loss": 0.05175725743174553, "step": 17720 }, { "epoch": 5.9106070713809205, "loss": 0.42682793736457825, "step": 17720 }, { "ce_loss": 0.08756023645401001, "epoch": 5.9106070713809205, "step": 17720 }, { "distill_loss": 0.17866460978984833, "epoch": 5.9106070713809205, "step": 17720 }, { "epoch": 5.9106070713809205, "ref_ce_loss": 0.090923972427845, "step": 17720 }, { "epoch": 5.9106070713809205, "loss": 0.23504258692264557, "step": 17720 }, { "ce_loss": 0.03457394242286682, "epoch": 5.9106070713809205, "step": 17720 }, { "distill_loss": 0.10484711080789566, "epoch": 5.9106070713809205, "step": 17720 }, { "epoch": 5.9106070713809205, "ref_ce_loss": 0.061616200953722, "step": 17720 }, { "epoch": 5.913942628418946, "loss": 0.3788, "step": 17730 }, { "epoch": 5.913942628418946, "grad_norm": 2.757432460784912, "step": 17730 }, { "epoch": 5.913942628418946, "learning_rate": 5.038939219985979e-05, "step": 17730 }, { "epoch": 5.913942628418946, "loss": 0.7194844484329224, "step": 17730 }, { "ce_loss": 0.11522924154996872, "epoch": 5.913942628418946, "step": 17730 }, { "distill_loss": 0.15983763337135315, "epoch": 5.913942628418946, "step": 17730 }, { "epoch": 5.913942628418946, "ref_ce_loss": 0.059831805527210236, "step": 17730 }, { "epoch": 5.913942628418946, "loss": 0.3257623314857483, "step": 17730 }, { "ce_loss": 0.0717523992061615, "epoch": 5.913942628418946, "step": 17730 }, { "distill_loss": 0.10491523146629333, "epoch": 5.913942628418946, "step": 17730 }, { "epoch": 5.913942628418946, "ref_ce_loss": 0.08156079798936844, "step": 17730 }, { "epoch": 5.913942628418946, "loss": 0.331621378660202, "step": 17730 }, { "ce_loss": 0.0417015366256237, "epoch": 5.913942628418946, "step": 17730 }, { "distill_loss": 0.1441517323255539, "epoch": 5.913942628418946, "step": 17730 }, { "epoch": 5.913942628418946, "ref_ce_loss": 0.06556253135204315, "step": 17730 }, { "epoch": 5.913942628418946, "loss": 0.24882987141609192, "step": 17730 }, { "ce_loss": 0.04481315240263939, "epoch": 5.913942628418946, "step": 17730 }, { "distill_loss": 0.12070439755916595, "epoch": 5.913942628418946, "step": 17730 }, { "epoch": 5.913942628418946, "ref_ce_loss": 0.04913991317152977, "step": 17730 }, { "epoch": 5.917278185456971, "loss": 0.3517, "step": 17740 }, { "epoch": 5.917278185456971, "grad_norm": 2.0407073497772217, "step": 17740 }, { "epoch": 5.917278185456971, "learning_rate": 5.023803401801618e-05, "step": 17740 }, { "epoch": 5.917278185456971, "loss": 0.3508586287498474, "step": 17740 }, { "ce_loss": 0.04194953292608261, "epoch": 5.917278185456971, "step": 17740 }, { "distill_loss": 0.12034259736537933, "epoch": 5.917278185456971, "step": 17740 }, { "epoch": 5.917278185456971, "ref_ce_loss": 0.06494162231683731, "step": 17740 }, { "epoch": 5.917278185456971, "loss": 0.22291912138462067, "step": 17740 }, { "ce_loss": 0.0345107764005661, "epoch": 5.917278185456971, "step": 17740 }, { "distill_loss": 0.09842812269926071, "epoch": 5.917278185456971, "step": 17740 }, { "epoch": 5.917278185456971, "ref_ce_loss": 0.049962058663368225, "step": 17740 }, { "epoch": 5.917278185456971, "loss": 0.39541149139404297, "step": 17740 }, { "ce_loss": 0.08839607238769531, "epoch": 5.917278185456971, "step": 17740 }, { "distill_loss": 0.15726499259471893, "epoch": 5.917278185456971, "step": 17740 }, { "epoch": 5.917278185456971, "ref_ce_loss": 0.08510472625494003, "step": 17740 }, { "epoch": 5.917278185456971, "loss": 0.2958389222621918, "step": 17740 }, { "ce_loss": 0.07011163234710693, "epoch": 5.917278185456971, "step": 17740 }, { "distill_loss": 0.13538995385169983, "epoch": 5.917278185456971, "step": 17740 }, { "epoch": 5.917278185456971, "ref_ce_loss": 0.0671728178858757, "step": 17740 }, { "epoch": 5.9206137424949965, "loss": 0.3611, "step": 17750 }, { "epoch": 5.9206137424949965, "grad_norm": 3.440727949142456, "step": 17750 }, { "epoch": 5.9206137424949965, "learning_rate": 5.0086857762645574e-05, "step": 17750 }, { "epoch": 5.9206137424949965, "loss": 0.2400866150856018, "step": 17750 }, { "ce_loss": 0.04839707911014557, "epoch": 5.9206137424949965, "step": 17750 }, { "distill_loss": 0.12277629971504211, "epoch": 5.9206137424949965, "step": 17750 }, { "epoch": 5.9206137424949965, "ref_ce_loss": 0.03812626749277115, "step": 17750 }, { "epoch": 5.9206137424949965, "loss": 0.33886978030204773, "step": 17750 }, { "ce_loss": 0.053880248218774796, "epoch": 5.9206137424949965, "step": 17750 }, { "distill_loss": 0.19124048948287964, "epoch": 5.9206137424949965, "step": 17750 }, { "epoch": 5.9206137424949965, "ref_ce_loss": 0.06993494182825089, "step": 17750 }, { "epoch": 5.9206137424949965, "loss": 0.4030514359474182, "step": 17750 }, { "ce_loss": 0.0309885386377573, "epoch": 5.9206137424949965, "step": 17750 }, { "distill_loss": 0.12448279559612274, "epoch": 5.9206137424949965, "step": 17750 }, { "epoch": 5.9206137424949965, "ref_ce_loss": 0.06645922362804413, "step": 17750 }, { "epoch": 5.9206137424949965, "loss": 0.3506067395210266, "step": 17750 }, { "ce_loss": 0.09792735427618027, "epoch": 5.9206137424949965, "step": 17750 }, { "distill_loss": 0.15322014689445496, "epoch": 5.9206137424949965, "step": 17750 }, { "epoch": 5.9206137424949965, "ref_ce_loss": 0.05199942737817764, "step": 17750 }, { "epoch": 5.923949299533022, "loss": 0.3787, "step": 17760 }, { "epoch": 5.923949299533022, "grad_norm": 1.9704997539520264, "step": 17760 }, { "epoch": 5.923949299533022, "learning_rate": 4.9935863709433945e-05, "step": 17760 }, { "epoch": 5.923949299533022, "loss": 0.33312490582466125, "step": 17760 }, { "ce_loss": 0.09216049313545227, "epoch": 5.923949299533022, "step": 17760 }, { "distill_loss": 0.11537503451108932, "epoch": 5.923949299533022, "step": 17760 }, { "epoch": 5.923949299533022, "ref_ce_loss": 0.06690085679292679, "step": 17760 }, { "epoch": 5.923949299533022, "loss": 0.6168229579925537, "step": 17760 }, { "ce_loss": 0.1484738290309906, "epoch": 5.923949299533022, "step": 17760 }, { "distill_loss": 0.18674319982528687, "epoch": 5.923949299533022, "step": 17760 }, { "epoch": 5.923949299533022, "ref_ce_loss": 0.13398297131061554, "step": 17760 }, { "epoch": 5.923949299533022, "loss": 0.4197208285331726, "step": 17760 }, { "ce_loss": 0.08689359575510025, "epoch": 5.923949299533022, "step": 17760 }, { "distill_loss": 0.15195444226264954, "epoch": 5.923949299533022, "step": 17760 }, { "epoch": 5.923949299533022, "ref_ce_loss": 0.045860983431339264, "step": 17760 }, { "epoch": 5.923949299533022, "loss": 0.622891902923584, "step": 17760 }, { "ce_loss": 0.13707229495048523, "epoch": 5.923949299533022, "step": 17760 }, { "distill_loss": 0.2516328692436218, "epoch": 5.923949299533022, "step": 17760 }, { "epoch": 5.923949299533022, "ref_ce_loss": 0.09355565160512924, "step": 17760 }, { "epoch": 5.927284856571047, "loss": 0.3647, "step": 17770 }, { "epoch": 5.927284856571047, "grad_norm": 3.1013424396514893, "step": 17770 }, { "epoch": 5.927284856571047, "learning_rate": 4.978505213373479e-05, "step": 17770 }, { "epoch": 5.927284856571047, "loss": 0.25511544942855835, "step": 17770 }, { "ce_loss": 0.04221971705555916, "epoch": 5.927284856571047, "step": 17770 }, { "distill_loss": 0.10919360816478729, "epoch": 5.927284856571047, "step": 17770 }, { "epoch": 5.927284856571047, "ref_ce_loss": 0.07045939564704895, "step": 17770 }, { "epoch": 5.927284856571047, "loss": 0.39829209446907043, "step": 17770 }, { "ce_loss": 0.13580723106861115, "epoch": 5.927284856571047, "step": 17770 }, { "distill_loss": 0.16190184652805328, "epoch": 5.927284856571047, "step": 17770 }, { "epoch": 5.927284856571047, "ref_ce_loss": 0.08752710372209549, "step": 17770 }, { "epoch": 5.927284856571047, "loss": 0.49762338399887085, "step": 17770 }, { "ce_loss": 0.11486893892288208, "epoch": 5.927284856571047, "step": 17770 }, { "distill_loss": 0.1458262801170349, "epoch": 5.927284856571047, "step": 17770 }, { "epoch": 5.927284856571047, "ref_ce_loss": 0.08525654673576355, "step": 17770 }, { "epoch": 5.927284856571047, "loss": 0.34713953733444214, "step": 17770 }, { "ce_loss": 0.13515137135982513, "epoch": 5.927284856571047, "step": 17770 }, { "distill_loss": 0.13119323551654816, "epoch": 5.927284856571047, "step": 17770 }, { "epoch": 5.927284856571047, "ref_ce_loss": 0.06520017981529236, "step": 17770 }, { "epoch": 5.9306204136090725, "loss": 0.3589, "step": 17780 }, { "epoch": 5.9306204136090725, "grad_norm": 4.23400354385376, "step": 17780 }, { "epoch": 5.9306204136090725, "learning_rate": 4.9634423310568963e-05, "step": 17780 }, { "epoch": 5.9306204136090725, "loss": 0.4135797321796417, "step": 17780 }, { "ce_loss": 0.04827263206243515, "epoch": 5.9306204136090725, "step": 17780 }, { "distill_loss": 0.16114744544029236, "epoch": 5.9306204136090725, "step": 17780 }, { "epoch": 5.9306204136090725, "ref_ce_loss": 0.09109925478696823, "step": 17780 }, { "epoch": 5.9306204136090725, "loss": 0.7013535499572754, "step": 17780 }, { "ce_loss": 0.04648328199982643, "epoch": 5.9306204136090725, "step": 17780 }, { "distill_loss": 0.1734389215707779, "epoch": 5.9306204136090725, "step": 17780 }, { "epoch": 5.9306204136090725, "ref_ce_loss": 0.12075760960578918, "step": 17780 }, { "epoch": 5.9306204136090725, "loss": 0.32692864537239075, "step": 17780 }, { "ce_loss": 0.08308849483728409, "epoch": 5.9306204136090725, "step": 17780 }, { "distill_loss": 0.1307058483362198, "epoch": 5.9306204136090725, "step": 17780 }, { "epoch": 5.9306204136090725, "ref_ce_loss": 0.08304446935653687, "step": 17780 }, { "epoch": 5.9306204136090725, "loss": 0.8171455264091492, "step": 17780 }, { "ce_loss": 0.1697182059288025, "epoch": 5.9306204136090725, "step": 17780 }, { "distill_loss": 0.2494955211877823, "epoch": 5.9306204136090725, "step": 17780 }, { "epoch": 5.9306204136090725, "ref_ce_loss": 0.14302107691764832, "step": 17780 }, { "epoch": 5.933955970647098, "loss": 0.4004, "step": 17790 }, { "epoch": 5.933955970647098, "grad_norm": 3.5227556228637695, "step": 17790 }, { "epoch": 5.933955970647098, "learning_rate": 4.948397751462402e-05, "step": 17790 }, { "epoch": 5.933955970647098, "loss": 0.2799118161201477, "step": 17790 }, { "ce_loss": 0.02078850008547306, "epoch": 5.933955970647098, "step": 17790 }, { "distill_loss": 0.11289118975400925, "epoch": 5.933955970647098, "step": 17790 }, { "epoch": 5.933955970647098, "ref_ce_loss": 0.07155484706163406, "step": 17790 }, { "epoch": 5.933955970647098, "loss": 0.3532387614250183, "step": 17790 }, { "ce_loss": 0.06364618241786957, "epoch": 5.933955970647098, "step": 17790 }, { "distill_loss": 0.13902947306632996, "epoch": 5.933955970647098, "step": 17790 }, { "epoch": 5.933955970647098, "ref_ce_loss": 0.07811892777681351, "step": 17790 }, { "epoch": 5.933955970647098, "loss": 0.40037864446640015, "step": 17790 }, { "ce_loss": 0.13156820833683014, "epoch": 5.933955970647098, "step": 17790 }, { "distill_loss": 0.16982513666152954, "epoch": 5.933955970647098, "step": 17790 }, { "epoch": 5.933955970647098, "ref_ce_loss": 0.07489024102687836, "step": 17790 }, { "epoch": 5.933955970647098, "loss": 0.32981160283088684, "step": 17790 }, { "ce_loss": 0.08601900935173035, "epoch": 5.933955970647098, "step": 17790 }, { "distill_loss": 0.15488427877426147, "epoch": 5.933955970647098, "step": 17790 }, { "epoch": 5.933955970647098, "ref_ce_loss": 0.08870402723550797, "step": 17790 }, { "epoch": 5.937291527685123, "loss": 0.3339, "step": 17800 }, { "epoch": 5.937291527685123, "grad_norm": 2.76710844039917, "step": 17800 }, { "epoch": 5.937291527685123, "learning_rate": 4.933371502025377e-05, "step": 17800 }, { "epoch": 5.937291527685123, "loss": 0.19565752148628235, "step": 17800 }, { "ce_loss": 0.032000426203012466, "epoch": 5.937291527685123, "step": 17800 }, { "distill_loss": 0.08187977969646454, "epoch": 5.937291527685123, "step": 17800 }, { "epoch": 5.937291527685123, "ref_ce_loss": 0.04776681214570999, "step": 17800 }, { "epoch": 5.937291527685123, "loss": 0.3080092668533325, "step": 17800 }, { "ce_loss": 0.05300501361489296, "epoch": 5.937291527685123, "step": 17800 }, { "distill_loss": 0.11545910686254501, "epoch": 5.937291527685123, "step": 17800 }, { "epoch": 5.937291527685123, "ref_ce_loss": 0.08720026910305023, "step": 17800 }, { "epoch": 5.937291527685123, "loss": 0.9395288228988647, "step": 17800 }, { "ce_loss": 0.09806068241596222, "epoch": 5.937291527685123, "step": 17800 }, { "distill_loss": 0.1455065906047821, "epoch": 5.937291527685123, "step": 17800 }, { "epoch": 5.937291527685123, "ref_ce_loss": 0.098774254322052, "step": 17800 }, { "epoch": 5.937291527685123, "loss": 0.3972787857055664, "step": 17800 }, { "ce_loss": 0.08753319829702377, "epoch": 5.937291527685123, "step": 17800 }, { "distill_loss": 0.1433761566877365, "epoch": 5.937291527685123, "step": 17800 }, { "epoch": 5.937291527685123, "ref_ce_loss": 0.07781031727790833, "step": 17800 }, { "epoch": 5.940627084723149, "loss": 0.3778, "step": 17810 }, { "epoch": 5.940627084723149, "grad_norm": 2.1027743816375732, "step": 17810 }, { "epoch": 5.940627084723149, "learning_rate": 4.918363610147775e-05, "step": 17810 }, { "epoch": 5.940627084723149, "loss": 0.2537614703178406, "step": 17810 }, { "ce_loss": 0.08498618751764297, "epoch": 5.940627084723149, "step": 17810 }, { "distill_loss": 0.10000818222761154, "epoch": 5.940627084723149, "step": 17810 }, { "epoch": 5.940627084723149, "ref_ce_loss": 0.044208452105522156, "step": 17810 }, { "epoch": 5.940627084723149, "loss": 0.34233108162879944, "step": 17810 }, { "ce_loss": 0.10245108604431152, "epoch": 5.940627084723149, "step": 17810 }, { "distill_loss": 0.1445772349834442, "epoch": 5.940627084723149, "step": 17810 }, { "epoch": 5.940627084723149, "ref_ce_loss": 0.09517716616392136, "step": 17810 }, { "epoch": 5.940627084723149, "loss": 0.6560655236244202, "step": 17810 }, { "ce_loss": 0.07113312929868698, "epoch": 5.940627084723149, "step": 17810 }, { "distill_loss": 0.13794416189193726, "epoch": 5.940627084723149, "step": 17810 }, { "epoch": 5.940627084723149, "ref_ce_loss": 0.036306753754615784, "step": 17810 }, { "epoch": 5.940627084723149, "loss": 0.7157806158065796, "step": 17810 }, { "ce_loss": 0.0921962708234787, "epoch": 5.940627084723149, "step": 17810 }, { "distill_loss": 0.17530521750450134, "epoch": 5.940627084723149, "step": 17810 }, { "epoch": 5.940627084723149, "ref_ce_loss": 0.10351521521806717, "step": 17810 }, { "epoch": 5.943962641761174, "loss": 0.4083, "step": 17820 }, { "epoch": 5.943962641761174, "grad_norm": 3.1235644817352295, "step": 17820 }, { "epoch": 5.943962641761174, "learning_rate": 4.903374103198064e-05, "step": 17820 }, { "epoch": 5.943962641761174, "loss": 0.342007040977478, "step": 17820 }, { "ce_loss": 0.05079389363527298, "epoch": 5.943962641761174, "step": 17820 }, { "distill_loss": 0.12523949146270752, "epoch": 5.943962641761174, "step": 17820 }, { "epoch": 5.943962641761174, "ref_ce_loss": 0.06961704790592194, "step": 17820 }, { "epoch": 5.943962641761174, "loss": 0.36865371465682983, "step": 17820 }, { "ce_loss": 0.03996824845671654, "epoch": 5.943962641761174, "step": 17820 }, { "distill_loss": 0.21217110753059387, "epoch": 5.943962641761174, "step": 17820 }, { "epoch": 5.943962641761174, "ref_ce_loss": 0.08097124099731445, "step": 17820 }, { "epoch": 5.943962641761174, "loss": 0.33703479170799255, "step": 17820 }, { "ce_loss": 0.04888799786567688, "epoch": 5.943962641761174, "step": 17820 }, { "distill_loss": 0.1261524111032486, "epoch": 5.943962641761174, "step": 17820 }, { "epoch": 5.943962641761174, "ref_ce_loss": 0.05606861412525177, "step": 17820 }, { "epoch": 5.943962641761174, "loss": 0.28407689929008484, "step": 17820 }, { "ce_loss": 0.03848010301589966, "epoch": 5.943962641761174, "step": 17820 }, { "distill_loss": 0.11298099160194397, "epoch": 5.943962641761174, "step": 17820 }, { "epoch": 5.943962641761174, "ref_ce_loss": 0.06575914472341537, "step": 17820 }, { "epoch": 5.947298198799199, "loss": 0.3712, "step": 17830 }, { "epoch": 5.947298198799199, "grad_norm": 2.5720245838165283, "step": 17830 }, { "epoch": 5.947298198799199, "learning_rate": 4.8884030085111934e-05, "step": 17830 }, { "epoch": 5.947298198799199, "loss": 0.2565248906612396, "step": 17830 }, { "ce_loss": 0.05670475959777832, "epoch": 5.947298198799199, "step": 17830 }, { "distill_loss": 0.1039658635854721, "epoch": 5.947298198799199, "step": 17830 }, { "epoch": 5.947298198799199, "ref_ce_loss": 0.06518025696277618, "step": 17830 }, { "epoch": 5.947298198799199, "loss": 0.32152819633483887, "step": 17830 }, { "ce_loss": 0.03222854435443878, "epoch": 5.947298198799199, "step": 17830 }, { "distill_loss": 0.13430121541023254, "epoch": 5.947298198799199, "step": 17830 }, { "epoch": 5.947298198799199, "ref_ce_loss": 0.08920446783304214, "step": 17830 }, { "epoch": 5.947298198799199, "loss": 0.23141776025295258, "step": 17830 }, { "ce_loss": 0.07320418953895569, "epoch": 5.947298198799199, "step": 17830 }, { "distill_loss": 0.104472815990448, "epoch": 5.947298198799199, "step": 17830 }, { "epoch": 5.947298198799199, "ref_ce_loss": 0.05357253551483154, "step": 17830 }, { "epoch": 5.947298198799199, "loss": 0.2036972939968109, "step": 17830 }, { "ce_loss": 0.02201797068119049, "epoch": 5.947298198799199, "step": 17830 }, { "distill_loss": 0.11067456752061844, "epoch": 5.947298198799199, "step": 17830 }, { "epoch": 5.947298198799199, "ref_ce_loss": 0.05038437992334366, "step": 17830 }, { "epoch": 5.950633755837225, "loss": 0.3526, "step": 17840 }, { "epoch": 5.950633755837225, "grad_norm": 2.784125804901123, "step": 17840 }, { "epoch": 5.950633755837225, "learning_rate": 4.8734503533885355e-05, "step": 17840 }, { "epoch": 5.950633755837225, "loss": 0.193734809756279, "step": 17840 }, { "ce_loss": 0.012308568693697453, "epoch": 5.950633755837225, "step": 17840 }, { "distill_loss": 0.09944108128547668, "epoch": 5.950633755837225, "step": 17840 }, { "epoch": 5.950633755837225, "ref_ce_loss": 0.03873451426625252, "step": 17840 }, { "epoch": 5.950633755837225, "loss": 0.548555314540863, "step": 17840 }, { "ce_loss": 0.07860098034143448, "epoch": 5.950633755837225, "step": 17840 }, { "distill_loss": 0.12121152877807617, "epoch": 5.950633755837225, "step": 17840 }, { "epoch": 5.950633755837225, "ref_ce_loss": 0.06695293635129929, "step": 17840 }, { "epoch": 5.950633755837225, "loss": 0.31232643127441406, "step": 17840 }, { "ce_loss": 0.02223125286400318, "epoch": 5.950633755837225, "step": 17840 }, { "distill_loss": 0.11335283517837524, "epoch": 5.950633755837225, "step": 17840 }, { "epoch": 5.950633755837225, "ref_ce_loss": 0.054550062865018845, "step": 17840 }, { "epoch": 5.950633755837225, "loss": 0.3241339325904846, "step": 17840 }, { "ce_loss": 0.08912857621908188, "epoch": 5.950633755837225, "step": 17840 }, { "distill_loss": 0.11383871734142303, "epoch": 5.950633755837225, "step": 17840 }, { "epoch": 5.950633755837225, "ref_ce_loss": 0.06944772601127625, "step": 17840 }, { "epoch": 5.95396931287525, "loss": 0.3676, "step": 17850 }, { "epoch": 5.95396931287525, "grad_norm": 2.149252414703369, "step": 17850 }, { "epoch": 5.95396931287525, "learning_rate": 4.858516165097836e-05, "step": 17850 }, { "epoch": 5.95396931287525, "loss": 0.47968047857284546, "step": 17850 }, { "ce_loss": 0.12607181072235107, "epoch": 5.95396931287525, "step": 17850 }, { "distill_loss": 0.13286791741847992, "epoch": 5.95396931287525, "step": 17850 }, { "epoch": 5.95396931287525, "ref_ce_loss": 0.05671996250748634, "step": 17850 }, { "epoch": 5.95396931287525, "loss": 0.3840937316417694, "step": 17850 }, { "ce_loss": 0.10201926529407501, "epoch": 5.95396931287525, "step": 17850 }, { "distill_loss": 0.14407148957252502, "epoch": 5.95396931287525, "step": 17850 }, { "epoch": 5.95396931287525, "ref_ce_loss": 0.08624617010354996, "step": 17850 }, { "epoch": 5.95396931287525, "loss": 0.43116295337677, "step": 17850 }, { "ce_loss": 0.09212303906679153, "epoch": 5.95396931287525, "step": 17850 }, { "distill_loss": 0.11729922145605087, "epoch": 5.95396931287525, "step": 17850 }, { "epoch": 5.95396931287525, "ref_ce_loss": 0.08468250930309296, "step": 17850 }, { "epoch": 5.95396931287525, "loss": 0.47167813777923584, "step": 17850 }, { "ce_loss": 0.15872035920619965, "epoch": 5.95396931287525, "step": 17850 }, { "distill_loss": 0.19481000304222107, "epoch": 5.95396931287525, "step": 17850 }, { "epoch": 5.95396931287525, "ref_ce_loss": 0.09276537597179413, "step": 17850 }, { "epoch": 5.957304869913275, "loss": 0.391, "step": 17860 }, { "epoch": 5.957304869913275, "grad_norm": 2.8126001358032227, "step": 17860 }, { "epoch": 5.957304869913275, "learning_rate": 4.8436004708731636e-05, "step": 17860 }, { "epoch": 5.957304869913275, "loss": 0.1991313099861145, "step": 17860 }, { "ce_loss": 0.042863406240940094, "epoch": 5.957304869913275, "step": 17860 }, { "distill_loss": 0.09574732929468155, "epoch": 5.957304869913275, "step": 17860 }, { "epoch": 5.957304869913275, "ref_ce_loss": 0.046636320650577545, "step": 17860 }, { "epoch": 5.957304869913275, "loss": 0.34296974539756775, "step": 17860 }, { "ce_loss": 0.08897802978754044, "epoch": 5.957304869913275, "step": 17860 }, { "distill_loss": 0.13395370543003082, "epoch": 5.957304869913275, "step": 17860 }, { "epoch": 5.957304869913275, "ref_ce_loss": 0.0635993480682373, "step": 17860 }, { "epoch": 5.957304869913275, "loss": 0.47116437554359436, "step": 17860 }, { "ce_loss": 0.12424060702323914, "epoch": 5.957304869913275, "step": 17860 }, { "distill_loss": 0.2328859567642212, "epoch": 5.957304869913275, "step": 17860 }, { "epoch": 5.957304869913275, "ref_ce_loss": 0.11392879486083984, "step": 17860 }, { "epoch": 5.957304869913275, "loss": 0.37671905755996704, "step": 17860 }, { "ce_loss": 0.12620703876018524, "epoch": 5.957304869913275, "step": 17860 }, { "distill_loss": 0.15510450303554535, "epoch": 5.957304869913275, "step": 17860 }, { "epoch": 5.957304869913275, "ref_ce_loss": 0.0951317846775055, "step": 17860 }, { "epoch": 5.960640426951301, "loss": 0.3679, "step": 17870 }, { "epoch": 5.960640426951301, "grad_norm": 2.2110021114349365, "step": 17870 }, { "epoch": 5.960640426951301, "learning_rate": 4.8287032979148635e-05, "step": 17870 }, { "epoch": 5.960640426951301, "loss": 0.4523363411426544, "step": 17870 }, { "ce_loss": 0.11198491603136063, "epoch": 5.960640426951301, "step": 17870 }, { "distill_loss": 0.15241265296936035, "epoch": 5.960640426951301, "step": 17870 }, { "epoch": 5.960640426951301, "ref_ce_loss": 0.07542113214731216, "step": 17870 }, { "epoch": 5.960640426951301, "loss": 0.48284369707107544, "step": 17870 }, { "ce_loss": 0.12855391204357147, "epoch": 5.960640426951301, "step": 17870 }, { "distill_loss": 0.21619918942451477, "epoch": 5.960640426951301, "step": 17870 }, { "epoch": 5.960640426951301, "ref_ce_loss": 0.07777365297079086, "step": 17870 }, { "epoch": 5.960640426951301, "loss": 0.3478761613368988, "step": 17870 }, { "ce_loss": 0.0799589678645134, "epoch": 5.960640426951301, "step": 17870 }, { "distill_loss": 0.09983595460653305, "epoch": 5.960640426951301, "step": 17870 }, { "epoch": 5.960640426951301, "ref_ce_loss": 0.07422303408384323, "step": 17870 }, { "epoch": 5.960640426951301, "loss": 0.36284440755844116, "step": 17870 }, { "ce_loss": 0.05180178955197334, "epoch": 5.960640426951301, "step": 17870 }, { "distill_loss": 0.11064992845058441, "epoch": 5.960640426951301, "step": 17870 }, { "epoch": 5.960640426951301, "ref_ce_loss": 0.06782492250204086, "step": 17870 }, { "epoch": 5.963975983989326, "loss": 0.3438, "step": 17880 }, { "epoch": 5.963975983989326, "grad_norm": 2.341897964477539, "step": 17880 }, { "epoch": 5.963975983989326, "learning_rate": 4.8138246733894924e-05, "step": 17880 }, { "epoch": 5.963975983989326, "loss": 0.2747371792793274, "step": 17880 }, { "ce_loss": 0.05753449723124504, "epoch": 5.963975983989326, "step": 17880 }, { "distill_loss": 0.0984141156077385, "epoch": 5.963975983989326, "step": 17880 }, { "epoch": 5.963975983989326, "ref_ce_loss": 0.046176254749298096, "step": 17880 }, { "epoch": 5.963975983989326, "loss": 0.2829621434211731, "step": 17880 }, { "ce_loss": 0.061033353209495544, "epoch": 5.963975983989326, "step": 17880 }, { "distill_loss": 0.12907133996486664, "epoch": 5.963975983989326, "step": 17880 }, { "epoch": 5.963975983989326, "ref_ce_loss": 0.061487022787332535, "step": 17880 }, { "epoch": 5.963975983989326, "loss": 0.3641863465309143, "step": 17880 }, { "ce_loss": 0.01116390060633421, "epoch": 5.963975983989326, "step": 17880 }, { "distill_loss": 0.11722405254840851, "epoch": 5.963975983989326, "step": 17880 }, { "epoch": 5.963975983989326, "ref_ce_loss": 0.06099073961377144, "step": 17880 }, { "epoch": 5.963975983989326, "loss": 0.5954475998878479, "step": 17880 }, { "ce_loss": 0.1188012957572937, "epoch": 5.963975983989326, "step": 17880 }, { "distill_loss": 0.1340787261724472, "epoch": 5.963975983989326, "step": 17880 }, { "epoch": 5.963975983989326, "ref_ce_loss": 0.06728996336460114, "step": 17880 }, { "epoch": 5.967311541027351, "loss": 0.3917, "step": 17890 }, { "epoch": 5.967311541027351, "grad_norm": 3.270078659057617, "step": 17890 }, { "epoch": 5.967311541027351, "learning_rate": 4.798964624429801e-05, "step": 17890 }, { "epoch": 5.967311541027351, "loss": 0.3070351481437683, "step": 17890 }, { "ce_loss": 0.02258501574397087, "epoch": 5.967311541027351, "step": 17890 }, { "distill_loss": 0.20282799005508423, "epoch": 5.967311541027351, "step": 17890 }, { "epoch": 5.967311541027351, "ref_ce_loss": 0.05928156152367592, "step": 17890 }, { "epoch": 5.967311541027351, "loss": 0.37176916003227234, "step": 17890 }, { "ce_loss": 0.07661992311477661, "epoch": 5.967311541027351, "step": 17890 }, { "distill_loss": 0.17923808097839355, "epoch": 5.967311541027351, "step": 17890 }, { "epoch": 5.967311541027351, "ref_ce_loss": 0.07780885696411133, "step": 17890 }, { "epoch": 5.967311541027351, "loss": 0.26655101776123047, "step": 17890 }, { "ce_loss": 0.04947191849350929, "epoch": 5.967311541027351, "step": 17890 }, { "distill_loss": 0.11034003645181656, "epoch": 5.967311541027351, "step": 17890 }, { "epoch": 5.967311541027351, "ref_ce_loss": 0.0893992930650711, "step": 17890 }, { "epoch": 5.967311541027351, "loss": 0.37074536085128784, "step": 17890 }, { "ce_loss": 0.08786562830209732, "epoch": 5.967311541027351, "step": 17890 }, { "distill_loss": 0.1649412363767624, "epoch": 5.967311541027351, "step": 17890 }, { "epoch": 5.967311541027351, "ref_ce_loss": 0.07530762255191803, "step": 17890 }, { "epoch": 5.970647098065377, "loss": 0.3688, "step": 17900 }, { "epoch": 5.970647098065377, "grad_norm": 21.496910095214844, "step": 17900 }, { "epoch": 5.970647098065377, "learning_rate": 4.784123178134653e-05, "step": 17900 }, { "epoch": 5.970647098065377, "loss": 0.4974766969680786, "step": 17900 }, { "ce_loss": 0.022632336243987083, "epoch": 5.970647098065377, "step": 17900 }, { "distill_loss": 0.1455262154340744, "epoch": 5.970647098065377, "step": 17900 }, { "epoch": 5.970647098065377, "ref_ce_loss": 0.12247674912214279, "step": 17900 }, { "epoch": 5.970647098065377, "loss": 1.078402042388916, "step": 17900 }, { "ce_loss": 0.1272539347410202, "epoch": 5.970647098065377, "step": 17900 }, { "distill_loss": 0.21468423306941986, "epoch": 5.970647098065377, "step": 17900 }, { "epoch": 5.970647098065377, "ref_ce_loss": 0.08153967559337616, "step": 17900 }, { "epoch": 5.970647098065377, "loss": 0.2720794677734375, "step": 17900 }, { "ce_loss": 0.05145162716507912, "epoch": 5.970647098065377, "step": 17900 }, { "distill_loss": 0.1308475136756897, "epoch": 5.970647098065377, "step": 17900 }, { "epoch": 5.970647098065377, "ref_ce_loss": 0.06132218986749649, "step": 17900 }, { "epoch": 5.970647098065377, "loss": 0.4849904179573059, "step": 17900 }, { "ce_loss": 0.03414897248148918, "epoch": 5.970647098065377, "step": 17900 }, { "distill_loss": 0.18564105033874512, "epoch": 5.970647098065377, "step": 17900 }, { "epoch": 5.970647098065377, "ref_ce_loss": 0.07281206548213959, "step": 17900 }, { "epoch": 5.973982655103402, "loss": 0.3864, "step": 17910 }, { "epoch": 5.973982655103402, "grad_norm": 2.804548501968384, "step": 17910 }, { "epoch": 5.973982655103402, "learning_rate": 4.769300361568994e-05, "step": 17910 }, { "epoch": 5.973982655103402, "loss": 0.31776753067970276, "step": 17910 }, { "ce_loss": 0.0540342815220356, "epoch": 5.973982655103402, "step": 17910 }, { "distill_loss": 0.11750628054141998, "epoch": 5.973982655103402, "step": 17910 }, { "epoch": 5.973982655103402, "ref_ce_loss": 0.08454766869544983, "step": 17910 }, { "epoch": 5.973982655103402, "loss": 0.4837797284126282, "step": 17910 }, { "ce_loss": 0.06660284101963043, "epoch": 5.973982655103402, "step": 17910 }, { "distill_loss": 0.2543158233165741, "epoch": 5.973982655103402, "step": 17910 }, { "epoch": 5.973982655103402, "ref_ce_loss": 0.06508289277553558, "step": 17910 }, { "epoch": 5.973982655103402, "loss": 0.2823173999786377, "step": 17910 }, { "ce_loss": 0.0301041379570961, "epoch": 5.973982655103402, "step": 17910 }, { "distill_loss": 0.14463257789611816, "epoch": 5.973982655103402, "step": 17910 }, { "epoch": 5.973982655103402, "ref_ce_loss": 0.053473278880119324, "step": 17910 }, { "epoch": 5.973982655103402, "loss": 0.2954986095428467, "step": 17910 }, { "ce_loss": 0.07780736684799194, "epoch": 5.973982655103402, "step": 17910 }, { "distill_loss": 0.11172517389059067, "epoch": 5.973982655103402, "step": 17910 }, { "epoch": 5.973982655103402, "ref_ce_loss": 0.07242323458194733, "step": 17910 }, { "epoch": 5.9773182121414274, "loss": 0.3964, "step": 17920 }, { "epoch": 5.9773182121414274, "grad_norm": 2.323547601699829, "step": 17920 }, { "epoch": 5.9773182121414274, "learning_rate": 4.7544962017638e-05, "step": 17920 }, { "epoch": 5.9773182121414274, "loss": 0.725130558013916, "step": 17920 }, { "ce_loss": 0.16385790705680847, "epoch": 5.9773182121414274, "step": 17920 }, { "distill_loss": 0.21144914627075195, "epoch": 5.9773182121414274, "step": 17920 }, { "epoch": 5.9773182121414274, "ref_ce_loss": 0.13433006405830383, "step": 17920 }, { "epoch": 5.9773182121414274, "loss": 0.720917820930481, "step": 17920 }, { "ce_loss": 0.058357104659080505, "epoch": 5.9773182121414274, "step": 17920 }, { "distill_loss": 0.21775387227535248, "epoch": 5.9773182121414274, "step": 17920 }, { "epoch": 5.9773182121414274, "ref_ce_loss": 0.13213202357292175, "step": 17920 }, { "epoch": 5.9773182121414274, "loss": 0.3602050542831421, "step": 17920 }, { "ce_loss": 0.019712118431925774, "epoch": 5.9773182121414274, "step": 17920 }, { "distill_loss": 0.08597089350223541, "epoch": 5.9773182121414274, "step": 17920 }, { "epoch": 5.9773182121414274, "ref_ce_loss": 0.06011264771223068, "step": 17920 }, { "epoch": 5.9773182121414274, "loss": 0.24541306495666504, "step": 17920 }, { "ce_loss": 0.056333962827920914, "epoch": 5.9773182121414274, "step": 17920 }, { "distill_loss": 0.12353299558162689, "epoch": 5.9773182121414274, "step": 17920 }, { "epoch": 5.9773182121414274, "ref_ce_loss": 0.05609600991010666, "step": 17920 }, { "epoch": 5.980653769179453, "loss": 0.3587, "step": 17930 }, { "epoch": 5.980653769179453, "grad_norm": 2.24729323387146, "step": 17930 }, { "epoch": 5.980653769179453, "learning_rate": 4.7397107257160056e-05, "step": 17930 }, { "epoch": 5.980653769179453, "loss": 0.4122750163078308, "step": 17930 }, { "ce_loss": 0.07864921540021896, "epoch": 5.980653769179453, "step": 17930 }, { "distill_loss": 0.20610211789608002, "epoch": 5.980653769179453, "step": 17930 }, { "epoch": 5.980653769179453, "ref_ce_loss": 0.07581673562526703, "step": 17930 }, { "epoch": 5.980653769179453, "loss": 0.2803608775138855, "step": 17930 }, { "ce_loss": 0.053923893719911575, "epoch": 5.980653769179453, "step": 17930 }, { "distill_loss": 0.14501123130321503, "epoch": 5.980653769179453, "step": 17930 }, { "epoch": 5.980653769179453, "ref_ce_loss": 0.08128301799297333, "step": 17930 }, { "epoch": 5.980653769179453, "loss": 0.3348112106323242, "step": 17930 }, { "ce_loss": 0.05133431404829025, "epoch": 5.980653769179453, "step": 17930 }, { "distill_loss": 0.1559540331363678, "epoch": 5.980653769179453, "step": 17930 }, { "epoch": 5.980653769179453, "ref_ce_loss": 0.06095059961080551, "step": 17930 }, { "epoch": 5.980653769179453, "loss": 0.6578578352928162, "step": 17930 }, { "ce_loss": 0.026385093107819557, "epoch": 5.980653769179453, "step": 17930 }, { "distill_loss": 0.13736410439014435, "epoch": 5.980653769179453, "step": 17930 }, { "epoch": 5.980653769179453, "ref_ce_loss": 0.06968227028846741, "step": 17930 }, { "epoch": 5.983989326217478, "loss": 0.3642, "step": 17940 }, { "epoch": 5.983989326217478, "grad_norm": 2.0110793113708496, "step": 17940 }, { "epoch": 5.983989326217478, "learning_rate": 4.724943960388499e-05, "step": 17940 }, { "epoch": 5.983989326217478, "loss": 0.3668304681777954, "step": 17940 }, { "ce_loss": 0.08051525801420212, "epoch": 5.983989326217478, "step": 17940 }, { "distill_loss": 0.17716766893863678, "epoch": 5.983989326217478, "step": 17940 }, { "epoch": 5.983989326217478, "ref_ce_loss": 0.07459532469511032, "step": 17940 }, { "epoch": 5.983989326217478, "loss": 0.3955934941768646, "step": 17940 }, { "ce_loss": 0.007330432068556547, "epoch": 5.983989326217478, "step": 17940 }, { "distill_loss": 0.1856948286294937, "epoch": 5.983989326217478, "step": 17940 }, { "epoch": 5.983989326217478, "ref_ce_loss": 0.08589158952236176, "step": 17940 }, { "epoch": 5.983989326217478, "loss": 0.4008892774581909, "step": 17940 }, { "ce_loss": 0.12865528464317322, "epoch": 5.983989326217478, "step": 17940 }, { "distill_loss": 0.17550432682037354, "epoch": 5.983989326217478, "step": 17940 }, { "epoch": 5.983989326217478, "ref_ce_loss": 0.07676687091588974, "step": 17940 }, { "epoch": 5.983989326217478, "loss": 0.5108517408370972, "step": 17940 }, { "ce_loss": 0.05577657371759415, "epoch": 5.983989326217478, "step": 17940 }, { "distill_loss": 0.29523903131484985, "epoch": 5.983989326217478, "step": 17940 }, { "epoch": 5.983989326217478, "ref_ce_loss": 0.07486500591039658, "step": 17940 }, { "epoch": 5.9873248832555035, "loss": 0.411, "step": 17950 }, { "epoch": 5.9873248832555035, "grad_norm": 5.392819881439209, "step": 17950 }, { "epoch": 5.9873248832555035, "learning_rate": 4.7101959327100216e-05, "step": 17950 }, { "epoch": 5.9873248832555035, "loss": 0.29439449310302734, "step": 17950 }, { "ce_loss": 0.05010393634438515, "epoch": 5.9873248832555035, "step": 17950 }, { "distill_loss": 0.1031777560710907, "epoch": 5.9873248832555035, "step": 17950 }, { "epoch": 5.9873248832555035, "ref_ce_loss": 0.054850682616233826, "step": 17950 }, { "epoch": 5.9873248832555035, "loss": 0.5090524554252625, "step": 17950 }, { "ce_loss": 0.11068634688854218, "epoch": 5.9873248832555035, "step": 17950 }, { "distill_loss": 0.11719664931297302, "epoch": 5.9873248832555035, "step": 17950 }, { "epoch": 5.9873248832555035, "ref_ce_loss": 0.06820139288902283, "step": 17950 }, { "epoch": 5.9873248832555035, "loss": 0.6417162418365479, "step": 17950 }, { "ce_loss": 0.05354602634906769, "epoch": 5.9873248832555035, "step": 17950 }, { "distill_loss": 0.11590472608804703, "epoch": 5.9873248832555035, "step": 17950 }, { "epoch": 5.9873248832555035, "ref_ce_loss": 0.08205898851156235, "step": 17950 }, { "epoch": 5.9873248832555035, "loss": 0.22045600414276123, "step": 17950 }, { "ce_loss": 0.03451256453990936, "epoch": 5.9873248832555035, "step": 17950 }, { "distill_loss": 0.1123829036951065, "epoch": 5.9873248832555035, "step": 17950 }, { "epoch": 5.9873248832555035, "ref_ce_loss": 0.04432832822203636, "step": 17950 }, { "epoch": 5.990660440293529, "loss": 0.3739, "step": 17960 }, { "epoch": 5.990660440293529, "grad_norm": 3.4776222705841064, "step": 17960 }, { "epoch": 5.990660440293529, "learning_rate": 4.6954666695751704e-05, "step": 17960 }, { "epoch": 5.990660440293529, "loss": 0.3324894607067108, "step": 17960 }, { "ce_loss": 0.04826672747731209, "epoch": 5.990660440293529, "step": 17960 }, { "distill_loss": 0.11226597428321838, "epoch": 5.990660440293529, "step": 17960 }, { "epoch": 5.990660440293529, "ref_ce_loss": 0.07892588526010513, "step": 17960 }, { "epoch": 5.990660440293529, "loss": 0.36604809761047363, "step": 17960 }, { "ce_loss": 0.05619461089372635, "epoch": 5.990660440293529, "step": 17960 }, { "distill_loss": 0.1991359293460846, "epoch": 5.990660440293529, "step": 17960 }, { "epoch": 5.990660440293529, "ref_ce_loss": 0.0792030394077301, "step": 17960 }, { "epoch": 5.990660440293529, "loss": 0.33557814359664917, "step": 17960 }, { "ce_loss": 0.05750252306461334, "epoch": 5.990660440293529, "step": 17960 }, { "distill_loss": 0.1858070194721222, "epoch": 5.990660440293529, "step": 17960 }, { "epoch": 5.990660440293529, "ref_ce_loss": 0.06949331611394882, "step": 17960 }, { "epoch": 5.990660440293529, "loss": 0.27118319272994995, "step": 17960 }, { "ce_loss": 0.04408452659845352, "epoch": 5.990660440293529, "step": 17960 }, { "distill_loss": 0.1736617088317871, "epoch": 5.990660440293529, "step": 17960 }, { "epoch": 5.990660440293529, "ref_ce_loss": 0.05310777202248573, "step": 17960 }, { "epoch": 5.993995997331554, "loss": 0.3675, "step": 17970 }, { "epoch": 5.993995997331554, "grad_norm": 2.418032169342041, "step": 17970 }, { "epoch": 5.993995997331554, "learning_rate": 4.680756197844311e-05, "step": 17970 }, { "epoch": 5.993995997331554, "loss": 0.2190331071615219, "step": 17970 }, { "ce_loss": 0.04840138927102089, "epoch": 5.993995997331554, "step": 17970 }, { "distill_loss": 0.10126939415931702, "epoch": 5.993995997331554, "step": 17970 }, { "epoch": 5.993995997331554, "ref_ce_loss": 0.046917498111724854, "step": 17970 }, { "epoch": 5.993995997331554, "loss": 0.3270896077156067, "step": 17970 }, { "ce_loss": 0.04233918339014053, "epoch": 5.993995997331554, "step": 17970 }, { "distill_loss": 0.18095675110816956, "epoch": 5.993995997331554, "step": 17970 }, { "epoch": 5.993995997331554, "ref_ce_loss": 0.07469739764928818, "step": 17970 }, { "epoch": 5.993995997331554, "loss": 0.30097100138664246, "step": 17970 }, { "ce_loss": 0.04831115901470184, "epoch": 5.993995997331554, "step": 17970 }, { "distill_loss": 0.13550861179828644, "epoch": 5.993995997331554, "step": 17970 }, { "epoch": 5.993995997331554, "ref_ce_loss": 0.08458152413368225, "step": 17970 }, { "epoch": 5.993995997331554, "loss": 0.3853110671043396, "step": 17970 }, { "ce_loss": 0.10421506315469742, "epoch": 5.993995997331554, "step": 17970 }, { "distill_loss": 0.21022917330265045, "epoch": 5.993995997331554, "step": 17970 }, { "epoch": 5.993995997331554, "ref_ce_loss": 0.053354986011981964, "step": 17970 }, { "epoch": 5.9973315543695795, "loss": 0.3617, "step": 17980 }, { "epoch": 5.9973315543695795, "grad_norm": 2.6923153400421143, "step": 17980 }, { "epoch": 5.9973315543695795, "learning_rate": 4.666064544343535e-05, "step": 17980 }, { "epoch": 5.9973315543695795, "loss": 0.5322884321212769, "step": 17980 }, { "ce_loss": 0.07125137001276016, "epoch": 5.9973315543695795, "step": 17980 }, { "distill_loss": 0.2436763346195221, "epoch": 5.9973315543695795, "step": 17980 }, { "epoch": 5.9973315543695795, "ref_ce_loss": 0.10013176500797272, "step": 17980 }, { "epoch": 5.9973315543695795, "loss": 0.32280588150024414, "step": 17980 }, { "ce_loss": 0.08425823599100113, "epoch": 5.9973315543695795, "step": 17980 }, { "distill_loss": 0.12138201296329498, "epoch": 5.9973315543695795, "step": 17980 }, { "epoch": 5.9973315543695795, "ref_ce_loss": 0.0700070708990097, "step": 17980 }, { "epoch": 5.9973315543695795, "loss": 0.15443341434001923, "step": 17980 }, { "ce_loss": 0.008114497177302837, "epoch": 5.9973315543695795, "step": 17980 }, { "distill_loss": 0.09282103925943375, "epoch": 5.9973315543695795, "step": 17980 }, { "epoch": 5.9973315543695795, "ref_ce_loss": 0.053100306540727615, "step": 17980 }, { "epoch": 5.9973315543695795, "loss": 0.35013052821159363, "step": 17980 }, { "ce_loss": 0.09281091392040253, "epoch": 5.9973315543695795, "step": 17980 }, { "distill_loss": 0.13264884054660797, "epoch": 5.9973315543695795, "step": 17980 }, { "epoch": 5.9973315543695795, "ref_ce_loss": 0.08823098242282867, "step": 17980 }, { "epoch": 6.000667111407605, "loss": 0.3923, "step": 17990 }, { "epoch": 6.000667111407605, "grad_norm": 3.768995523452759, "step": 17990 }, { "epoch": 6.000667111407605, "learning_rate": 4.651391735864629e-05, "step": 17990 }, { "epoch": 6.000667111407605, "loss": 0.35800161957740784, "step": 17990 }, { "ce_loss": 0.08932358771562576, "epoch": 6.000667111407605, "step": 17990 }, { "distill_loss": 0.13139638304710388, "epoch": 6.000667111407605, "step": 17990 }, { "epoch": 6.000667111407605, "ref_ce_loss": 0.06785690784454346, "step": 17990 }, { "epoch": 6.000667111407605, "loss": 0.2316976636648178, "step": 17990 }, { "ce_loss": 0.03572269156575203, "epoch": 6.000667111407605, "step": 17990 }, { "distill_loss": 0.13418623805046082, "epoch": 6.000667111407605, "step": 17990 }, { "epoch": 6.000667111407605, "ref_ce_loss": 0.061637792736291885, "step": 17990 }, { "epoch": 6.000667111407605, "loss": 0.31546854972839355, "step": 17990 }, { "ce_loss": 0.026578150689601898, "epoch": 6.000667111407605, "step": 17990 }, { "distill_loss": 0.112409807741642, "epoch": 6.000667111407605, "step": 17990 }, { "epoch": 6.000667111407605, "ref_ce_loss": 0.07530684024095535, "step": 17990 }, { "epoch": 6.000667111407605, "loss": 0.2855393886566162, "step": 17990 }, { "ce_loss": 0.017660409212112427, "epoch": 6.000667111407605, "step": 17990 }, { "distill_loss": 0.09330619871616364, "epoch": 6.000667111407605, "step": 17990 }, { "epoch": 6.000667111407605, "ref_ce_loss": 0.044164739549160004, "step": 17990 }, { "epoch": 6.00400266844563, "loss": 0.3248, "step": 18000 }, { "epoch": 6.00400266844563, "grad_norm": 2.2335541248321533, "step": 18000 }, { "epoch": 6.00400266844563, "learning_rate": 4.636737799164998e-05, "step": 18000 }, { "epoch": 6.00400266844563, "loss": 0.19867165386676788, "step": 18000 }, { "ce_loss": 0.03791828081011772, "epoch": 6.00400266844563, "step": 18000 }, { "distill_loss": 0.09197042137384415, "epoch": 6.00400266844563, "step": 18000 }, { "epoch": 6.00400266844563, "ref_ce_loss": 0.03162221238017082, "step": 18000 }, { "epoch": 6.00400266844563, "loss": 0.3159444332122803, "step": 18000 }, { "ce_loss": 0.04408169165253639, "epoch": 6.00400266844563, "step": 18000 }, { "distill_loss": 0.13018789887428284, "epoch": 6.00400266844563, "step": 18000 }, { "epoch": 6.00400266844563, "ref_ce_loss": 0.07600803673267365, "step": 18000 }, { "epoch": 6.00400266844563, "loss": 0.24588634073734283, "step": 18000 }, { "ce_loss": 0.02073906734585762, "epoch": 6.00400266844563, "step": 18000 }, { "distill_loss": 0.11770112812519073, "epoch": 6.00400266844563, "step": 18000 }, { "epoch": 6.00400266844563, "ref_ce_loss": 0.028640158474445343, "step": 18000 }, { "epoch": 6.00400266844563, "loss": 0.5873534679412842, "step": 18000 }, { "ce_loss": 0.1087309941649437, "epoch": 6.00400266844563, "step": 18000 }, { "distill_loss": 0.14516711235046387, "epoch": 6.00400266844563, "step": 18000 }, { "epoch": 6.00400266844563, "ref_ce_loss": 0.09350698441267014, "step": 18000 }, { "epoch": 6.007338225483656, "loss": 0.3314, "step": 18010 }, { "epoch": 6.007338225483656, "grad_norm": 3.0365161895751953, "step": 18010 }, { "epoch": 6.007338225483656, "learning_rate": 4.622102760967644e-05, "step": 18010 }, { "epoch": 6.007338225483656, "loss": 0.24367427825927734, "step": 18010 }, { "ce_loss": 0.03099408745765686, "epoch": 6.007338225483656, "step": 18010 }, { "distill_loss": 0.11708693206310272, "epoch": 6.007338225483656, "step": 18010 }, { "epoch": 6.007338225483656, "ref_ce_loss": 0.058985915035009384, "step": 18010 }, { "epoch": 6.007338225483656, "loss": 0.24253971874713898, "step": 18010 }, { "ce_loss": 0.027155818417668343, "epoch": 6.007338225483656, "step": 18010 }, { "distill_loss": 0.14921477437019348, "epoch": 6.007338225483656, "step": 18010 }, { "epoch": 6.007338225483656, "ref_ce_loss": 0.025843849405646324, "step": 18010 }, { "epoch": 6.007338225483656, "loss": 0.2067640721797943, "step": 18010 }, { "ce_loss": 0.018789952620863914, "epoch": 6.007338225483656, "step": 18010 }, { "distill_loss": 0.13064438104629517, "epoch": 6.007338225483656, "step": 18010 }, { "epoch": 6.007338225483656, "ref_ce_loss": 0.03466375172138214, "step": 18010 }, { "epoch": 6.007338225483656, "loss": 0.3233734369277954, "step": 18010 }, { "ce_loss": 0.11963633447885513, "epoch": 6.007338225483656, "step": 18010 }, { "distill_loss": 0.16788633167743683, "epoch": 6.007338225483656, "step": 18010 }, { "epoch": 6.007338225483656, "ref_ce_loss": 0.035782862454652786, "step": 18010 }, { "epoch": 6.010673782521681, "loss": 0.3234, "step": 18020 }, { "epoch": 6.010673782521681, "grad_norm": 4.069089889526367, "step": 18020 }, { "epoch": 6.010673782521681, "learning_rate": 4.607486647961117e-05, "step": 18020 }, { "epoch": 6.010673782521681, "loss": 0.36928215622901917, "step": 18020 }, { "ce_loss": 0.02707325853407383, "epoch": 6.010673782521681, "step": 18020 }, { "distill_loss": 0.1534077674150467, "epoch": 6.010673782521681, "step": 18020 }, { "epoch": 6.010673782521681, "ref_ce_loss": 0.07559659332036972, "step": 18020 }, { "epoch": 6.010673782521681, "loss": 0.2386367917060852, "step": 18020 }, { "ce_loss": 0.03612204268574715, "epoch": 6.010673782521681, "step": 18020 }, { "distill_loss": 0.11690287292003632, "epoch": 6.010673782521681, "step": 18020 }, { "epoch": 6.010673782521681, "ref_ce_loss": 0.060044534504413605, "step": 18020 }, { "epoch": 6.010673782521681, "loss": 0.20590835809707642, "step": 18020 }, { "ce_loss": 0.05326259881258011, "epoch": 6.010673782521681, "step": 18020 }, { "distill_loss": 0.1046588271856308, "epoch": 6.010673782521681, "step": 18020 }, { "epoch": 6.010673782521681, "ref_ce_loss": 0.03534253314137459, "step": 18020 }, { "epoch": 6.010673782521681, "loss": 0.2644188106060028, "step": 18020 }, { "ce_loss": 0.02832714095711708, "epoch": 6.010673782521681, "step": 18020 }, { "distill_loss": 0.16729630529880524, "epoch": 6.010673782521681, "step": 18020 }, { "epoch": 6.010673782521681, "ref_ce_loss": 0.03283698111772537, "step": 18020 }, { "epoch": 6.014009339559706, "loss": 0.3222, "step": 18030 }, { "epoch": 6.014009339559706, "grad_norm": 2.2713472843170166, "step": 18030 }, { "epoch": 6.014009339559706, "learning_rate": 4.592889486799428e-05, "step": 18030 }, { "epoch": 6.014009339559706, "loss": 0.3372799754142761, "step": 18030 }, { "ce_loss": 0.0459248311817646, "epoch": 6.014009339559706, "step": 18030 }, { "distill_loss": 0.20176801085472107, "epoch": 6.014009339559706, "step": 18030 }, { "epoch": 6.014009339559706, "ref_ce_loss": 0.06285598129034042, "step": 18030 }, { "epoch": 6.014009339559706, "loss": 0.28023040294647217, "step": 18030 }, { "ce_loss": 0.057125724852085114, "epoch": 6.014009339559706, "step": 18030 }, { "distill_loss": 0.15166716277599335, "epoch": 6.014009339559706, "step": 18030 }, { "epoch": 6.014009339559706, "ref_ce_loss": 0.07121897488832474, "step": 18030 }, { "epoch": 6.014009339559706, "loss": 0.1802394837141037, "step": 18030 }, { "ce_loss": 0.03540526703000069, "epoch": 6.014009339559706, "step": 18030 }, { "distill_loss": 0.09024612605571747, "epoch": 6.014009339559706, "step": 18030 }, { "epoch": 6.014009339559706, "ref_ce_loss": 0.044617220759391785, "step": 18030 }, { "epoch": 6.014009339559706, "loss": 0.2588875889778137, "step": 18030 }, { "ce_loss": 0.03209130838513374, "epoch": 6.014009339559706, "step": 18030 }, { "distill_loss": 0.12625159323215485, "epoch": 6.014009339559706, "step": 18030 }, { "epoch": 6.014009339559706, "ref_ce_loss": 0.07114759832620621, "step": 18030 }, { "epoch": 6.017344896597732, "loss": 0.3416, "step": 18040 }, { "epoch": 6.017344896597732, "grad_norm": 2.186108350753784, "step": 18040 }, { "epoch": 6.017344896597732, "learning_rate": 4.578311304102043e-05, "step": 18040 }, { "epoch": 6.017344896597732, "loss": 0.17702947556972504, "step": 18040 }, { "ce_loss": 0.03151837736368179, "epoch": 6.017344896597732, "step": 18040 }, { "distill_loss": 0.09358489513397217, "epoch": 6.017344896597732, "step": 18040 }, { "epoch": 6.017344896597732, "ref_ce_loss": 0.051804766058921814, "step": 18040 }, { "epoch": 6.017344896597732, "loss": 0.5507766604423523, "step": 18040 }, { "ce_loss": 0.07824090123176575, "epoch": 6.017344896597732, "step": 18040 }, { "distill_loss": 0.12588661909103394, "epoch": 6.017344896597732, "step": 18040 }, { "epoch": 6.017344896597732, "ref_ce_loss": 0.055321402847766876, "step": 18040 }, { "epoch": 6.017344896597732, "loss": 0.38365840911865234, "step": 18040 }, { "ce_loss": 0.0746174082159996, "epoch": 6.017344896597732, "step": 18040 }, { "distill_loss": 0.183217391371727, "epoch": 6.017344896597732, "step": 18040 }, { "epoch": 6.017344896597732, "ref_ce_loss": 0.06258904933929443, "step": 18040 }, { "epoch": 6.017344896597732, "loss": 0.23493067920207977, "step": 18040 }, { "ce_loss": 0.04061503708362579, "epoch": 6.017344896597732, "step": 18040 }, { "distill_loss": 0.14113374054431915, "epoch": 6.017344896597732, "step": 18040 }, { "epoch": 6.017344896597732, "ref_ce_loss": 0.05288606137037277, "step": 18040 }, { "epoch": 6.020680453635757, "loss": 0.3774, "step": 18050 }, { "epoch": 6.020680453635757, "grad_norm": 2.509274482727051, "step": 18050 }, { "epoch": 6.020680453635757, "learning_rate": 4.5637521264538244e-05, "step": 18050 }, { "epoch": 6.020680453635757, "loss": 0.22896793484687805, "step": 18050 }, { "ce_loss": 0.020151875913143158, "epoch": 6.020680453635757, "step": 18050 }, { "distill_loss": 0.11503048986196518, "epoch": 6.020680453635757, "step": 18050 }, { "epoch": 6.020680453635757, "ref_ce_loss": 0.0390886589884758, "step": 18050 }, { "epoch": 6.020680453635757, "loss": 0.30240610241889954, "step": 18050 }, { "ce_loss": 0.029857726767659187, "epoch": 6.020680453635757, "step": 18050 }, { "distill_loss": 0.2106308937072754, "epoch": 6.020680453635757, "step": 18050 }, { "epoch": 6.020680453635757, "ref_ce_loss": 0.042414914816617966, "step": 18050 }, { "epoch": 6.020680453635757, "loss": 0.3427516222000122, "step": 18050 }, { "ce_loss": 0.14109061658382416, "epoch": 6.020680453635757, "step": 18050 }, { "distill_loss": 0.13355794548988342, "epoch": 6.020680453635757, "step": 18050 }, { "epoch": 6.020680453635757, "ref_ce_loss": 0.0491449311375618, "step": 18050 }, { "epoch": 6.020680453635757, "loss": 0.4403334856033325, "step": 18050 }, { "ce_loss": 0.037846773862838745, "epoch": 6.020680453635757, "step": 18050 }, { "distill_loss": 0.20196445286273956, "epoch": 6.020680453635757, "step": 18050 }, { "epoch": 6.020680453635757, "ref_ce_loss": 0.06973356008529663, "step": 18050 }, { "epoch": 6.024016010673782, "loss": 0.311, "step": 18060 }, { "epoch": 6.024016010673782, "grad_norm": 2.0426642894744873, "step": 18060 }, { "epoch": 6.024016010673782, "learning_rate": 4.549211980404959e-05, "step": 18060 }, { "epoch": 6.024016010673782, "loss": 0.17637385427951813, "step": 18060 }, { "ce_loss": 0.021697957068681717, "epoch": 6.024016010673782, "step": 18060 }, { "distill_loss": 0.11747463792562485, "epoch": 6.024016010673782, "step": 18060 }, { "epoch": 6.024016010673782, "ref_ce_loss": 0.03694509342312813, "step": 18060 }, { "epoch": 6.024016010673782, "loss": 0.273229718208313, "step": 18060 }, { "ce_loss": 0.014125284738838673, "epoch": 6.024016010673782, "step": 18060 }, { "distill_loss": 0.08990029990673065, "epoch": 6.024016010673782, "step": 18060 }, { "epoch": 6.024016010673782, "ref_ce_loss": 0.03305063769221306, "step": 18060 }, { "epoch": 6.024016010673782, "loss": 0.4129598140716553, "step": 18060 }, { "ce_loss": 0.03795570880174637, "epoch": 6.024016010673782, "step": 18060 }, { "distill_loss": 0.14061012864112854, "epoch": 6.024016010673782, "step": 18060 }, { "epoch": 6.024016010673782, "ref_ce_loss": 0.06605387479066849, "step": 18060 }, { "epoch": 6.024016010673782, "loss": 0.5871542692184448, "step": 18060 }, { "ce_loss": 0.06563892215490341, "epoch": 6.024016010673782, "step": 18060 }, { "distill_loss": 0.146104097366333, "epoch": 6.024016010673782, "step": 18060 }, { "epoch": 6.024016010673782, "ref_ce_loss": 0.08084701746702194, "step": 18060 }, { "epoch": 6.027351567711808, "loss": 0.3051, "step": 18070 }, { "epoch": 6.027351567711808, "grad_norm": 1.981549620628357, "step": 18070 }, { "epoch": 6.027351567711808, "learning_rate": 4.534690892470942e-05, "step": 18070 }, { "epoch": 6.027351567711808, "loss": 0.45083296298980713, "step": 18070 }, { "ce_loss": 0.02970081754028797, "epoch": 6.027351567711808, "step": 18070 }, { "distill_loss": 0.10548330098390579, "epoch": 6.027351567711808, "step": 18070 }, { "epoch": 6.027351567711808, "ref_ce_loss": 0.06935655325651169, "step": 18070 }, { "epoch": 6.027351567711808, "loss": 0.3245180547237396, "step": 18070 }, { "ce_loss": 0.05906673148274422, "epoch": 6.027351567711808, "step": 18070 }, { "distill_loss": 0.156429722905159, "epoch": 6.027351567711808, "step": 18070 }, { "epoch": 6.027351567711808, "ref_ce_loss": 0.08586925268173218, "step": 18070 }, { "epoch": 6.027351567711808, "loss": 0.26633402705192566, "step": 18070 }, { "ce_loss": 0.05068255960941315, "epoch": 6.027351567711808, "step": 18070 }, { "distill_loss": 0.13847847282886505, "epoch": 6.027351567711808, "step": 18070 }, { "epoch": 6.027351567711808, "ref_ce_loss": 0.05850810185074806, "step": 18070 }, { "epoch": 6.027351567711808, "loss": 0.38250064849853516, "step": 18070 }, { "ce_loss": 0.036539558321237564, "epoch": 6.027351567711808, "step": 18070 }, { "distill_loss": 0.2482924610376358, "epoch": 6.027351567711808, "step": 18070 }, { "epoch": 6.027351567711808, "ref_ce_loss": 0.06506029516458511, "step": 18070 }, { "epoch": 6.030687124749833, "loss": 0.2887, "step": 18080 }, { "epoch": 6.030687124749833, "grad_norm": 2.2178709506988525, "step": 18080 }, { "epoch": 6.030687124749833, "learning_rate": 4.52018888913251e-05, "step": 18080 }, { "epoch": 6.030687124749833, "loss": 0.41285941004753113, "step": 18080 }, { "ce_loss": 0.10323239117860794, "epoch": 6.030687124749833, "step": 18080 }, { "distill_loss": 0.14549994468688965, "epoch": 6.030687124749833, "step": 18080 }, { "epoch": 6.030687124749833, "ref_ce_loss": 0.051457539200782776, "step": 18080 }, { "epoch": 6.030687124749833, "loss": 0.28614822030067444, "step": 18080 }, { "ce_loss": 0.06471443176269531, "epoch": 6.030687124749833, "step": 18080 }, { "distill_loss": 0.14293956756591797, "epoch": 6.030687124749833, "step": 18080 }, { "epoch": 6.030687124749833, "ref_ce_loss": 0.06065152958035469, "step": 18080 }, { "epoch": 6.030687124749833, "loss": 0.3671509027481079, "step": 18080 }, { "ce_loss": 0.05830796808004379, "epoch": 6.030687124749833, "step": 18080 }, { "distill_loss": 0.13328364491462708, "epoch": 6.030687124749833, "step": 18080 }, { "epoch": 6.030687124749833, "ref_ce_loss": 0.040838442742824554, "step": 18080 }, { "epoch": 6.030687124749833, "loss": 0.18215957283973694, "step": 18080 }, { "ce_loss": 0.030505971983075142, "epoch": 6.030687124749833, "step": 18080 }, { "distill_loss": 0.10520057380199432, "epoch": 6.030687124749833, "step": 18080 }, { "epoch": 6.030687124749833, "ref_ce_loss": 0.03815292939543724, "step": 18080 }, { "epoch": 6.034022681787858, "loss": 0.3066, "step": 18090 }, { "epoch": 6.034022681787858, "grad_norm": 2.430351734161377, "step": 18090 }, { "epoch": 6.034022681787858, "learning_rate": 4.505705996835596e-05, "step": 18090 }, { "epoch": 6.034022681787858, "loss": 0.28383922576904297, "step": 18090 }, { "ce_loss": 0.05320765823125839, "epoch": 6.034022681787858, "step": 18090 }, { "distill_loss": 0.13925553858280182, "epoch": 6.034022681787858, "step": 18090 }, { "epoch": 6.034022681787858, "ref_ce_loss": 0.032751839607954025, "step": 18090 }, { "epoch": 6.034022681787858, "loss": 0.2511315941810608, "step": 18090 }, { "ce_loss": 0.0436200387775898, "epoch": 6.034022681787858, "step": 18090 }, { "distill_loss": 0.11578132212162018, "epoch": 6.034022681787858, "step": 18090 }, { "epoch": 6.034022681787858, "ref_ce_loss": 0.073519267141819, "step": 18090 }, { "epoch": 6.034022681787858, "loss": 0.26703473925590515, "step": 18090 }, { "ce_loss": 0.05209790915250778, "epoch": 6.034022681787858, "step": 18090 }, { "distill_loss": 0.09003359824419022, "epoch": 6.034022681787858, "step": 18090 }, { "epoch": 6.034022681787858, "ref_ce_loss": 0.08476582914590836, "step": 18090 }, { "epoch": 6.034022681787858, "loss": 0.24599985778331757, "step": 18090 }, { "ce_loss": 0.07031101733446121, "epoch": 6.034022681787858, "step": 18090 }, { "distill_loss": 0.10161477327346802, "epoch": 6.034022681787858, "step": 18090 }, { "epoch": 6.034022681787858, "ref_ce_loss": 0.029595762491226196, "step": 18090 }, { "epoch": 6.037358238825884, "loss": 0.3341, "step": 18100 }, { "epoch": 6.037358238825884, "grad_norm": 2.7874391078948975, "step": 18100 }, { "epoch": 6.037358238825884, "learning_rate": 4.491242241991286e-05, "step": 18100 }, { "epoch": 6.037358238825884, "loss": 0.4746611416339874, "step": 18100 }, { "ce_loss": 0.04059432074427605, "epoch": 6.037358238825884, "step": 18100 }, { "distill_loss": 0.12545041739940643, "epoch": 6.037358238825884, "step": 18100 }, { "epoch": 6.037358238825884, "ref_ce_loss": 0.07350048422813416, "step": 18100 }, { "epoch": 6.037358238825884, "loss": 0.29115569591522217, "step": 18100 }, { "ce_loss": 0.043459825217723846, "epoch": 6.037358238825884, "step": 18100 }, { "distill_loss": 0.17026124894618988, "epoch": 6.037358238825884, "step": 18100 }, { "epoch": 6.037358238825884, "ref_ce_loss": 0.05982349440455437, "step": 18100 }, { "epoch": 6.037358238825884, "loss": 0.32593053579330444, "step": 18100 }, { "ce_loss": 0.05972723290324211, "epoch": 6.037358238825884, "step": 18100 }, { "distill_loss": 0.14202600717544556, "epoch": 6.037358238825884, "step": 18100 }, { "epoch": 6.037358238825884, "ref_ce_loss": 0.08605913072824478, "step": 18100 }, { "epoch": 6.037358238825884, "loss": 0.1863568127155304, "step": 18100 }, { "ce_loss": 0.027750154957175255, "epoch": 6.037358238825884, "step": 18100 }, { "distill_loss": 0.08981994539499283, "epoch": 6.037358238825884, "step": 18100 }, { "epoch": 6.037358238825884, "ref_ce_loss": 0.03250458091497421, "step": 18100 }, { "epoch": 6.040693795863909, "loss": 0.3122, "step": 18110 }, { "epoch": 6.040693795863909, "grad_norm": 2.810276746749878, "step": 18110 }, { "epoch": 6.040693795863909, "learning_rate": 4.4767976509757563e-05, "step": 18110 }, { "epoch": 6.040693795863909, "loss": 0.20042987167835236, "step": 18110 }, { "ce_loss": 0.014375542290508747, "epoch": 6.040693795863909, "step": 18110 }, { "distill_loss": 0.11172399669885635, "epoch": 6.040693795863909, "step": 18110 }, { "epoch": 6.040693795863909, "ref_ce_loss": 0.05216660350561142, "step": 18110 }, { "epoch": 6.040693795863909, "loss": 0.23054620623588562, "step": 18110 }, { "ce_loss": 0.020065613090991974, "epoch": 6.040693795863909, "step": 18110 }, { "distill_loss": 0.11909779906272888, "epoch": 6.040693795863909, "step": 18110 }, { "epoch": 6.040693795863909, "ref_ce_loss": 0.05230550840497017, "step": 18110 }, { "epoch": 6.040693795863909, "loss": 0.27268943190574646, "step": 18110 }, { "ce_loss": 0.06317111849784851, "epoch": 6.040693795863909, "step": 18110 }, { "distill_loss": 0.12656812369823456, "epoch": 6.040693795863909, "step": 18110 }, { "epoch": 6.040693795863909, "ref_ce_loss": 0.08271820843219757, "step": 18110 }, { "epoch": 6.040693795863909, "loss": 0.28299564123153687, "step": 18110 }, { "ce_loss": 0.07387886941432953, "epoch": 6.040693795863909, "step": 18110 }, { "distill_loss": 0.1497463583946228, "epoch": 6.040693795863909, "step": 18110 }, { "epoch": 6.040693795863909, "ref_ce_loss": 0.0429847314953804, "step": 18110 }, { "epoch": 6.044029352901934, "loss": 0.3004, "step": 18120 }, { "epoch": 6.044029352901934, "grad_norm": 2.7142744064331055, "step": 18120 }, { "epoch": 6.044029352901934, "learning_rate": 4.462372250130247e-05, "step": 18120 }, { "epoch": 6.044029352901934, "loss": 0.3670087456703186, "step": 18120 }, { "ce_loss": 0.06371357291936874, "epoch": 6.044029352901934, "step": 18120 }, { "distill_loss": 0.1704612523317337, "epoch": 6.044029352901934, "step": 18120 }, { "epoch": 6.044029352901934, "ref_ce_loss": 0.1321641057729721, "step": 18120 }, { "epoch": 6.044029352901934, "loss": 0.2539289891719818, "step": 18120 }, { "ce_loss": 0.047485459595918655, "epoch": 6.044029352901934, "step": 18120 }, { "distill_loss": 0.11303704977035522, "epoch": 6.044029352901934, "step": 18120 }, { "epoch": 6.044029352901934, "ref_ce_loss": 0.06555590778589249, "step": 18120 }, { "epoch": 6.044029352901934, "loss": 0.22901363670825958, "step": 18120 }, { "ce_loss": 0.052790023386478424, "epoch": 6.044029352901934, "step": 18120 }, { "distill_loss": 0.10688269138336182, "epoch": 6.044029352901934, "step": 18120 }, { "epoch": 6.044029352901934, "ref_ce_loss": 0.0471915639936924, "step": 18120 }, { "epoch": 6.044029352901934, "loss": 0.5411761999130249, "step": 18120 }, { "ce_loss": 0.038415033370256424, "epoch": 6.044029352901934, "step": 18120 }, { "distill_loss": 0.08298727869987488, "epoch": 6.044029352901934, "step": 18120 }, { "epoch": 6.044029352901934, "ref_ce_loss": 0.036232996731996536, "step": 18120 }, { "epoch": 6.04736490993996, "loss": 0.3601, "step": 18130 }, { "epoch": 6.04736490993996, "grad_norm": 4.615497589111328, "step": 18130 }, { "epoch": 6.04736490993996, "learning_rate": 4.447966065760997e-05, "step": 18130 }, { "epoch": 6.04736490993996, "loss": 0.33229541778564453, "step": 18130 }, { "ce_loss": 0.046142105013132095, "epoch": 6.04736490993996, "step": 18130 }, { "distill_loss": 0.16430698335170746, "epoch": 6.04736490993996, "step": 18130 }, { "epoch": 6.04736490993996, "ref_ce_loss": 0.0699513629078865, "step": 18130 }, { "epoch": 6.04736490993996, "loss": 0.44384586811065674, "step": 18130 }, { "ce_loss": 0.028619125485420227, "epoch": 6.04736490993996, "step": 18130 }, { "distill_loss": 0.14834138751029968, "epoch": 6.04736490993996, "step": 18130 }, { "epoch": 6.04736490993996, "ref_ce_loss": 0.042568981647491455, "step": 18130 }, { "epoch": 6.04736490993996, "loss": 0.5875935554504395, "step": 18130 }, { "ce_loss": 0.014686590991914272, "epoch": 6.04736490993996, "step": 18130 }, { "distill_loss": 0.12745922803878784, "epoch": 6.04736490993996, "step": 18130 }, { "epoch": 6.04736490993996, "ref_ce_loss": 0.10846404731273651, "step": 18130 }, { "epoch": 6.04736490993996, "loss": 0.2178553193807602, "step": 18130 }, { "ce_loss": 0.05181694030761719, "epoch": 6.04736490993996, "step": 18130 }, { "distill_loss": 0.1011785939335823, "epoch": 6.04736490993996, "step": 18130 }, { "epoch": 6.04736490993996, "ref_ce_loss": 0.04709342122077942, "step": 18130 }, { "epoch": 6.050700466977985, "loss": 0.3242, "step": 18140 }, { "epoch": 6.050700466977985, "grad_norm": 1.946622371673584, "step": 18140 }, { "epoch": 6.050700466977985, "learning_rate": 4.433579124139206e-05, "step": 18140 }, { "epoch": 6.050700466977985, "loss": 0.25335797667503357, "step": 18140 }, { "ce_loss": 0.0676303505897522, "epoch": 6.050700466977985, "step": 18140 }, { "distill_loss": 0.15347588062286377, "epoch": 6.050700466977985, "step": 18140 }, { "epoch": 6.050700466977985, "ref_ce_loss": 0.03215770795941353, "step": 18140 }, { "epoch": 6.050700466977985, "loss": 0.40435728430747986, "step": 18140 }, { "ce_loss": 0.0626855120062828, "epoch": 6.050700466977985, "step": 18140 }, { "distill_loss": 0.1381070762872696, "epoch": 6.050700466977985, "step": 18140 }, { "epoch": 6.050700466977985, "ref_ce_loss": 0.08322049677371979, "step": 18140 }, { "epoch": 6.050700466977985, "loss": 0.3568632900714874, "step": 18140 }, { "ce_loss": 0.11419306695461273, "epoch": 6.050700466977985, "step": 18140 }, { "distill_loss": 0.13917380571365356, "epoch": 6.050700466977985, "step": 18140 }, { "epoch": 6.050700466977985, "ref_ce_loss": 0.07467635720968246, "step": 18140 }, { "epoch": 6.050700466977985, "loss": 0.27939581871032715, "step": 18140 }, { "ce_loss": 0.012311798520386219, "epoch": 6.050700466977985, "step": 18140 }, { "distill_loss": 0.16337209939956665, "epoch": 6.050700466977985, "step": 18140 }, { "epoch": 6.050700466977985, "ref_ce_loss": 0.06114046275615692, "step": 18140 }, { "epoch": 6.0540360240160105, "loss": 0.3313, "step": 18150 }, { "epoch": 6.0540360240160105, "grad_norm": 2.934711217880249, "step": 18150 }, { "epoch": 6.0540360240160105, "learning_rate": 4.419211451500986e-05, "step": 18150 }, { "epoch": 6.0540360240160105, "loss": 0.3649110198020935, "step": 18150 }, { "ce_loss": 0.08198490738868713, "epoch": 6.0540360240160105, "step": 18150 }, { "distill_loss": 0.13583259284496307, "epoch": 6.0540360240160105, "step": 18150 }, { "epoch": 6.0540360240160105, "ref_ce_loss": 0.06769489496946335, "step": 18150 }, { "epoch": 6.0540360240160105, "loss": 0.20053881406784058, "step": 18150 }, { "ce_loss": 0.031547416001558304, "epoch": 6.0540360240160105, "step": 18150 }, { "distill_loss": 0.12106670439243317, "epoch": 6.0540360240160105, "step": 18150 }, { "epoch": 6.0540360240160105, "ref_ce_loss": 0.03251959756016731, "step": 18150 }, { "epoch": 6.0540360240160105, "loss": 0.19803735613822937, "step": 18150 }, { "ce_loss": 0.013310973532497883, "epoch": 6.0540360240160105, "step": 18150 }, { "distill_loss": 0.09719417989253998, "epoch": 6.0540360240160105, "step": 18150 }, { "epoch": 6.0540360240160105, "ref_ce_loss": 0.05349541828036308, "step": 18150 }, { "epoch": 6.0540360240160105, "loss": 0.22628724575042725, "step": 18150 }, { "ce_loss": 0.02451532892882824, "epoch": 6.0540360240160105, "step": 18150 }, { "distill_loss": 0.11430226266384125, "epoch": 6.0540360240160105, "step": 18150 }, { "epoch": 6.0540360240160105, "ref_ce_loss": 0.04020338132977486, "step": 18150 }, { "epoch": 6.057371581054036, "loss": 0.2857, "step": 18160 }, { "epoch": 6.057371581054036, "grad_norm": 2.198643445968628, "step": 18160 }, { "epoch": 6.057371581054036, "learning_rate": 4.4048630740472915e-05, "step": 18160 }, { "epoch": 6.057371581054036, "loss": 0.3034615218639374, "step": 18160 }, { "ce_loss": 0.03148229420185089, "epoch": 6.057371581054036, "step": 18160 }, { "distill_loss": 0.10916534811258316, "epoch": 6.057371581054036, "step": 18160 }, { "epoch": 6.057371581054036, "ref_ce_loss": 0.055936042219400406, "step": 18160 }, { "epoch": 6.057371581054036, "loss": 0.1808771938085556, "step": 18160 }, { "ce_loss": 0.01604457013309002, "epoch": 6.057371581054036, "step": 18160 }, { "distill_loss": 0.09643196314573288, "epoch": 6.057371581054036, "step": 18160 }, { "epoch": 6.057371581054036, "ref_ce_loss": 0.04746682941913605, "step": 18160 }, { "epoch": 6.057371581054036, "loss": 0.3949342370033264, "step": 18160 }, { "ce_loss": 0.07888796180486679, "epoch": 6.057371581054036, "step": 18160 }, { "distill_loss": 0.11194111406803131, "epoch": 6.057371581054036, "step": 18160 }, { "epoch": 6.057371581054036, "ref_ce_loss": 0.06055905297398567, "step": 18160 }, { "epoch": 6.057371581054036, "loss": 0.3000640571117401, "step": 18160 }, { "ce_loss": 0.10470689833164215, "epoch": 6.057371581054036, "step": 18160 }, { "distill_loss": 0.13388051092624664, "epoch": 6.057371581054036, "step": 18160 }, { "epoch": 6.057371581054036, "ref_ce_loss": 0.06086868792772293, "step": 18160 }, { "epoch": 6.060707138092061, "loss": 0.3392, "step": 18170 }, { "epoch": 6.060707138092061, "grad_norm": 2.10493803024292, "step": 18170 }, { "epoch": 6.060707138092061, "learning_rate": 4.390534017943911e-05, "step": 18170 }, { "epoch": 6.060707138092061, "loss": 0.5440735816955566, "step": 18170 }, { "ce_loss": 0.07138556241989136, "epoch": 6.060707138092061, "step": 18170 }, { "distill_loss": 0.15884366631507874, "epoch": 6.060707138092061, "step": 18170 }, { "epoch": 6.060707138092061, "ref_ce_loss": 0.05661080777645111, "step": 18170 }, { "epoch": 6.060707138092061, "loss": 0.46446603536605835, "step": 18170 }, { "ce_loss": 0.05201159417629242, "epoch": 6.060707138092061, "step": 18170 }, { "distill_loss": 0.1514742076396942, "epoch": 6.060707138092061, "step": 18170 }, { "epoch": 6.060707138092061, "ref_ce_loss": 0.042757753282785416, "step": 18170 }, { "epoch": 6.060707138092061, "loss": 0.1995864063501358, "step": 18170 }, { "ce_loss": 0.02640986256301403, "epoch": 6.060707138092061, "step": 18170 }, { "distill_loss": 0.10817944258451462, "epoch": 6.060707138092061, "step": 18170 }, { "epoch": 6.060707138092061, "ref_ce_loss": 0.06455156952142715, "step": 18170 }, { "epoch": 6.060707138092061, "loss": 0.26589059829711914, "step": 18170 }, { "ce_loss": 0.035571422427892685, "epoch": 6.060707138092061, "step": 18170 }, { "distill_loss": 0.1171257495880127, "epoch": 6.060707138092061, "step": 18170 }, { "epoch": 6.060707138092061, "ref_ce_loss": 0.07539810985326767, "step": 18170 }, { "epoch": 6.0640426951300865, "loss": 0.311, "step": 18180 }, { "epoch": 6.0640426951300865, "grad_norm": 2.6534509658813477, "step": 18180 }, { "epoch": 6.0640426951300865, "learning_rate": 4.376224309321388e-05, "step": 18180 }, { "epoch": 6.0640426951300865, "loss": 0.18756568431854248, "step": 18180 }, { "ce_loss": 0.02319115772843361, "epoch": 6.0640426951300865, "step": 18180 }, { "distill_loss": 0.09259458631277084, "epoch": 6.0640426951300865, "step": 18180 }, { "epoch": 6.0640426951300865, "ref_ce_loss": 0.071564681828022, "step": 18180 }, { "epoch": 6.0640426951300865, "loss": 0.36065298318862915, "step": 18180 }, { "ce_loss": 0.10762543231248856, "epoch": 6.0640426951300865, "step": 18180 }, { "distill_loss": 0.13027355074882507, "epoch": 6.0640426951300865, "step": 18180 }, { "epoch": 6.0640426951300865, "ref_ce_loss": 0.05452357977628708, "step": 18180 }, { "epoch": 6.0640426951300865, "loss": 0.26103997230529785, "step": 18180 }, { "ce_loss": 0.08527110517024994, "epoch": 6.0640426951300865, "step": 18180 }, { "distill_loss": 0.09655442833900452, "epoch": 6.0640426951300865, "step": 18180 }, { "epoch": 6.0640426951300865, "ref_ce_loss": 0.05029728263616562, "step": 18180 }, { "epoch": 6.0640426951300865, "loss": 0.4683290123939514, "step": 18180 }, { "ce_loss": 0.03932412713766098, "epoch": 6.0640426951300865, "step": 18180 }, { "distill_loss": 0.09566164761781693, "epoch": 6.0640426951300865, "step": 18180 }, { "epoch": 6.0640426951300865, "ref_ce_loss": 0.036095380783081055, "step": 18180 }, { "epoch": 6.067378252168112, "loss": 0.3035, "step": 18190 }, { "epoch": 6.067378252168112, "grad_norm": 2.610994338989258, "step": 18190 }, { "epoch": 6.067378252168112, "learning_rate": 4.361933974274987e-05, "step": 18190 }, { "epoch": 6.067378252168112, "loss": 0.10692508518695831, "step": 18190 }, { "ce_loss": 0.0038241662550717592, "epoch": 6.067378252168112, "step": 18190 }, { "distill_loss": 0.07537472993135452, "epoch": 6.067378252168112, "step": 18190 }, { "epoch": 6.067378252168112, "ref_ce_loss": 0.02766880951821804, "step": 18190 }, { "epoch": 6.067378252168112, "loss": 0.3644680976867676, "step": 18190 }, { "ce_loss": 0.08397339284420013, "epoch": 6.067378252168112, "step": 18190 }, { "distill_loss": 0.14016035199165344, "epoch": 6.067378252168112, "step": 18190 }, { "epoch": 6.067378252168112, "ref_ce_loss": 0.06778319925069809, "step": 18190 }, { "epoch": 6.067378252168112, "loss": 0.22137466073036194, "step": 18190 }, { "ce_loss": 0.07818184047937393, "epoch": 6.067378252168112, "step": 18190 }, { "distill_loss": 0.0896679237484932, "epoch": 6.067378252168112, "step": 18190 }, { "epoch": 6.067378252168112, "ref_ce_loss": 0.05327446386218071, "step": 18190 }, { "epoch": 6.067378252168112, "loss": 0.3335571885108948, "step": 18190 }, { "ce_loss": 0.04578216373920441, "epoch": 6.067378252168112, "step": 18190 }, { "distill_loss": 0.13960577547550201, "epoch": 6.067378252168112, "step": 18190 }, { "epoch": 6.067378252168112, "ref_ce_loss": 0.06935045123100281, "step": 18190 }, { "epoch": 6.070713809206137, "loss": 0.2786, "step": 18200 }, { "epoch": 6.070713809206137, "grad_norm": 2.146139144897461, "step": 18200 }, { "epoch": 6.070713809206137, "learning_rate": 4.347663038864648e-05, "step": 18200 }, { "epoch": 6.070713809206137, "loss": 0.3073939085006714, "step": 18200 }, { "ce_loss": 0.07068096101284027, "epoch": 6.070713809206137, "step": 18200 }, { "distill_loss": 0.15674972534179688, "epoch": 6.070713809206137, "step": 18200 }, { "epoch": 6.070713809206137, "ref_ce_loss": 0.06139358878135681, "step": 18200 }, { "epoch": 6.070713809206137, "loss": 0.7768551111221313, "step": 18200 }, { "ce_loss": 0.07332314550876617, "epoch": 6.070713809206137, "step": 18200 }, { "distill_loss": 0.17987912893295288, "epoch": 6.070713809206137, "step": 18200 }, { "epoch": 6.070713809206137, "ref_ce_loss": 0.08335179090499878, "step": 18200 }, { "epoch": 6.070713809206137, "loss": 0.24506054818630219, "step": 18200 }, { "ce_loss": 0.01479860208928585, "epoch": 6.070713809206137, "step": 18200 }, { "distill_loss": 0.1127048134803772, "epoch": 6.070713809206137, "step": 18200 }, { "epoch": 6.070713809206137, "ref_ce_loss": 0.06586452573537827, "step": 18200 }, { "epoch": 6.070713809206137, "loss": 0.31514981389045715, "step": 18200 }, { "ce_loss": 0.058193325996398926, "epoch": 6.070713809206137, "step": 18200 }, { "distill_loss": 0.14873111248016357, "epoch": 6.070713809206137, "step": 18200 }, { "epoch": 6.070713809206137, "ref_ce_loss": 0.05019865557551384, "step": 18200 }, { "epoch": 6.074049366244163, "loss": 0.3327, "step": 18210 }, { "epoch": 6.074049366244163, "grad_norm": 3.6657485961914062, "step": 18210 }, { "epoch": 6.074049366244163, "learning_rate": 4.3334115291149154e-05, "step": 18210 }, { "epoch": 6.074049366244163, "loss": 0.21963410079479218, "step": 18210 }, { "ce_loss": 0.029777036979794502, "epoch": 6.074049366244163, "step": 18210 }, { "distill_loss": 0.09637679904699326, "epoch": 6.074049366244163, "step": 18210 }, { "epoch": 6.074049366244163, "ref_ce_loss": 0.042842797935009, "step": 18210 }, { "epoch": 6.074049366244163, "loss": 0.35416334867477417, "step": 18210 }, { "ce_loss": 0.07847097516059875, "epoch": 6.074049366244163, "step": 18210 }, { "distill_loss": 0.22109586000442505, "epoch": 6.074049366244163, "step": 18210 }, { "epoch": 6.074049366244163, "ref_ce_loss": 0.05443606525659561, "step": 18210 }, { "epoch": 6.074049366244163, "loss": 0.23724272847175598, "step": 18210 }, { "ce_loss": 0.02057749032974243, "epoch": 6.074049366244163, "step": 18210 }, { "distill_loss": 0.12848436832427979, "epoch": 6.074049366244163, "step": 18210 }, { "epoch": 6.074049366244163, "ref_ce_loss": 0.060044120997190475, "step": 18210 }, { "epoch": 6.074049366244163, "loss": 0.5802603363990784, "step": 18210 }, { "ce_loss": 0.036357879638671875, "epoch": 6.074049366244163, "step": 18210 }, { "distill_loss": 0.10860496759414673, "epoch": 6.074049366244163, "step": 18210 }, { "epoch": 6.074049366244163, "ref_ce_loss": 0.05457604303956032, "step": 18210 }, { "epoch": 6.077384923282188, "loss": 0.3228, "step": 18220 }, { "epoch": 6.077384923282188, "grad_norm": 2.0046796798706055, "step": 18220 }, { "epoch": 6.077384923282188, "learning_rate": 4.31917947101493e-05, "step": 18220 }, { "epoch": 6.077384923282188, "loss": 0.5089726448059082, "step": 18220 }, { "ce_loss": 0.06141791120171547, "epoch": 6.077384923282188, "step": 18220 }, { "distill_loss": 0.14615316689014435, "epoch": 6.077384923282188, "step": 18220 }, { "epoch": 6.077384923282188, "ref_ce_loss": 0.060697052627801895, "step": 18220 }, { "epoch": 6.077384923282188, "loss": 0.29290029406547546, "step": 18220 }, { "ce_loss": 0.05762191489338875, "epoch": 6.077384923282188, "step": 18220 }, { "distill_loss": 0.13641268014907837, "epoch": 6.077384923282188, "step": 18220 }, { "epoch": 6.077384923282188, "ref_ce_loss": 0.05285864323377609, "step": 18220 }, { "epoch": 6.077384923282188, "loss": 0.16176757216453552, "step": 18220 }, { "ce_loss": 0.011141808703541756, "epoch": 6.077384923282188, "step": 18220 }, { "distill_loss": 0.0926661565899849, "epoch": 6.077384923282188, "step": 18220 }, { "epoch": 6.077384923282188, "ref_ce_loss": 0.03927552327513695, "step": 18220 }, { "epoch": 6.077384923282188, "loss": 0.24812178313732147, "step": 18220 }, { "ce_loss": 0.027461422607302666, "epoch": 6.077384923282188, "step": 18220 }, { "distill_loss": 0.1357075273990631, "epoch": 6.077384923282188, "step": 18220 }, { "epoch": 6.077384923282188, "ref_ce_loss": 0.048979636281728745, "step": 18220 }, { "epoch": 6.080720480320213, "loss": 0.3197, "step": 18230 }, { "epoch": 6.080720480320213, "grad_norm": 2.6882028579711914, "step": 18230 }, { "epoch": 6.080720480320213, "learning_rate": 4.304966890518349e-05, "step": 18230 }, { "epoch": 6.080720480320213, "loss": 0.2826864421367645, "step": 18230 }, { "ce_loss": 0.06585119664669037, "epoch": 6.080720480320213, "step": 18230 }, { "distill_loss": 0.1340772956609726, "epoch": 6.080720480320213, "step": 18230 }, { "epoch": 6.080720480320213, "ref_ce_loss": 0.04505593702197075, "step": 18230 }, { "epoch": 6.080720480320213, "loss": 0.29510295391082764, "step": 18230 }, { "ce_loss": 0.052019789814949036, "epoch": 6.080720480320213, "step": 18230 }, { "distill_loss": 0.15474607050418854, "epoch": 6.080720480320213, "step": 18230 }, { "epoch": 6.080720480320213, "ref_ce_loss": 0.05676916614174843, "step": 18230 }, { "epoch": 6.080720480320213, "loss": 0.24109765887260437, "step": 18230 }, { "ce_loss": 0.007869729772210121, "epoch": 6.080720480320213, "step": 18230 }, { "distill_loss": 0.087680384516716, "epoch": 6.080720480320213, "step": 18230 }, { "epoch": 6.080720480320213, "ref_ce_loss": 0.042009759694337845, "step": 18230 }, { "epoch": 6.080720480320213, "loss": 0.1544315367937088, "step": 18230 }, { "ce_loss": 0.02946949191391468, "epoch": 6.080720480320213, "step": 18230 }, { "distill_loss": 0.09035389125347137, "epoch": 6.080720480320213, "step": 18230 }, { "epoch": 6.080720480320213, "ref_ce_loss": 0.03453512117266655, "step": 18230 }, { "epoch": 6.084056037358239, "loss": 0.3303, "step": 18240 }, { "epoch": 6.084056037358239, "grad_norm": 2.3786280155181885, "step": 18240 }, { "epoch": 6.084056037358239, "learning_rate": 4.290773813543312e-05, "step": 18240 }, { "epoch": 6.084056037358239, "loss": 0.5657137632369995, "step": 18240 }, { "ce_loss": 0.04016527906060219, "epoch": 6.084056037358239, "step": 18240 }, { "distill_loss": 0.09664686024188995, "epoch": 6.084056037358239, "step": 18240 }, { "epoch": 6.084056037358239, "ref_ce_loss": 0.06613688915967941, "step": 18240 }, { "epoch": 6.084056037358239, "loss": 0.4780872166156769, "step": 18240 }, { "ce_loss": 0.08768974244594574, "epoch": 6.084056037358239, "step": 18240 }, { "distill_loss": 0.23951031267642975, "epoch": 6.084056037358239, "step": 18240 }, { "epoch": 6.084056037358239, "ref_ce_loss": 0.0796649381518364, "step": 18240 }, { "epoch": 6.084056037358239, "loss": 0.21369272470474243, "step": 18240 }, { "ce_loss": 0.03242059797048569, "epoch": 6.084056037358239, "step": 18240 }, { "distill_loss": 0.12135210633277893, "epoch": 6.084056037358239, "step": 18240 }, { "epoch": 6.084056037358239, "ref_ce_loss": 0.05987042933702469, "step": 18240 }, { "epoch": 6.084056037358239, "loss": 0.2175336629152298, "step": 18240 }, { "ce_loss": 0.03970927745103836, "epoch": 6.084056037358239, "step": 18240 }, { "distill_loss": 0.1291152983903885, "epoch": 6.084056037358239, "step": 18240 }, { "epoch": 6.084056037358239, "ref_ce_loss": 0.04774701967835426, "step": 18240 }, { "epoch": 6.087391594396264, "loss": 0.3106, "step": 18250 }, { "epoch": 6.087391594396264, "grad_norm": 1.8230851888656616, "step": 18250 }, { "epoch": 6.087391594396264, "learning_rate": 4.2766002659724014e-05, "step": 18250 }, { "epoch": 6.087391594396264, "loss": 0.20345722138881683, "step": 18250 }, { "ce_loss": 0.055368226021528244, "epoch": 6.087391594396264, "step": 18250 }, { "distill_loss": 0.11434651166200638, "epoch": 6.087391594396264, "step": 18250 }, { "epoch": 6.087391594396264, "ref_ce_loss": 0.033585384488105774, "step": 18250 }, { "epoch": 6.087391594396264, "loss": 0.3076462745666504, "step": 18250 }, { "ce_loss": 0.04119785875082016, "epoch": 6.087391594396264, "step": 18250 }, { "distill_loss": 0.1454070806503296, "epoch": 6.087391594396264, "step": 18250 }, { "epoch": 6.087391594396264, "ref_ce_loss": 0.09140428155660629, "step": 18250 }, { "epoch": 6.087391594396264, "loss": 0.2772146463394165, "step": 18250 }, { "ce_loss": 0.04244215041399002, "epoch": 6.087391594396264, "step": 18250 }, { "distill_loss": 0.11346837878227234, "epoch": 6.087391594396264, "step": 18250 }, { "epoch": 6.087391594396264, "ref_ce_loss": 0.08478512614965439, "step": 18250 }, { "epoch": 6.087391594396264, "loss": 0.437461256980896, "step": 18250 }, { "ce_loss": 0.05325673148036003, "epoch": 6.087391594396264, "step": 18250 }, { "distill_loss": 0.12055876851081848, "epoch": 6.087391594396264, "step": 18250 }, { "epoch": 6.087391594396264, "ref_ce_loss": 0.044090624898672104, "step": 18250 }, { "epoch": 6.090727151434289, "loss": 0.2982, "step": 18260 }, { "epoch": 6.090727151434289, "grad_norm": 1.8949027061462402, "step": 18260 }, { "epoch": 6.090727151434289, "learning_rate": 4.262446273652562e-05, "step": 18260 }, { "epoch": 6.090727151434289, "loss": 0.2269206941127777, "step": 18260 }, { "ce_loss": 0.062054380774497986, "epoch": 6.090727151434289, "step": 18260 }, { "distill_loss": 0.10284656286239624, "epoch": 6.090727151434289, "step": 18260 }, { "epoch": 6.090727151434289, "ref_ce_loss": 0.04559963196516037, "step": 18260 }, { "epoch": 6.090727151434289, "loss": 0.30982521176338196, "step": 18260 }, { "ce_loss": 0.054068103432655334, "epoch": 6.090727151434289, "step": 18260 }, { "distill_loss": 0.15855297446250916, "epoch": 6.090727151434289, "step": 18260 }, { "epoch": 6.090727151434289, "ref_ce_loss": 0.07700146734714508, "step": 18260 }, { "epoch": 6.090727151434289, "loss": 0.23585665225982666, "step": 18260 }, { "ce_loss": 0.015442995354533195, "epoch": 6.090727151434289, "step": 18260 }, { "distill_loss": 0.10868162661790848, "epoch": 6.090727151434289, "step": 18260 }, { "epoch": 6.090727151434289, "ref_ce_loss": 0.041581302881240845, "step": 18260 }, { "epoch": 6.090727151434289, "loss": 0.17389631271362305, "step": 18260 }, { "ce_loss": 0.015349296852946281, "epoch": 6.090727151434289, "step": 18260 }, { "distill_loss": 0.11651057004928589, "epoch": 6.090727151434289, "step": 18260 }, { "epoch": 6.090727151434289, "ref_ce_loss": 0.041636236011981964, "step": 18260 }, { "epoch": 6.094062708472315, "loss": 0.2793, "step": 18270 }, { "epoch": 6.094062708472315, "grad_norm": 1.77875816822052, "step": 18270 }, { "epoch": 6.094062708472315, "learning_rate": 4.248311862395103e-05, "step": 18270 }, { "epoch": 6.094062708472315, "loss": 0.3750142753124237, "step": 18270 }, { "ce_loss": 0.09963914006948471, "epoch": 6.094062708472315, "step": 18270 }, { "distill_loss": 0.21521353721618652, "epoch": 6.094062708472315, "step": 18270 }, { "epoch": 6.094062708472315, "ref_ce_loss": 0.059947673231363297, "step": 18270 }, { "epoch": 6.094062708472315, "loss": 0.3384113311767578, "step": 18270 }, { "ce_loss": 0.049503691494464874, "epoch": 6.094062708472315, "step": 18270 }, { "distill_loss": 0.13446229696273804, "epoch": 6.094062708472315, "step": 18270 }, { "epoch": 6.094062708472315, "ref_ce_loss": 0.08077162504196167, "step": 18270 }, { "epoch": 6.094062708472315, "loss": 0.3235764503479004, "step": 18270 }, { "ce_loss": 0.07446154206991196, "epoch": 6.094062708472315, "step": 18270 }, { "distill_loss": 0.1346776783466339, "epoch": 6.094062708472315, "step": 18270 }, { "epoch": 6.094062708472315, "ref_ce_loss": 0.08494427055120468, "step": 18270 }, { "epoch": 6.094062708472315, "loss": 0.8587777614593506, "step": 18270 }, { "ce_loss": 0.08527383208274841, "epoch": 6.094062708472315, "step": 18270 }, { "distill_loss": 0.18738584220409393, "epoch": 6.094062708472315, "step": 18270 }, { "epoch": 6.094062708472315, "ref_ce_loss": 0.10342002660036087, "step": 18270 }, { "epoch": 6.09739826551034, "loss": 0.306, "step": 18280 }, { "epoch": 6.09739826551034, "grad_norm": 4.400006294250488, "step": 18280 }, { "epoch": 6.09739826551034, "learning_rate": 4.234197057975615e-05, "step": 18280 }, { "epoch": 6.09739826551034, "loss": 0.44186297059059143, "step": 18280 }, { "ce_loss": 0.08321335166692734, "epoch": 6.09739826551034, "step": 18280 }, { "distill_loss": 0.27311789989471436, "epoch": 6.09739826551034, "step": 18280 }, { "epoch": 6.09739826551034, "ref_ce_loss": 0.05534227564930916, "step": 18280 }, { "epoch": 6.09739826551034, "loss": 0.3389902710914612, "step": 18280 }, { "ce_loss": 0.07443110644817352, "epoch": 6.09739826551034, "step": 18280 }, { "distill_loss": 0.12935440242290497, "epoch": 6.09739826551034, "step": 18280 }, { "epoch": 6.09739826551034, "ref_ce_loss": 0.08985836803913116, "step": 18280 }, { "epoch": 6.09739826551034, "loss": 0.19584959745407104, "step": 18280 }, { "ce_loss": 0.014480430632829666, "epoch": 6.09739826551034, "step": 18280 }, { "distill_loss": 0.08713266998529434, "epoch": 6.09739826551034, "step": 18280 }, { "epoch": 6.09739826551034, "ref_ce_loss": 0.06017642840743065, "step": 18280 }, { "epoch": 6.09739826551034, "loss": 0.22717109322547913, "step": 18280 }, { "ce_loss": 0.03948648273944855, "epoch": 6.09739826551034, "step": 18280 }, { "distill_loss": 0.1253734976053238, "epoch": 6.09739826551034, "step": 18280 }, { "epoch": 6.09739826551034, "ref_ce_loss": 0.04686824604868889, "step": 18280 }, { "epoch": 6.100733822548365, "loss": 0.3242, "step": 18290 }, { "epoch": 6.100733822548365, "grad_norm": 1.7938190698623657, "step": 18290 }, { "epoch": 6.100733822548365, "learning_rate": 4.2201018861339226e-05, "step": 18290 }, { "epoch": 6.100733822548365, "loss": 0.26108479499816895, "step": 18290 }, { "ce_loss": 0.05302269011735916, "epoch": 6.100733822548365, "step": 18290 }, { "distill_loss": 0.11189974844455719, "epoch": 6.100733822548365, "step": 18290 }, { "epoch": 6.100733822548365, "ref_ce_loss": 0.05024619773030281, "step": 18290 }, { "epoch": 6.100733822548365, "loss": 0.37917935848236084, "step": 18290 }, { "ce_loss": 0.04029357060790062, "epoch": 6.100733822548365, "step": 18290 }, { "distill_loss": 0.12598711252212524, "epoch": 6.100733822548365, "step": 18290 }, { "epoch": 6.100733822548365, "ref_ce_loss": 0.07380714267492294, "step": 18290 }, { "epoch": 6.100733822548365, "loss": 0.32163190841674805, "step": 18290 }, { "ce_loss": 0.06296807527542114, "epoch": 6.100733822548365, "step": 18290 }, { "distill_loss": 0.16332390904426575, "epoch": 6.100733822548365, "step": 18290 }, { "epoch": 6.100733822548365, "ref_ce_loss": 0.09513794630765915, "step": 18290 }, { "epoch": 6.100733822548365, "loss": 0.2830953598022461, "step": 18290 }, { "ce_loss": 0.0462760366499424, "epoch": 6.100733822548365, "step": 18290 }, { "distill_loss": 0.15794722735881805, "epoch": 6.100733822548365, "step": 18290 }, { "epoch": 6.100733822548365, "ref_ce_loss": 0.05615334212779999, "step": 18290 }, { "epoch": 6.104069379586391, "loss": 0.3294, "step": 18300 }, { "epoch": 6.104069379586391, "grad_norm": 2.763949394226074, "step": 18300 }, { "epoch": 6.104069379586391, "learning_rate": 4.2060263725740756e-05, "step": 18300 }, { "epoch": 6.104069379586391, "loss": 0.392032265663147, "step": 18300 }, { "ce_loss": 0.09934784471988678, "epoch": 6.104069379586391, "step": 18300 }, { "distill_loss": 0.20907564461231232, "epoch": 6.104069379586391, "step": 18300 }, { "epoch": 6.104069379586391, "ref_ce_loss": 0.060490336269140244, "step": 18300 }, { "epoch": 6.104069379586391, "loss": 0.18854185938835144, "step": 18300 }, { "ce_loss": 0.019186168909072876, "epoch": 6.104069379586391, "step": 18300 }, { "distill_loss": 0.12396436184644699, "epoch": 6.104069379586391, "step": 18300 }, { "epoch": 6.104069379586391, "ref_ce_loss": 0.04528871178627014, "step": 18300 }, { "epoch": 6.104069379586391, "loss": 0.22943469882011414, "step": 18300 }, { "ce_loss": 0.04969404265284538, "epoch": 6.104069379586391, "step": 18300 }, { "distill_loss": 0.10475605726242065, "epoch": 6.104069379586391, "step": 18300 }, { "epoch": 6.104069379586391, "ref_ce_loss": 0.056972041726112366, "step": 18300 }, { "epoch": 6.104069379586391, "loss": 0.24096515774726868, "step": 18300 }, { "ce_loss": 0.00777512276545167, "epoch": 6.104069379586391, "step": 18300 }, { "distill_loss": 0.10689795762300491, "epoch": 6.104069379586391, "step": 18300 }, { "epoch": 6.104069379586391, "ref_ce_loss": 0.06155410408973694, "step": 18300 }, { "epoch": 6.107404936624416, "loss": 0.3067, "step": 18310 }, { "epoch": 6.107404936624416, "grad_norm": 2.4878458976745605, "step": 18310 }, { "epoch": 6.107404936624416, "learning_rate": 4.191970542964245e-05, "step": 18310 }, { "epoch": 6.107404936624416, "loss": 0.27044180035591125, "step": 18310 }, { "ce_loss": 0.022618383169174194, "epoch": 6.107404936624416, "step": 18310 }, { "distill_loss": 0.11641081422567368, "epoch": 6.107404936624416, "step": 18310 }, { "epoch": 6.107404936624416, "ref_ce_loss": 0.05938781052827835, "step": 18310 }, { "epoch": 6.107404936624416, "loss": 0.2108479142189026, "step": 18310 }, { "ce_loss": 0.019511261954903603, "epoch": 6.107404936624416, "step": 18310 }, { "distill_loss": 0.109732985496521, "epoch": 6.107404936624416, "step": 18310 }, { "epoch": 6.107404936624416, "ref_ce_loss": 0.027610134333372116, "step": 18310 }, { "epoch": 6.107404936624416, "loss": 0.6706960201263428, "step": 18310 }, { "ce_loss": 0.09902139008045197, "epoch": 6.107404936624416, "step": 18310 }, { "distill_loss": 0.16243474185466766, "epoch": 6.107404936624416, "step": 18310 }, { "epoch": 6.107404936624416, "ref_ce_loss": 0.06722650676965714, "step": 18310 }, { "epoch": 6.107404936624416, "loss": 0.3304760754108429, "step": 18310 }, { "ce_loss": 0.0915578156709671, "epoch": 6.107404936624416, "step": 18310 }, { "distill_loss": 0.1474878489971161, "epoch": 6.107404936624416, "step": 18310 }, { "epoch": 6.107404936624416, "ref_ce_loss": 0.04289526119828224, "step": 18310 }, { "epoch": 6.110740493662441, "loss": 0.3309, "step": 18320 }, { "epoch": 6.110740493662441, "grad_norm": 2.8061115741729736, "step": 18320 }, { "epoch": 6.110740493662441, "learning_rate": 4.177934422936725e-05, "step": 18320 }, { "epoch": 6.110740493662441, "loss": 0.21884343028068542, "step": 18320 }, { "ce_loss": 0.020756900310516357, "epoch": 6.110740493662441, "step": 18320 }, { "distill_loss": 0.0981239303946495, "epoch": 6.110740493662441, "step": 18320 }, { "epoch": 6.110740493662441, "ref_ce_loss": 0.05400104448199272, "step": 18320 }, { "epoch": 6.110740493662441, "loss": 0.6406104564666748, "step": 18320 }, { "ce_loss": 0.060525428503751755, "epoch": 6.110740493662441, "step": 18320 }, { "distill_loss": 0.15946635603904724, "epoch": 6.110740493662441, "step": 18320 }, { "epoch": 6.110740493662441, "ref_ce_loss": 0.09081117808818817, "step": 18320 }, { "epoch": 6.110740493662441, "loss": 0.19435235857963562, "step": 18320 }, { "ce_loss": 0.016969850286841393, "epoch": 6.110740493662441, "step": 18320 }, { "distill_loss": 0.11550012975931168, "epoch": 6.110740493662441, "step": 18320 }, { "epoch": 6.110740493662441, "ref_ce_loss": 0.04261741042137146, "step": 18320 }, { "epoch": 6.110740493662441, "loss": 0.3633389472961426, "step": 18320 }, { "ce_loss": 0.028799138963222504, "epoch": 6.110740493662441, "step": 18320 }, { "distill_loss": 0.2342735230922699, "epoch": 6.110740493662441, "step": 18320 }, { "epoch": 6.110740493662441, "ref_ce_loss": 0.07534074783325195, "step": 18320 }, { "epoch": 6.114076050700467, "loss": 0.3231, "step": 18330 }, { "epoch": 6.114076050700467, "grad_norm": 2.804523468017578, "step": 18330 }, { "epoch": 6.114076050700467, "learning_rate": 4.163918038087865e-05, "step": 18330 }, { "epoch": 6.114076050700467, "loss": 0.22182205319404602, "step": 18330 }, { "ce_loss": 0.04863829165697098, "epoch": 6.114076050700467, "step": 18330 }, { "distill_loss": 0.10979770869016647, "epoch": 6.114076050700467, "step": 18330 }, { "epoch": 6.114076050700467, "ref_ce_loss": 0.06329606473445892, "step": 18330 }, { "epoch": 6.114076050700467, "loss": 0.19124485552310944, "step": 18330 }, { "ce_loss": 0.019544605165719986, "epoch": 6.114076050700467, "step": 18330 }, { "distill_loss": 0.10157377272844315, "epoch": 6.114076050700467, "step": 18330 }, { "epoch": 6.114076050700467, "ref_ce_loss": 0.045619696378707886, "step": 18330 }, { "epoch": 6.114076050700467, "loss": 0.30762940645217896, "step": 18330 }, { "ce_loss": 0.023676371201872826, "epoch": 6.114076050700467, "step": 18330 }, { "distill_loss": 0.09587828814983368, "epoch": 6.114076050700467, "step": 18330 }, { "epoch": 6.114076050700467, "ref_ce_loss": 0.04057086259126663, "step": 18330 }, { "epoch": 6.114076050700467, "loss": 0.26268070936203003, "step": 18330 }, { "ce_loss": 0.03551900386810303, "epoch": 6.114076050700467, "step": 18330 }, { "distill_loss": 0.11628009378910065, "epoch": 6.114076050700467, "step": 18330 }, { "epoch": 6.114076050700467, "ref_ce_loss": 0.04114997386932373, "step": 18330 }, { "epoch": 6.117411607738492, "loss": 0.3617, "step": 18340 }, { "epoch": 6.117411607738492, "grad_norm": 3.2843000888824463, "step": 18340 }, { "epoch": 6.117411607738492, "learning_rate": 4.149921413978014e-05, "step": 18340 }, { "epoch": 6.117411607738492, "loss": 0.26372018456459045, "step": 18340 }, { "ce_loss": 0.022633861750364304, "epoch": 6.117411607738492, "step": 18340 }, { "distill_loss": 0.1458633840084076, "epoch": 6.117411607738492, "step": 18340 }, { "epoch": 6.117411607738492, "ref_ce_loss": 0.05152313411235809, "step": 18340 }, { "epoch": 6.117411607738492, "loss": 0.29105961322784424, "step": 18340 }, { "ce_loss": 0.09380751103162766, "epoch": 6.117411607738492, "step": 18340 }, { "distill_loss": 0.1378300040960312, "epoch": 6.117411607738492, "step": 18340 }, { "epoch": 6.117411607738492, "ref_ce_loss": 0.059045396745204926, "step": 18340 }, { "epoch": 6.117411607738492, "loss": 0.4499114751815796, "step": 18340 }, { "ce_loss": 0.045432500541210175, "epoch": 6.117411607738492, "step": 18340 }, { "distill_loss": 0.14073380827903748, "epoch": 6.117411607738492, "step": 18340 }, { "epoch": 6.117411607738492, "ref_ce_loss": 0.056476663798093796, "step": 18340 }, { "epoch": 6.117411607738492, "loss": 0.19718751311302185, "step": 18340 }, { "ce_loss": 0.031024429947137833, "epoch": 6.117411607738492, "step": 18340 }, { "distill_loss": 0.0994877964258194, "epoch": 6.117411607738492, "step": 18340 }, { "epoch": 6.117411607738492, "ref_ce_loss": 0.056632742285728455, "step": 18340 }, { "epoch": 6.1207471647765175, "loss": 0.3108, "step": 18350 }, { "epoch": 6.1207471647765175, "grad_norm": 2.1846914291381836, "step": 18350 }, { "epoch": 6.1207471647765175, "learning_rate": 4.1359445761314926e-05, "step": 18350 }, { "epoch": 6.1207471647765175, "loss": 0.1581880897283554, "step": 18350 }, { "ce_loss": 0.01265860628336668, "epoch": 6.1207471647765175, "step": 18350 }, { "distill_loss": 0.09621373564004898, "epoch": 6.1207471647765175, "step": 18350 }, { "epoch": 6.1207471647765175, "ref_ce_loss": 0.0492212139070034, "step": 18350 }, { "epoch": 6.1207471647765175, "loss": 0.3098943829536438, "step": 18350 }, { "ce_loss": 0.0733032152056694, "epoch": 6.1207471647765175, "step": 18350 }, { "distill_loss": 0.16024382412433624, "epoch": 6.1207471647765175, "step": 18350 }, { "epoch": 6.1207471647765175, "ref_ce_loss": 0.05418863520026207, "step": 18350 }, { "epoch": 6.1207471647765175, "loss": 0.24440094828605652, "step": 18350 }, { "ce_loss": 0.04196551814675331, "epoch": 6.1207471647765175, "step": 18350 }, { "distill_loss": 0.1325785517692566, "epoch": 6.1207471647765175, "step": 18350 }, { "epoch": 6.1207471647765175, "ref_ce_loss": 0.06961900740861893, "step": 18350 }, { "epoch": 6.1207471647765175, "loss": 0.22027157247066498, "step": 18350 }, { "ce_loss": 0.02423410303890705, "epoch": 6.1207471647765175, "step": 18350 }, { "distill_loss": 0.11101806163787842, "epoch": 6.1207471647765175, "step": 18350 }, { "epoch": 6.1207471647765175, "ref_ce_loss": 0.029094528406858444, "step": 18350 }, { "epoch": 6.124082721814543, "loss": 0.3069, "step": 18360 }, { "epoch": 6.124082721814543, "grad_norm": 3.5036215782165527, "step": 18360 }, { "epoch": 6.124082721814543, "learning_rate": 4.1219875500365516e-05, "step": 18360 }, { "epoch": 6.124082721814543, "loss": 0.17867614328861237, "step": 18360 }, { "ce_loss": 0.02497381530702114, "epoch": 6.124082721814543, "step": 18360 }, { "distill_loss": 0.07437723875045776, "epoch": 6.124082721814543, "step": 18360 }, { "epoch": 6.124082721814543, "ref_ce_loss": 0.04872718080878258, "step": 18360 }, { "epoch": 6.124082721814543, "loss": 0.258394718170166, "step": 18360 }, { "ce_loss": 0.0596681647002697, "epoch": 6.124082721814543, "step": 18360 }, { "distill_loss": 0.13986316323280334, "epoch": 6.124082721814543, "step": 18360 }, { "epoch": 6.124082721814543, "ref_ce_loss": 0.0443265363574028, "step": 18360 }, { "epoch": 6.124082721814543, "loss": 0.15350134670734406, "step": 18360 }, { "ce_loss": 0.015066183172166348, "epoch": 6.124082721814543, "step": 18360 }, { "distill_loss": 0.10006022453308105, "epoch": 6.124082721814543, "step": 18360 }, { "epoch": 6.124082721814543, "ref_ce_loss": 0.038168370723724365, "step": 18360 }, { "epoch": 6.124082721814543, "loss": 0.2913787066936493, "step": 18360 }, { "ce_loss": 0.03667327016592026, "epoch": 6.124082721814543, "step": 18360 }, { "distill_loss": 0.1875118911266327, "epoch": 6.124082721814543, "step": 18360 }, { "epoch": 6.124082721814543, "ref_ce_loss": 0.0670209750533104, "step": 18360 }, { "epoch": 6.127418278852568, "loss": 0.2767, "step": 18370 }, { "epoch": 6.127418278852568, "grad_norm": 2.7282140254974365, "step": 18370 }, { "epoch": 6.127418278852568, "learning_rate": 4.108050361145291e-05, "step": 18370 }, { "epoch": 6.127418278852568, "loss": 0.27476048469543457, "step": 18370 }, { "ce_loss": 0.036131396889686584, "epoch": 6.127418278852568, "step": 18370 }, { "distill_loss": 0.1398783028125763, "epoch": 6.127418278852568, "step": 18370 }, { "epoch": 6.127418278852568, "ref_ce_loss": 0.059157684445381165, "step": 18370 }, { "epoch": 6.127418278852568, "loss": 0.2925405502319336, "step": 18370 }, { "ce_loss": 0.024877002462744713, "epoch": 6.127418278852568, "step": 18370 }, { "distill_loss": 0.16725745797157288, "epoch": 6.127418278852568, "step": 18370 }, { "epoch": 6.127418278852568, "ref_ce_loss": 0.06880000978708267, "step": 18370 }, { "epoch": 6.127418278852568, "loss": 0.7054593563079834, "step": 18370 }, { "ce_loss": 0.09368545562028885, "epoch": 6.127418278852568, "step": 18370 }, { "distill_loss": 0.1227211132645607, "epoch": 6.127418278852568, "step": 18370 }, { "epoch": 6.127418278852568, "ref_ce_loss": 0.048069968819618225, "step": 18370 }, { "epoch": 6.127418278852568, "loss": 0.7590700387954712, "step": 18370 }, { "ce_loss": 0.07026736438274384, "epoch": 6.127418278852568, "step": 18370 }, { "distill_loss": 0.19734793901443481, "epoch": 6.127418278852568, "step": 18370 }, { "epoch": 6.127418278852568, "ref_ce_loss": 0.03684297204017639, "step": 18370 }, { "epoch": 6.1307538358905935, "loss": 0.3519, "step": 18380 }, { "epoch": 6.1307538358905935, "grad_norm": 2.439100742340088, "step": 18380 }, { "epoch": 6.1307538358905935, "learning_rate": 4.0941330348736525e-05, "step": 18380 }, { "epoch": 6.1307538358905935, "loss": 0.20850054919719696, "step": 18380 }, { "ce_loss": 0.03334977477788925, "epoch": 6.1307538358905935, "step": 18380 }, { "distill_loss": 0.11176758259534836, "epoch": 6.1307538358905935, "step": 18380 }, { "epoch": 6.1307538358905935, "ref_ce_loss": 0.04599212110042572, "step": 18380 }, { "epoch": 6.1307538358905935, "loss": 0.3658720552921295, "step": 18380 }, { "ce_loss": 0.055781181901693344, "epoch": 6.1307538358905935, "step": 18380 }, { "distill_loss": 0.21307574212551117, "epoch": 6.1307538358905935, "step": 18380 }, { "epoch": 6.1307538358905935, "ref_ce_loss": 0.04664365574717522, "step": 18380 }, { "epoch": 6.1307538358905935, "loss": 0.30071496963500977, "step": 18380 }, { "ce_loss": 0.03584017977118492, "epoch": 6.1307538358905935, "step": 18380 }, { "distill_loss": 0.13493163883686066, "epoch": 6.1307538358905935, "step": 18380 }, { "epoch": 6.1307538358905935, "ref_ce_loss": 0.06915021687746048, "step": 18380 }, { "epoch": 6.1307538358905935, "loss": 0.26241907477378845, "step": 18380 }, { "ce_loss": 0.049552708864212036, "epoch": 6.1307538358905935, "step": 18380 }, { "distill_loss": 0.14699327945709229, "epoch": 6.1307538358905935, "step": 18380 }, { "epoch": 6.1307538358905935, "ref_ce_loss": 0.06564631313085556, "step": 18380 }, { "epoch": 6.134089392928619, "loss": 0.3589, "step": 18390 }, { "epoch": 6.134089392928619, "grad_norm": 3.0384786128997803, "step": 18390 }, { "epoch": 6.134089392928619, "learning_rate": 4.080235596601341e-05, "step": 18390 }, { "epoch": 6.134089392928619, "loss": 0.24150526523590088, "step": 18390 }, { "ce_loss": 0.01162660401314497, "epoch": 6.134089392928619, "step": 18390 }, { "distill_loss": 0.10201402008533478, "epoch": 6.134089392928619, "step": 18390 }, { "epoch": 6.134089392928619, "ref_ce_loss": 0.055936913937330246, "step": 18390 }, { "epoch": 6.134089392928619, "loss": 0.20581991970539093, "step": 18390 }, { "ce_loss": 0.0499425008893013, "epoch": 6.134089392928619, "step": 18390 }, { "distill_loss": 0.10892849415540695, "epoch": 6.134089392928619, "step": 18390 }, { "epoch": 6.134089392928619, "ref_ce_loss": 0.046872884035110474, "step": 18390 }, { "epoch": 6.134089392928619, "loss": 0.4894862174987793, "step": 18390 }, { "ce_loss": 0.0837978646159172, "epoch": 6.134089392928619, "step": 18390 }, { "distill_loss": 0.13355745375156403, "epoch": 6.134089392928619, "step": 18390 }, { "epoch": 6.134089392928619, "ref_ce_loss": 0.06904693692922592, "step": 18390 }, { "epoch": 6.134089392928619, "loss": 0.27033209800720215, "step": 18390 }, { "ce_loss": 0.0719723179936409, "epoch": 6.134089392928619, "step": 18390 }, { "distill_loss": 0.1512235701084137, "epoch": 6.134089392928619, "step": 18390 }, { "epoch": 6.134089392928619, "ref_ce_loss": 0.035801418125629425, "step": 18390 }, { "epoch": 6.137424949966644, "loss": 0.2963, "step": 18400 }, { "epoch": 6.137424949966644, "grad_norm": 2.858560085296631, "step": 18400 }, { "epoch": 6.137424949966644, "learning_rate": 4.0663580716718046e-05, "step": 18400 }, { "epoch": 6.137424949966644, "loss": 0.2508600950241089, "step": 18400 }, { "ce_loss": 0.0432690791785717, "epoch": 6.137424949966644, "step": 18400 }, { "distill_loss": 0.13280202448368073, "epoch": 6.137424949966644, "step": 18400 }, { "epoch": 6.137424949966644, "ref_ce_loss": 0.06123881787061691, "step": 18400 }, { "epoch": 6.137424949966644, "loss": 0.4933200776576996, "step": 18400 }, { "ce_loss": 0.05571569874882698, "epoch": 6.137424949966644, "step": 18400 }, { "distill_loss": 0.18126748502254486, "epoch": 6.137424949966644, "step": 18400 }, { "epoch": 6.137424949966644, "ref_ce_loss": 0.10519138723611832, "step": 18400 }, { "epoch": 6.137424949966644, "loss": 0.2924741804599762, "step": 18400 }, { "ce_loss": 0.063815638422966, "epoch": 6.137424949966644, "step": 18400 }, { "distill_loss": 0.15766538679599762, "epoch": 6.137424949966644, "step": 18400 }, { "epoch": 6.137424949966644, "ref_ce_loss": 0.07054778188467026, "step": 18400 }, { "epoch": 6.137424949966644, "loss": 0.23580388724803925, "step": 18400 }, { "ce_loss": 0.05556456372141838, "epoch": 6.137424949966644, "step": 18400 }, { "distill_loss": 0.11789394915103912, "epoch": 6.137424949966644, "step": 18400 }, { "epoch": 6.137424949966644, "ref_ce_loss": 0.053790390491485596, "step": 18400 }, { "epoch": 6.14076050700467, "loss": 0.2943, "step": 18410 }, { "epoch": 6.14076050700467, "grad_norm": 1.7060301303863525, "step": 18410 }, { "epoch": 6.14076050700467, "learning_rate": 4.052500485392176e-05, "step": 18410 }, { "epoch": 6.14076050700467, "loss": 0.23654238879680634, "step": 18410 }, { "ce_loss": 0.06270783394575119, "epoch": 6.14076050700467, "step": 18410 }, { "distill_loss": 0.10348555445671082, "epoch": 6.14076050700467, "step": 18410 }, { "epoch": 6.14076050700467, "ref_ce_loss": 0.07024077326059341, "step": 18410 }, { "epoch": 6.14076050700467, "loss": 0.2675868272781372, "step": 18410 }, { "ce_loss": 0.03235636278986931, "epoch": 6.14076050700467, "step": 18410 }, { "distill_loss": 0.14261578023433685, "epoch": 6.14076050700467, "step": 18410 }, { "epoch": 6.14076050700467, "ref_ce_loss": 0.07388745993375778, "step": 18410 }, { "epoch": 6.14076050700467, "loss": 0.4724261462688446, "step": 18410 }, { "ce_loss": 0.12275946140289307, "epoch": 6.14076050700467, "step": 18410 }, { "distill_loss": 0.22504441440105438, "epoch": 6.14076050700467, "step": 18410 }, { "epoch": 6.14076050700467, "ref_ce_loss": 0.08914220333099365, "step": 18410 }, { "epoch": 6.14076050700467, "loss": 0.21506237983703613, "step": 18410 }, { "ce_loss": 0.015775134786963463, "epoch": 6.14076050700467, "step": 18410 }, { "distill_loss": 0.1167135238647461, "epoch": 6.14076050700467, "step": 18410 }, { "epoch": 6.14076050700467, "ref_ce_loss": 0.05251457169651985, "step": 18410 }, { "epoch": 6.144096064042695, "loss": 0.3415, "step": 18420 }, { "epoch": 6.144096064042695, "grad_norm": 2.5018398761749268, "step": 18420 }, { "epoch": 6.144096064042695, "learning_rate": 4.038662863033226e-05, "step": 18420 }, { "epoch": 6.144096064042695, "loss": 0.2518661320209503, "step": 18420 }, { "ce_loss": 0.04636304825544357, "epoch": 6.144096064042695, "step": 18420 }, { "distill_loss": 0.1221461296081543, "epoch": 6.144096064042695, "step": 18420 }, { "epoch": 6.144096064042695, "ref_ce_loss": 0.0639716386795044, "step": 18420 }, { "epoch": 6.144096064042695, "loss": 0.1405506581068039, "step": 18420 }, { "ce_loss": 0.009569302201271057, "epoch": 6.144096064042695, "step": 18420 }, { "distill_loss": 0.09424649178981781, "epoch": 6.144096064042695, "step": 18420 }, { "epoch": 6.144096064042695, "ref_ce_loss": 0.036644306033849716, "step": 18420 }, { "epoch": 6.144096064042695, "loss": 0.3834973871707916, "step": 18420 }, { "ce_loss": 0.08157524466514587, "epoch": 6.144096064042695, "step": 18420 }, { "distill_loss": 0.12001895904541016, "epoch": 6.144096064042695, "step": 18420 }, { "epoch": 6.144096064042695, "ref_ce_loss": 0.08984258025884628, "step": 18420 }, { "epoch": 6.144096064042695, "loss": 0.34268125891685486, "step": 18420 }, { "ce_loss": 0.058754097670316696, "epoch": 6.144096064042695, "step": 18420 }, { "distill_loss": 0.217606782913208, "epoch": 6.144096064042695, "step": 18420 }, { "epoch": 6.144096064042695, "ref_ce_loss": 0.04520254582166672, "step": 18420 }, { "epoch": 6.14743162108072, "loss": 0.329, "step": 18430 }, { "epoch": 6.14743162108072, "grad_norm": 2.1517646312713623, "step": 18430 }, { "epoch": 6.14743162108072, "learning_rate": 4.024845229829323e-05, "step": 18430 }, { "epoch": 6.14743162108072, "loss": 0.3015998303890228, "step": 18430 }, { "ce_loss": 0.062217775732278824, "epoch": 6.14743162108072, "step": 18430 }, { "distill_loss": 0.131020188331604, "epoch": 6.14743162108072, "step": 18430 }, { "epoch": 6.14743162108072, "ref_ce_loss": 0.06910224258899689, "step": 18430 }, { "epoch": 6.14743162108072, "loss": 0.2663721442222595, "step": 18430 }, { "ce_loss": 0.036985091865062714, "epoch": 6.14743162108072, "step": 18430 }, { "distill_loss": 0.13282321393489838, "epoch": 6.14743162108072, "step": 18430 }, { "epoch": 6.14743162108072, "ref_ce_loss": 0.06474227458238602, "step": 18430 }, { "epoch": 6.14743162108072, "loss": 0.278898686170578, "step": 18430 }, { "ce_loss": 0.07325167953968048, "epoch": 6.14743162108072, "step": 18430 }, { "distill_loss": 0.14880669116973877, "epoch": 6.14743162108072, "step": 18430 }, { "epoch": 6.14743162108072, "ref_ce_loss": 0.05667537450790405, "step": 18430 }, { "epoch": 6.14743162108072, "loss": 0.28270652890205383, "step": 18430 }, { "ce_loss": 0.06054554879665375, "epoch": 6.14743162108072, "step": 18430 }, { "distill_loss": 0.15518692135810852, "epoch": 6.14743162108072, "step": 18430 }, { "epoch": 6.14743162108072, "ref_ce_loss": 0.06662452220916748, "step": 18430 }, { "epoch": 6.150767178118746, "loss": 0.3253, "step": 18440 }, { "epoch": 6.150767178118746, "grad_norm": 2.1012563705444336, "step": 18440 }, { "epoch": 6.150767178118746, "learning_rate": 4.0110476109783726e-05, "step": 18440 }, { "epoch": 6.150767178118746, "loss": 0.36245232820510864, "step": 18440 }, { "ce_loss": 0.053363729268312454, "epoch": 6.150767178118746, "step": 18440 }, { "distill_loss": 0.13131973147392273, "epoch": 6.150767178118746, "step": 18440 }, { "epoch": 6.150767178118746, "ref_ce_loss": 0.09455405175685883, "step": 18440 }, { "epoch": 6.150767178118746, "loss": 0.29794377088546753, "step": 18440 }, { "ce_loss": 0.03110991045832634, "epoch": 6.150767178118746, "step": 18440 }, { "distill_loss": 0.12349390238523483, "epoch": 6.150767178118746, "step": 18440 }, { "epoch": 6.150767178118746, "ref_ce_loss": 0.06310545653104782, "step": 18440 }, { "epoch": 6.150767178118746, "loss": 0.3318029046058655, "step": 18440 }, { "ce_loss": 0.06962364166975021, "epoch": 6.150767178118746, "step": 18440 }, { "distill_loss": 0.0969667062163353, "epoch": 6.150767178118746, "step": 18440 }, { "epoch": 6.150767178118746, "ref_ce_loss": 0.10596779733896255, "step": 18440 }, { "epoch": 6.150767178118746, "loss": 0.33284032344818115, "step": 18440 }, { "ce_loss": 0.042060963809490204, "epoch": 6.150767178118746, "step": 18440 }, { "distill_loss": 0.11621900647878647, "epoch": 6.150767178118746, "step": 18440 }, { "epoch": 6.150767178118746, "ref_ce_loss": 0.04753798618912697, "step": 18440 }, { "epoch": 6.154102735156771, "loss": 0.3369, "step": 18450 }, { "epoch": 6.154102735156771, "grad_norm": 2.206515312194824, "step": 18450 }, { "epoch": 6.154102735156771, "learning_rate": 3.997270031641791e-05, "step": 18450 }, { "epoch": 6.154102735156771, "loss": 0.46877241134643555, "step": 18450 }, { "ce_loss": 0.08415161073207855, "epoch": 6.154102735156771, "step": 18450 }, { "distill_loss": 0.1510000228881836, "epoch": 6.154102735156771, "step": 18450 }, { "epoch": 6.154102735156771, "ref_ce_loss": 0.07368329912424088, "step": 18450 }, { "epoch": 6.154102735156771, "loss": 0.25681328773498535, "step": 18450 }, { "ce_loss": 0.07156962156295776, "epoch": 6.154102735156771, "step": 18450 }, { "distill_loss": 0.1276722401380539, "epoch": 6.154102735156771, "step": 18450 }, { "epoch": 6.154102735156771, "ref_ce_loss": 0.05735990032553673, "step": 18450 }, { "epoch": 6.154102735156771, "loss": 0.3189067840576172, "step": 18450 }, { "ce_loss": 0.04861884191632271, "epoch": 6.154102735156771, "step": 18450 }, { "distill_loss": 0.13847175240516663, "epoch": 6.154102735156771, "step": 18450 }, { "epoch": 6.154102735156771, "ref_ce_loss": 0.08016352355480194, "step": 18450 }, { "epoch": 6.154102735156771, "loss": 0.26889464259147644, "step": 18450 }, { "ce_loss": 0.06599020957946777, "epoch": 6.154102735156771, "step": 18450 }, { "distill_loss": 0.1534539759159088, "epoch": 6.154102735156771, "step": 18450 }, { "epoch": 6.154102735156771, "ref_ce_loss": 0.048720572143793106, "step": 18450 }, { "epoch": 6.157438292194796, "loss": 0.296, "step": 18460 }, { "epoch": 6.157438292194796, "grad_norm": 1.9747674465179443, "step": 18460 }, { "epoch": 6.157438292194796, "learning_rate": 3.9835125169444485e-05, "step": 18460 }, { "epoch": 6.157438292194796, "loss": 0.2924409806728363, "step": 18460 }, { "ce_loss": 0.04371657222509384, "epoch": 6.157438292194796, "step": 18460 }, { "distill_loss": 0.14328262209892273, "epoch": 6.157438292194796, "step": 18460 }, { "epoch": 6.157438292194796, "ref_ce_loss": 0.056337349116802216, "step": 18460 }, { "epoch": 6.157438292194796, "loss": 0.39455586671829224, "step": 18460 }, { "ce_loss": 0.07246027141809464, "epoch": 6.157438292194796, "step": 18460 }, { "distill_loss": 0.14171691238880157, "epoch": 6.157438292194796, "step": 18460 }, { "epoch": 6.157438292194796, "ref_ce_loss": 0.08444765955209732, "step": 18460 }, { "epoch": 6.157438292194796, "loss": 0.2808062434196472, "step": 18460 }, { "ce_loss": 0.037363942712545395, "epoch": 6.157438292194796, "step": 18460 }, { "distill_loss": 0.17661446332931519, "epoch": 6.157438292194796, "step": 18460 }, { "epoch": 6.157438292194796, "ref_ce_loss": 0.043351124972105026, "step": 18460 }, { "epoch": 6.157438292194796, "loss": 0.27473172545433044, "step": 18460 }, { "ce_loss": 0.062011852860450745, "epoch": 6.157438292194796, "step": 18460 }, { "distill_loss": 0.11762156337499619, "epoch": 6.157438292194796, "step": 18460 }, { "epoch": 6.157438292194796, "ref_ce_loss": 0.06052962690591812, "step": 18460 }, { "epoch": 6.160773849232822, "loss": 0.3145, "step": 18470 }, { "epoch": 6.160773849232822, "grad_norm": 2.9975457191467285, "step": 18470 }, { "epoch": 6.160773849232822, "learning_rate": 3.9697750919746255e-05, "step": 18470 }, { "epoch": 6.160773849232822, "loss": 0.8400434255599976, "step": 18470 }, { "ce_loss": 0.06479611247777939, "epoch": 6.160773849232822, "step": 18470 }, { "distill_loss": 0.13007411360740662, "epoch": 6.160773849232822, "step": 18470 }, { "epoch": 6.160773849232822, "ref_ce_loss": 0.0658654272556305, "step": 18470 }, { "epoch": 6.160773849232822, "loss": 0.17636524140834808, "step": 18470 }, { "ce_loss": 0.019910158589482307, "epoch": 6.160773849232822, "step": 18470 }, { "distill_loss": 0.08979976177215576, "epoch": 6.160773849232822, "step": 18470 }, { "epoch": 6.160773849232822, "ref_ce_loss": 0.03886188566684723, "step": 18470 }, { "epoch": 6.160773849232822, "loss": 0.30023443698883057, "step": 18470 }, { "ce_loss": 0.01413907390087843, "epoch": 6.160773849232822, "step": 18470 }, { "distill_loss": 0.10442767292261124, "epoch": 6.160773849232822, "step": 18470 }, { "epoch": 6.160773849232822, "ref_ce_loss": 0.04619559645652771, "step": 18470 }, { "epoch": 6.160773849232822, "loss": 0.3421204686164856, "step": 18470 }, { "ce_loss": 0.06843894720077515, "epoch": 6.160773849232822, "step": 18470 }, { "distill_loss": 0.13290368020534515, "epoch": 6.160773849232822, "step": 18470 }, { "epoch": 6.160773849232822, "ref_ce_loss": 0.09307733923196793, "step": 18470 }, { "epoch": 6.164109406270847, "loss": 0.3195, "step": 18480 }, { "epoch": 6.164109406270847, "grad_norm": 2.4326090812683105, "step": 18480 }, { "epoch": 6.164109406270847, "learning_rate": 3.9560577817839664e-05, "step": 18480 }, { "epoch": 6.164109406270847, "loss": 0.2917850613594055, "step": 18480 }, { "ce_loss": 0.06054378300905228, "epoch": 6.164109406270847, "step": 18480 }, { "distill_loss": 0.12447196245193481, "epoch": 6.164109406270847, "step": 18480 }, { "epoch": 6.164109406270847, "ref_ce_loss": 0.04743599891662598, "step": 18480 }, { "epoch": 6.164109406270847, "loss": 0.25405004620552063, "step": 18480 }, { "ce_loss": 0.04941348731517792, "epoch": 6.164109406270847, "step": 18480 }, { "distill_loss": 0.14396850764751434, "epoch": 6.164109406270847, "step": 18480 }, { "epoch": 6.164109406270847, "ref_ce_loss": 0.049728021025657654, "step": 18480 }, { "epoch": 6.164109406270847, "loss": 0.42917001247406006, "step": 18480 }, { "ce_loss": 0.018850069493055344, "epoch": 6.164109406270847, "step": 18480 }, { "distill_loss": 0.16858190298080444, "epoch": 6.164109406270847, "step": 18480 }, { "epoch": 6.164109406270847, "ref_ce_loss": 0.04189824312925339, "step": 18480 }, { "epoch": 6.164109406270847, "loss": 0.37748169898986816, "step": 18480 }, { "ce_loss": 0.061456672847270966, "epoch": 6.164109406270847, "step": 18480 }, { "distill_loss": 0.1580883413553238, "epoch": 6.164109406270847, "step": 18480 }, { "epoch": 6.164109406270847, "ref_ce_loss": 0.07029583305120468, "step": 18480 }, { "epoch": 6.167444963308872, "loss": 0.3445, "step": 18490 }, { "epoch": 6.167444963308872, "grad_norm": 2.8507888317108154, "step": 18490 }, { "epoch": 6.167444963308872, "learning_rate": 3.942360611387438e-05, "step": 18490 }, { "epoch": 6.167444963308872, "loss": 0.3325313627719879, "step": 18490 }, { "ce_loss": 0.049201712012290955, "epoch": 6.167444963308872, "step": 18490 }, { "distill_loss": 0.12399528175592422, "epoch": 6.167444963308872, "step": 18490 }, { "epoch": 6.167444963308872, "ref_ce_loss": 0.0797731950879097, "step": 18490 }, { "epoch": 6.167444963308872, "loss": 0.2676965296268463, "step": 18490 }, { "ce_loss": 0.038861967623233795, "epoch": 6.167444963308872, "step": 18490 }, { "distill_loss": 0.12871843576431274, "epoch": 6.167444963308872, "step": 18490 }, { "epoch": 6.167444963308872, "ref_ce_loss": 0.06563683599233627, "step": 18490 }, { "epoch": 6.167444963308872, "loss": 0.23812928795814514, "step": 18490 }, { "ce_loss": 0.04163096100091934, "epoch": 6.167444963308872, "step": 18490 }, { "distill_loss": 0.13853895664215088, "epoch": 6.167444963308872, "step": 18490 }, { "epoch": 6.167444963308872, "ref_ce_loss": 0.04453599825501442, "step": 18490 }, { "epoch": 6.167444963308872, "loss": 0.20199045538902283, "step": 18490 }, { "ce_loss": 0.020122459158301353, "epoch": 6.167444963308872, "step": 18490 }, { "distill_loss": 0.11555896699428558, "epoch": 6.167444963308872, "step": 18490 }, { "epoch": 6.167444963308872, "ref_ce_loss": 0.044083766639232635, "step": 18490 }, { "epoch": 6.170780520346898, "loss": 0.3082, "step": 18500 }, { "epoch": 6.170780520346898, "grad_norm": 5.568923473358154, "step": 18500 }, { "epoch": 6.170780520346898, "learning_rate": 3.928683605763267e-05, "step": 18500 }, { "epoch": 6.170780520346898, "loss": 0.35506847500801086, "step": 18500 }, { "ce_loss": 0.05164219066500664, "epoch": 6.170780520346898, "step": 18500 }, { "distill_loss": 0.13775408267974854, "epoch": 6.170780520346898, "step": 18500 }, { "epoch": 6.170780520346898, "ref_ce_loss": 0.05966204032301903, "step": 18500 }, { "epoch": 6.170780520346898, "loss": 0.277235209941864, "step": 18500 }, { "ce_loss": 0.0458344966173172, "epoch": 6.170780520346898, "step": 18500 }, { "distill_loss": 0.14046710729599, "epoch": 6.170780520346898, "step": 18500 }, { "epoch": 6.170780520346898, "ref_ce_loss": 0.06634150445461273, "step": 18500 }, { "epoch": 6.170780520346898, "loss": 0.26391613483428955, "step": 18500 }, { "ce_loss": 0.006200232543051243, "epoch": 6.170780520346898, "step": 18500 }, { "distill_loss": 0.13295315206050873, "epoch": 6.170780520346898, "step": 18500 }, { "epoch": 6.170780520346898, "ref_ce_loss": 0.03400679677724838, "step": 18500 }, { "epoch": 6.170780520346898, "loss": 0.7266954779624939, "step": 18500 }, { "ce_loss": 0.05183488875627518, "epoch": 6.170780520346898, "step": 18500 }, { "distill_loss": 0.13190777599811554, "epoch": 6.170780520346898, "step": 18500 }, { "epoch": 6.170780520346898, "ref_ce_loss": 0.06639143824577332, "step": 18500 }, { "epoch": 6.174116077384923, "loss": 0.3219, "step": 18510 }, { "epoch": 6.174116077384923, "grad_norm": 3.9889957904815674, "step": 18510 }, { "epoch": 6.174116077384923, "learning_rate": 3.915026789852921e-05, "step": 18510 }, { "epoch": 6.174116077384923, "loss": 0.281459778547287, "step": 18510 }, { "ce_loss": 0.04788195714354515, "epoch": 6.174116077384923, "step": 18510 }, { "distill_loss": 0.1647387593984604, "epoch": 6.174116077384923, "step": 18510 }, { "epoch": 6.174116077384923, "ref_ce_loss": 0.06857550889253616, "step": 18510 }, { "epoch": 6.174116077384923, "loss": 0.3966805934906006, "step": 18510 }, { "ce_loss": 0.060191236436367035, "epoch": 6.174116077384923, "step": 18510 }, { "distill_loss": 0.19976350665092468, "epoch": 6.174116077384923, "step": 18510 }, { "epoch": 6.174116077384923, "ref_ce_loss": 0.07919872552156448, "step": 18510 }, { "epoch": 6.174116077384923, "loss": 0.5716406106948853, "step": 18510 }, { "ce_loss": 0.04771149158477783, "epoch": 6.174116077384923, "step": 18510 }, { "distill_loss": 0.11342655122280121, "epoch": 6.174116077384923, "step": 18510 }, { "epoch": 6.174116077384923, "ref_ce_loss": 0.07589419186115265, "step": 18510 }, { "epoch": 6.174116077384923, "loss": 0.19318389892578125, "step": 18510 }, { "ce_loss": 0.01549871638417244, "epoch": 6.174116077384923, "step": 18510 }, { "distill_loss": 0.10186982154846191, "epoch": 6.174116077384923, "step": 18510 }, { "epoch": 6.174116077384923, "ref_ce_loss": 0.04499030485749245, "step": 18510 }, { "epoch": 6.177451634422948, "loss": 0.3339, "step": 18520 }, { "epoch": 6.177451634422948, "grad_norm": 3.996617078781128, "step": 18520 }, { "epoch": 6.177451634422948, "learning_rate": 3.901390188561046e-05, "step": 18520 }, { "epoch": 6.177451634422948, "loss": 0.2945614159107208, "step": 18520 }, { "ce_loss": 0.04369880631566048, "epoch": 6.177451634422948, "step": 18520 }, { "distill_loss": 0.14246287941932678, "epoch": 6.177451634422948, "step": 18520 }, { "epoch": 6.177451634422948, "ref_ce_loss": 0.07933271676301956, "step": 18520 }, { "epoch": 6.177451634422948, "loss": 0.20383426547050476, "step": 18520 }, { "ce_loss": 0.005165026523172855, "epoch": 6.177451634422948, "step": 18520 }, { "distill_loss": 0.12378333508968353, "epoch": 6.177451634422948, "step": 18520 }, { "epoch": 6.177451634422948, "ref_ce_loss": 0.04908745735883713, "step": 18520 }, { "epoch": 6.177451634422948, "loss": 0.25932228565216064, "step": 18520 }, { "ce_loss": 0.04456058144569397, "epoch": 6.177451634422948, "step": 18520 }, { "distill_loss": 0.15556976199150085, "epoch": 6.177451634422948, "step": 18520 }, { "epoch": 6.177451634422948, "ref_ce_loss": 0.059071388095617294, "step": 18520 }, { "epoch": 6.177451634422948, "loss": 0.560404896736145, "step": 18520 }, { "ce_loss": 0.05589801073074341, "epoch": 6.177451634422948, "step": 18520 }, { "distill_loss": 0.2018391489982605, "epoch": 6.177451634422948, "step": 18520 }, { "epoch": 6.177451634422948, "ref_ce_loss": 0.04681461676955223, "step": 18520 }, { "epoch": 6.180787191460974, "loss": 0.3013, "step": 18530 }, { "epoch": 6.180787191460974, "grad_norm": 2.298755407333374, "step": 18530 }, { "epoch": 6.180787191460974, "learning_rate": 3.8877738267554214e-05, "step": 18530 }, { "epoch": 6.180787191460974, "loss": 0.2826690077781677, "step": 18530 }, { "ce_loss": 0.027750806882977486, "epoch": 6.180787191460974, "step": 18530 }, { "distill_loss": 0.09200765192508698, "epoch": 6.180787191460974, "step": 18530 }, { "epoch": 6.180787191460974, "ref_ce_loss": 0.06503093987703323, "step": 18530 }, { "epoch": 6.180787191460974, "loss": 0.326167494058609, "step": 18530 }, { "ce_loss": 0.046595703810453415, "epoch": 6.180787191460974, "step": 18530 }, { "distill_loss": 0.15352821350097656, "epoch": 6.180787191460974, "step": 18530 }, { "epoch": 6.180787191460974, "ref_ce_loss": 0.07511696964502335, "step": 18530 }, { "epoch": 6.180787191460974, "loss": 0.26359185576438904, "step": 18530 }, { "ce_loss": 0.03271767497062683, "epoch": 6.180787191460974, "step": 18530 }, { "distill_loss": 0.1250348687171936, "epoch": 6.180787191460974, "step": 18530 }, { "epoch": 6.180787191460974, "ref_ce_loss": 0.05025747790932655, "step": 18530 }, { "epoch": 6.180787191460974, "loss": 0.2011631727218628, "step": 18530 }, { "ce_loss": 0.005348066333681345, "epoch": 6.180787191460974, "step": 18530 }, { "distill_loss": 0.10876020789146423, "epoch": 6.180787191460974, "step": 18530 }, { "epoch": 6.180787191460974, "ref_ce_loss": 0.04273238778114319, "step": 18530 }, { "epoch": 6.184122748498999, "loss": 0.2862, "step": 18540 }, { "epoch": 6.184122748498999, "grad_norm": 2.2494394779205322, "step": 18540 }, { "epoch": 6.184122748498999, "learning_rate": 3.8741777292669276e-05, "step": 18540 }, { "epoch": 6.184122748498999, "loss": 0.6624394655227661, "step": 18540 }, { "ce_loss": 0.032520171254873276, "epoch": 6.184122748498999, "step": 18540 }, { "distill_loss": 0.18035253882408142, "epoch": 6.184122748498999, "step": 18540 }, { "epoch": 6.184122748498999, "ref_ce_loss": 0.05871713161468506, "step": 18540 }, { "epoch": 6.184122748498999, "loss": 0.2021605521440506, "step": 18540 }, { "ce_loss": 0.024922169744968414, "epoch": 6.184122748498999, "step": 18540 }, { "distill_loss": 0.10917256772518158, "epoch": 6.184122748498999, "step": 18540 }, { "epoch": 6.184122748498999, "ref_ce_loss": 0.06801718473434448, "step": 18540 }, { "epoch": 6.184122748498999, "loss": 0.37601831555366516, "step": 18540 }, { "ce_loss": 0.04148799553513527, "epoch": 6.184122748498999, "step": 18540 }, { "distill_loss": 0.12794163823127747, "epoch": 6.184122748498999, "step": 18540 }, { "epoch": 6.184122748498999, "ref_ce_loss": 0.07430071383714676, "step": 18540 }, { "epoch": 6.184122748498999, "loss": 0.6574068069458008, "step": 18540 }, { "ce_loss": 0.08162372559309006, "epoch": 6.184122748498999, "step": 18540 }, { "distill_loss": 0.2035280168056488, "epoch": 6.184122748498999, "step": 18540 }, { "epoch": 6.184122748498999, "ref_ce_loss": 0.09160168468952179, "step": 18540 }, { "epoch": 6.1874583055370245, "loss": 0.3284, "step": 18550 }, { "epoch": 6.1874583055370245, "grad_norm": 1.9440633058547974, "step": 18550 }, { "epoch": 6.1874583055370245, "learning_rate": 3.8606019208894725e-05, "step": 18550 }, { "epoch": 6.1874583055370245, "loss": 0.3212183117866516, "step": 18550 }, { "ce_loss": 0.09615284949541092, "epoch": 6.1874583055370245, "step": 18550 }, { "distill_loss": 0.13871519267559052, "epoch": 6.1874583055370245, "step": 18550 }, { "epoch": 6.1874583055370245, "ref_ce_loss": 0.05264740064740181, "step": 18550 }, { "epoch": 6.1874583055370245, "loss": 0.7384791374206543, "step": 18550 }, { "ce_loss": 0.011780355125665665, "epoch": 6.1874583055370245, "step": 18550 }, { "distill_loss": 0.11695633828639984, "epoch": 6.1874583055370245, "step": 18550 }, { "epoch": 6.1874583055370245, "ref_ce_loss": 0.07216423749923706, "step": 18550 }, { "epoch": 6.1874583055370245, "loss": 0.35503384470939636, "step": 18550 }, { "ce_loss": 0.015260746702551842, "epoch": 6.1874583055370245, "step": 18550 }, { "distill_loss": 0.12225035578012466, "epoch": 6.1874583055370245, "step": 18550 }, { "epoch": 6.1874583055370245, "ref_ce_loss": 0.040481239557266235, "step": 18550 }, { "epoch": 6.1874583055370245, "loss": 0.3401867151260376, "step": 18550 }, { "ce_loss": 0.03045118972659111, "epoch": 6.1874583055370245, "step": 18550 }, { "distill_loss": 0.13645508885383606, "epoch": 6.1874583055370245, "step": 18550 }, { "epoch": 6.1874583055370245, "ref_ce_loss": 0.033003069460392, "step": 18550 }, { "epoch": 6.19079386257505, "loss": 0.3385, "step": 18560 }, { "epoch": 6.19079386257505, "grad_norm": 2.8384790420532227, "step": 18560 }, { "epoch": 6.19079386257505, "learning_rate": 3.8470464263799824e-05, "step": 18560 }, { "epoch": 6.19079386257505, "loss": 0.20757602155208588, "step": 18560 }, { "ce_loss": 0.021547259762883186, "epoch": 6.19079386257505, "step": 18560 }, { "distill_loss": 0.1290460079908371, "epoch": 6.19079386257505, "step": 18560 }, { "epoch": 6.19079386257505, "ref_ce_loss": 0.03783115744590759, "step": 18560 }, { "epoch": 6.19079386257505, "loss": 0.2212572544813156, "step": 18560 }, { "ce_loss": 0.020918430760502815, "epoch": 6.19079386257505, "step": 18560 }, { "distill_loss": 0.14228373765945435, "epoch": 6.19079386257505, "step": 18560 }, { "epoch": 6.19079386257505, "ref_ce_loss": 0.0429234504699707, "step": 18560 }, { "epoch": 6.19079386257505, "loss": 0.32608312368392944, "step": 18560 }, { "ce_loss": 0.05824834108352661, "epoch": 6.19079386257505, "step": 18560 }, { "distill_loss": 0.12231164425611496, "epoch": 6.19079386257505, "step": 18560 }, { "epoch": 6.19079386257505, "ref_ce_loss": 0.05168101191520691, "step": 18560 }, { "epoch": 6.19079386257505, "loss": 0.3267654776573181, "step": 18560 }, { "ce_loss": 0.03888304904103279, "epoch": 6.19079386257505, "step": 18560 }, { "distill_loss": 0.21609488129615784, "epoch": 6.19079386257505, "step": 18560 }, { "epoch": 6.19079386257505, "ref_ce_loss": 0.07168714702129364, "step": 18560 }, { "epoch": 6.194129419613075, "loss": 0.3106, "step": 18570 }, { "epoch": 6.194129419613075, "grad_norm": 2.127432346343994, "step": 18570 }, { "epoch": 6.194129419613075, "learning_rate": 3.833511270458322e-05, "step": 18570 }, { "epoch": 6.194129419613075, "loss": 0.3519989550113678, "step": 18570 }, { "ce_loss": 0.044699445366859436, "epoch": 6.194129419613075, "step": 18570 }, { "distill_loss": 0.1321500837802887, "epoch": 6.194129419613075, "step": 18570 }, { "epoch": 6.194129419613075, "ref_ce_loss": 0.09541293233633041, "step": 18570 }, { "epoch": 6.194129419613075, "loss": 0.38467326760292053, "step": 18570 }, { "ce_loss": 0.11054366827011108, "epoch": 6.194129419613075, "step": 18570 }, { "distill_loss": 0.18650639057159424, "epoch": 6.194129419613075, "step": 18570 }, { "epoch": 6.194129419613075, "ref_ce_loss": 0.06626542657613754, "step": 18570 }, { "epoch": 6.194129419613075, "loss": 0.19615034759044647, "step": 18570 }, { "ce_loss": 0.03159404918551445, "epoch": 6.194129419613075, "step": 18570 }, { "distill_loss": 0.10044976323843002, "epoch": 6.194129419613075, "step": 18570 }, { "epoch": 6.194129419613075, "ref_ce_loss": 0.04699920117855072, "step": 18570 }, { "epoch": 6.194129419613075, "loss": 0.42845267057418823, "step": 18570 }, { "ce_loss": 0.10839418321847916, "epoch": 6.194129419613075, "step": 18570 }, { "distill_loss": 0.2222888171672821, "epoch": 6.194129419613075, "step": 18570 }, { "epoch": 6.194129419613075, "ref_ce_loss": 0.09751333296298981, "step": 18570 }, { "epoch": 6.1974649766511005, "loss": 0.3156, "step": 18580 }, { "epoch": 6.1974649766511005, "grad_norm": 5.873178482055664, "step": 18580 }, { "epoch": 6.1974649766511005, "learning_rate": 3.819996477807288e-05, "step": 18580 }, { "epoch": 6.1974649766511005, "loss": 0.38896477222442627, "step": 18580 }, { "ce_loss": 0.003704048926010728, "epoch": 6.1974649766511005, "step": 18580 }, { "distill_loss": 0.1404985636472702, "epoch": 6.1974649766511005, "step": 18580 }, { "epoch": 6.1974649766511005, "ref_ce_loss": 0.06484709680080414, "step": 18580 }, { "epoch": 6.1974649766511005, "loss": 0.23296181857585907, "step": 18580 }, { "ce_loss": 0.028243107721209526, "epoch": 6.1974649766511005, "step": 18580 }, { "distill_loss": 0.1301645040512085, "epoch": 6.1974649766511005, "step": 18580 }, { "epoch": 6.1974649766511005, "ref_ce_loss": 0.05618060752749443, "step": 18580 }, { "epoch": 6.1974649766511005, "loss": 0.3045993745326996, "step": 18580 }, { "ce_loss": 0.010499167256057262, "epoch": 6.1974649766511005, "step": 18580 }, { "distill_loss": 0.13495343923568726, "epoch": 6.1974649766511005, "step": 18580 }, { "epoch": 6.1974649766511005, "ref_ce_loss": 0.08178295940160751, "step": 18580 }, { "epoch": 6.1974649766511005, "loss": 0.19076423346996307, "step": 18580 }, { "ce_loss": 0.014703667722642422, "epoch": 6.1974649766511005, "step": 18580 }, { "distill_loss": 0.11374014616012573, "epoch": 6.1974649766511005, "step": 18580 }, { "epoch": 6.1974649766511005, "ref_ce_loss": 0.062148548662662506, "step": 18580 }, { "epoch": 6.200800533689126, "loss": 0.3399, "step": 18590 }, { "epoch": 6.200800533689126, "grad_norm": 2.2601478099823, "step": 18590 }, { "epoch": 6.200800533689126, "learning_rate": 3.8065020730725305e-05, "step": 18590 }, { "epoch": 6.200800533689126, "loss": 0.3093893826007843, "step": 18590 }, { "ce_loss": 0.033915240317583084, "epoch": 6.200800533689126, "step": 18590 }, { "distill_loss": 0.14568378031253815, "epoch": 6.200800533689126, "step": 18590 }, { "epoch": 6.200800533689126, "ref_ce_loss": 0.06362231075763702, "step": 18590 }, { "epoch": 6.200800533689126, "loss": 0.2616564631462097, "step": 18590 }, { "ce_loss": 0.08007301390171051, "epoch": 6.200800533689126, "step": 18590 }, { "distill_loss": 0.1228436678647995, "epoch": 6.200800533689126, "step": 18590 }, { "epoch": 6.200800533689126, "ref_ce_loss": 0.04259605333209038, "step": 18590 }, { "epoch": 6.200800533689126, "loss": 0.29220378398895264, "step": 18590 }, { "ce_loss": 0.06992803514003754, "epoch": 6.200800533689126, "step": 18590 }, { "distill_loss": 0.13908424973487854, "epoch": 6.200800533689126, "step": 18590 }, { "epoch": 6.200800533689126, "ref_ce_loss": 0.06554904580116272, "step": 18590 }, { "epoch": 6.200800533689126, "loss": 0.4914194345474243, "step": 18590 }, { "ce_loss": 0.04688086733222008, "epoch": 6.200800533689126, "step": 18590 }, { "distill_loss": 0.23791128396987915, "epoch": 6.200800533689126, "step": 18590 }, { "epoch": 6.200800533689126, "ref_ce_loss": 0.07891146838665009, "step": 18590 }, { "epoch": 6.204136090727151, "loss": 0.3287, "step": 18600 }, { "epoch": 6.204136090727151, "grad_norm": 2.0706448554992676, "step": 18600 }, { "epoch": 6.204136090727151, "learning_rate": 3.7930280808625136e-05, "step": 18600 }, { "epoch": 6.204136090727151, "loss": 0.14727945625782013, "step": 18600 }, { "ce_loss": 0.011500568129122257, "epoch": 6.204136090727151, "step": 18600 }, { "distill_loss": 0.07382229715585709, "epoch": 6.204136090727151, "step": 18600 }, { "epoch": 6.204136090727151, "ref_ce_loss": 0.04233421012759209, "step": 18600 }, { "epoch": 6.204136090727151, "loss": 0.6932018399238586, "step": 18600 }, { "ce_loss": 0.013805784285068512, "epoch": 6.204136090727151, "step": 18600 }, { "distill_loss": 0.11833083629608154, "epoch": 6.204136090727151, "step": 18600 }, { "epoch": 6.204136090727151, "ref_ce_loss": 0.06745176017284393, "step": 18600 }, { "epoch": 6.204136090727151, "loss": 0.2108478993177414, "step": 18600 }, { "ce_loss": 0.043616849929094315, "epoch": 6.204136090727151, "step": 18600 }, { "distill_loss": 0.11201032251119614, "epoch": 6.204136090727151, "step": 18600 }, { "epoch": 6.204136090727151, "ref_ce_loss": 0.05514965206384659, "step": 18600 }, { "epoch": 6.204136090727151, "loss": 0.27369120717048645, "step": 18600 }, { "ce_loss": 0.027406323701143265, "epoch": 6.204136090727151, "step": 18600 }, { "distill_loss": 0.13315467536449432, "epoch": 6.204136090727151, "step": 18600 }, { "epoch": 6.204136090727151, "ref_ce_loss": 0.07010474056005478, "step": 18600 }, { "epoch": 6.207471647765177, "loss": 0.3429, "step": 18610 }, { "epoch": 6.207471647765177, "grad_norm": 2.937699556350708, "step": 18610 }, { "epoch": 6.207471647765177, "learning_rate": 3.7795745257484875e-05, "step": 18610 }, { "epoch": 6.207471647765177, "loss": 0.279723584651947, "step": 18610 }, { "ce_loss": 0.022853851318359375, "epoch": 6.207471647765177, "step": 18610 }, { "distill_loss": 0.11497896909713745, "epoch": 6.207471647765177, "step": 18610 }, { "epoch": 6.207471647765177, "ref_ce_loss": 0.05497308075428009, "step": 18610 }, { "epoch": 6.207471647765177, "loss": 0.23577311635017395, "step": 18610 }, { "ce_loss": 0.0342148058116436, "epoch": 6.207471647765177, "step": 18610 }, { "distill_loss": 0.09748812764883041, "epoch": 6.207471647765177, "step": 18610 }, { "epoch": 6.207471647765177, "ref_ce_loss": 0.04478719085454941, "step": 18610 }, { "epoch": 6.207471647765177, "loss": 0.20509502291679382, "step": 18610 }, { "ce_loss": 0.0038550468161702156, "epoch": 6.207471647765177, "step": 18610 }, { "distill_loss": 0.1022908017039299, "epoch": 6.207471647765177, "step": 18610 }, { "epoch": 6.207471647765177, "ref_ce_loss": 0.03298857808113098, "step": 18610 }, { "epoch": 6.207471647765177, "loss": 0.28083717823028564, "step": 18610 }, { "ce_loss": 0.02645295299589634, "epoch": 6.207471647765177, "step": 18610 }, { "distill_loss": 0.10292775928974152, "epoch": 6.207471647765177, "step": 18610 }, { "epoch": 6.207471647765177, "ref_ce_loss": 0.050342313945293427, "step": 18610 }, { "epoch": 6.210807204803202, "loss": 0.3456, "step": 18620 }, { "epoch": 6.210807204803202, "grad_norm": 2.887526273727417, "step": 18620 }, { "epoch": 6.210807204803202, "learning_rate": 3.7661414322644326e-05, "step": 18620 }, { "epoch": 6.210807204803202, "loss": 0.26322484016418457, "step": 18620 }, { "ce_loss": 0.05786026641726494, "epoch": 6.210807204803202, "step": 18620 }, { "distill_loss": 0.11787180602550507, "epoch": 6.210807204803202, "step": 18620 }, { "epoch": 6.210807204803202, "ref_ce_loss": 0.0689215213060379, "step": 18620 }, { "epoch": 6.210807204803202, "loss": 0.571141242980957, "step": 18620 }, { "ce_loss": 0.05233941599726677, "epoch": 6.210807204803202, "step": 18620 }, { "distill_loss": 0.14902040362358093, "epoch": 6.210807204803202, "step": 18620 }, { "epoch": 6.210807204803202, "ref_ce_loss": 0.06474089622497559, "step": 18620 }, { "epoch": 6.210807204803202, "loss": 0.4037763774394989, "step": 18620 }, { "ce_loss": 0.05231030285358429, "epoch": 6.210807204803202, "step": 18620 }, { "distill_loss": 0.18944039940834045, "epoch": 6.210807204803202, "step": 18620 }, { "epoch": 6.210807204803202, "ref_ce_loss": 0.0811677798628807, "step": 18620 }, { "epoch": 6.210807204803202, "loss": 0.17138558626174927, "step": 18620 }, { "ce_loss": 0.0197373665869236, "epoch": 6.210807204803202, "step": 18620 }, { "distill_loss": 0.08096668869256973, "epoch": 6.210807204803202, "step": 18620 }, { "epoch": 6.210807204803202, "ref_ce_loss": 0.052383799105882645, "step": 18620 }, { "epoch": 6.214142761841227, "loss": 0.3277, "step": 18630 }, { "epoch": 6.214142761841227, "grad_norm": 2.4172303676605225, "step": 18630 }, { "epoch": 6.214142761841227, "learning_rate": 3.7527288249070034e-05, "step": 18630 }, { "epoch": 6.214142761841227, "loss": 0.38156095147132874, "step": 18630 }, { "ce_loss": 0.1050412729382515, "epoch": 6.214142761841227, "step": 18630 }, { "distill_loss": 0.13883177936077118, "epoch": 6.214142761841227, "step": 18630 }, { "epoch": 6.214142761841227, "ref_ce_loss": 0.05640149861574173, "step": 18630 }, { "epoch": 6.214142761841227, "loss": 0.455706387758255, "step": 18630 }, { "ce_loss": 0.057640500366687775, "epoch": 6.214142761841227, "step": 18630 }, { "distill_loss": 0.13849633932113647, "epoch": 6.214142761841227, "step": 18630 }, { "epoch": 6.214142761841227, "ref_ce_loss": 0.05576207861304283, "step": 18630 }, { "epoch": 6.214142761841227, "loss": 0.2372320145368576, "step": 18630 }, { "ce_loss": 0.04792383685708046, "epoch": 6.214142761841227, "step": 18630 }, { "distill_loss": 0.11418038606643677, "epoch": 6.214142761841227, "step": 18630 }, { "epoch": 6.214142761841227, "ref_ce_loss": 0.05012698099017143, "step": 18630 }, { "epoch": 6.214142761841227, "loss": 0.23415352404117584, "step": 18630 }, { "ce_loss": 0.04273424670100212, "epoch": 6.214142761841227, "step": 18630 }, { "distill_loss": 0.12337204068899155, "epoch": 6.214142761841227, "step": 18630 }, { "epoch": 6.214142761841227, "ref_ce_loss": 0.05350811779499054, "step": 18630 }, { "epoch": 6.217478318879253, "loss": 0.3443, "step": 18640 }, { "epoch": 6.217478318879253, "grad_norm": 3.1628241539001465, "step": 18640 }, { "epoch": 6.217478318879253, "learning_rate": 3.739336728135519e-05, "step": 18640 }, { "epoch": 6.217478318879253, "loss": 0.22019442915916443, "step": 18640 }, { "ce_loss": 0.012670139782130718, "epoch": 6.217478318879253, "step": 18640 }, { "distill_loss": 0.11298336833715439, "epoch": 6.217478318879253, "step": 18640 }, { "epoch": 6.217478318879253, "ref_ce_loss": 0.06662783771753311, "step": 18640 }, { "epoch": 6.217478318879253, "loss": 0.3011285960674286, "step": 18640 }, { "ce_loss": 0.05775625631213188, "epoch": 6.217478318879253, "step": 18640 }, { "distill_loss": 0.15119341015815735, "epoch": 6.217478318879253, "step": 18640 }, { "epoch": 6.217478318879253, "ref_ce_loss": 0.04562750086188316, "step": 18640 }, { "epoch": 6.217478318879253, "loss": 0.3526672124862671, "step": 18640 }, { "ce_loss": 0.028817400336265564, "epoch": 6.217478318879253, "step": 18640 }, { "distill_loss": 0.11797647178173065, "epoch": 6.217478318879253, "step": 18640 }, { "epoch": 6.217478318879253, "ref_ce_loss": 0.052301883697509766, "step": 18640 }, { "epoch": 6.217478318879253, "loss": 0.3002309203147888, "step": 18640 }, { "ce_loss": 0.016758514568209648, "epoch": 6.217478318879253, "step": 18640 }, { "distill_loss": 0.15333794057369232, "epoch": 6.217478318879253, "step": 18640 }, { "epoch": 6.217478318879253, "ref_ce_loss": 0.05648922920227051, "step": 18640 }, { "epoch": 6.220813875917278, "loss": 0.3011, "step": 18650 }, { "epoch": 6.220813875917278, "grad_norm": 2.3673858642578125, "step": 18650 }, { "epoch": 6.220813875917278, "learning_rate": 3.7259651663718684e-05, "step": 18650 }, { "epoch": 6.220813875917278, "loss": 0.3125011622905731, "step": 18650 }, { "ce_loss": 0.046617768704891205, "epoch": 6.220813875917278, "step": 18650 }, { "distill_loss": 0.1842920333147049, "epoch": 6.220813875917278, "step": 18650 }, { "epoch": 6.220813875917278, "ref_ce_loss": 0.08146623522043228, "step": 18650 }, { "epoch": 6.220813875917278, "loss": 0.3332507312297821, "step": 18650 }, { "ce_loss": 0.029593532904982567, "epoch": 6.220813875917278, "step": 18650 }, { "distill_loss": 0.15803472697734833, "epoch": 6.220813875917278, "step": 18650 }, { "epoch": 6.220813875917278, "ref_ce_loss": 0.053554873913526535, "step": 18650 }, { "epoch": 6.220813875917278, "loss": 0.43323659896850586, "step": 18650 }, { "ce_loss": 0.021801965311169624, "epoch": 6.220813875917278, "step": 18650 }, { "distill_loss": 0.18702299892902374, "epoch": 6.220813875917278, "step": 18650 }, { "epoch": 6.220813875917278, "ref_ce_loss": 0.06735753268003464, "step": 18650 }, { "epoch": 6.220813875917278, "loss": 0.38802433013916016, "step": 18650 }, { "ce_loss": 0.06311264634132385, "epoch": 6.220813875917278, "step": 18650 }, { "distill_loss": 0.16197697818279266, "epoch": 6.220813875917278, "step": 18650 }, { "epoch": 6.220813875917278, "ref_ce_loss": 0.09785512834787369, "step": 18650 }, { "epoch": 6.224149432955303, "loss": 0.3536, "step": 18660 }, { "epoch": 6.224149432955303, "grad_norm": 2.0434508323669434, "step": 18660 }, { "epoch": 6.224149432955303, "learning_rate": 3.71261416400051e-05, "step": 18660 }, { "epoch": 6.224149432955303, "loss": 0.19896145164966583, "step": 18660 }, { "ce_loss": 0.021050764247775078, "epoch": 6.224149432955303, "step": 18660 }, { "distill_loss": 0.12107603251934052, "epoch": 6.224149432955303, "step": 18660 }, { "epoch": 6.224149432955303, "ref_ce_loss": 0.03518590331077576, "step": 18660 }, { "epoch": 6.224149432955303, "loss": 0.342231810092926, "step": 18660 }, { "ce_loss": 0.04348944127559662, "epoch": 6.224149432955303, "step": 18660 }, { "distill_loss": 0.17786476016044617, "epoch": 6.224149432955303, "step": 18660 }, { "epoch": 6.224149432955303, "ref_ce_loss": 0.07756116986274719, "step": 18660 }, { "epoch": 6.224149432955303, "loss": 0.22609391808509827, "step": 18660 }, { "ce_loss": 0.04094768688082695, "epoch": 6.224149432955303, "step": 18660 }, { "distill_loss": 0.1311914473772049, "epoch": 6.224149432955303, "step": 18660 }, { "epoch": 6.224149432955303, "ref_ce_loss": 0.028837168589234352, "step": 18660 }, { "epoch": 6.224149432955303, "loss": 0.4177436828613281, "step": 18660 }, { "ce_loss": 0.1019946038722992, "epoch": 6.224149432955303, "step": 18660 }, { "distill_loss": 0.1752627044916153, "epoch": 6.224149432955303, "step": 18660 }, { "epoch": 6.224149432955303, "ref_ce_loss": 0.0595175102353096, "step": 18660 }, { "epoch": 6.227484989993329, "loss": 0.3315, "step": 18670 }, { "epoch": 6.227484989993329, "grad_norm": 2.711292028427124, "step": 18670 }, { "epoch": 6.227484989993329, "learning_rate": 3.699283745368412e-05, "step": 18670 }, { "epoch": 6.227484989993329, "loss": 0.30362796783447266, "step": 18670 }, { "ce_loss": 0.0400133915245533, "epoch": 6.227484989993329, "step": 18670 }, { "distill_loss": 0.1721460521221161, "epoch": 6.227484989993329, "step": 18670 }, { "epoch": 6.227484989993329, "ref_ce_loss": 0.0534956268966198, "step": 18670 }, { "epoch": 6.227484989993329, "loss": 0.2800109386444092, "step": 18670 }, { "ce_loss": 0.011257833801209927, "epoch": 6.227484989993329, "step": 18670 }, { "distill_loss": 0.12428543716669083, "epoch": 6.227484989993329, "step": 18670 }, { "epoch": 6.227484989993329, "ref_ce_loss": 0.05313796177506447, "step": 18670 }, { "epoch": 6.227484989993329, "loss": 0.1609666347503662, "step": 18670 }, { "ce_loss": 0.02284008264541626, "epoch": 6.227484989993329, "step": 18670 }, { "distill_loss": 0.07858970016241074, "epoch": 6.227484989993329, "step": 18670 }, { "epoch": 6.227484989993329, "ref_ce_loss": 0.02820756286382675, "step": 18670 }, { "epoch": 6.227484989993329, "loss": 0.20992793142795563, "step": 18670 }, { "ce_loss": 0.028739934787154198, "epoch": 6.227484989993329, "step": 18670 }, { "distill_loss": 0.11657550185918808, "epoch": 6.227484989993329, "step": 18670 }, { "epoch": 6.227484989993329, "ref_ce_loss": 0.06436599791049957, "step": 18670 }, { "epoch": 6.230820547031354, "loss": 0.2938, "step": 18680 }, { "epoch": 6.230820547031354, "grad_norm": 2.294095277786255, "step": 18680 }, { "epoch": 6.230820547031354, "learning_rate": 3.6859739347849884e-05, "step": 18680 }, { "epoch": 6.230820547031354, "loss": 0.3429484963417053, "step": 18680 }, { "ce_loss": 0.06927967071533203, "epoch": 6.230820547031354, "step": 18680 }, { "distill_loss": 0.17625319957733154, "epoch": 6.230820547031354, "step": 18680 }, { "epoch": 6.230820547031354, "ref_ce_loss": 0.07992861419916153, "step": 18680 }, { "epoch": 6.230820547031354, "loss": 0.4963828921318054, "step": 18680 }, { "ce_loss": 0.053090523928403854, "epoch": 6.230820547031354, "step": 18680 }, { "distill_loss": 0.32863304018974304, "epoch": 6.230820547031354, "step": 18680 }, { "epoch": 6.230820547031354, "ref_ce_loss": 0.08859732747077942, "step": 18680 }, { "epoch": 6.230820547031354, "loss": 0.4164791703224182, "step": 18680 }, { "ce_loss": 0.05513370782136917, "epoch": 6.230820547031354, "step": 18680 }, { "distill_loss": 0.2739354074001312, "epoch": 6.230820547031354, "step": 18680 }, { "epoch": 6.230820547031354, "ref_ce_loss": 0.07069216668605804, "step": 18680 }, { "epoch": 6.230820547031354, "loss": 0.1833503097295761, "step": 18680 }, { "ce_loss": 0.013990293256938457, "epoch": 6.230820547031354, "step": 18680 }, { "distill_loss": 0.11608105152845383, "epoch": 6.230820547031354, "step": 18680 }, { "epoch": 6.230820547031354, "ref_ce_loss": 0.05309043079614639, "step": 18680 }, { "epoch": 6.234156104069379, "loss": 0.3561, "step": 18690 }, { "epoch": 6.234156104069379, "grad_norm": 2.279000759124756, "step": 18690 }, { "epoch": 6.234156104069379, "learning_rate": 3.6726847565220895e-05, "step": 18690 }, { "epoch": 6.234156104069379, "loss": 0.25862422585487366, "step": 18690 }, { "ce_loss": 0.022512095049023628, "epoch": 6.234156104069379, "step": 18690 }, { "distill_loss": 0.1802578717470169, "epoch": 6.234156104069379, "step": 18690 }, { "epoch": 6.234156104069379, "ref_ce_loss": 0.055745966732501984, "step": 18690 }, { "epoch": 6.234156104069379, "loss": 0.18684840202331543, "step": 18690 }, { "ce_loss": 0.015279823914170265, "epoch": 6.234156104069379, "step": 18690 }, { "distill_loss": 0.09220601618289948, "epoch": 6.234156104069379, "step": 18690 }, { "epoch": 6.234156104069379, "ref_ce_loss": 0.05309228226542473, "step": 18690 }, { "epoch": 6.234156104069379, "loss": 0.23141039907932281, "step": 18690 }, { "ce_loss": 0.030733682215213776, "epoch": 6.234156104069379, "step": 18690 }, { "distill_loss": 0.1341591477394104, "epoch": 6.234156104069379, "step": 18690 }, { "epoch": 6.234156104069379, "ref_ce_loss": 0.04584375396370888, "step": 18690 }, { "epoch": 6.234156104069379, "loss": 0.43140465021133423, "step": 18690 }, { "ce_loss": 0.019533297047019005, "epoch": 6.234156104069379, "step": 18690 }, { "distill_loss": 0.1933666169643402, "epoch": 6.234156104069379, "step": 18690 }, { "epoch": 6.234156104069379, "ref_ce_loss": 0.08564330637454987, "step": 18690 }, { "epoch": 6.237491661107405, "loss": 0.3111, "step": 18700 }, { "epoch": 6.237491661107405, "grad_norm": 5.9803466796875, "step": 18700 }, { "epoch": 6.237491661107405, "learning_rate": 3.659416234813932e-05, "step": 18700 }, { "epoch": 6.237491661107405, "loss": 0.22904738783836365, "step": 18700 }, { "ce_loss": 0.015067033469676971, "epoch": 6.237491661107405, "step": 18700 }, { "distill_loss": 0.12943653762340546, "epoch": 6.237491661107405, "step": 18700 }, { "epoch": 6.237491661107405, "ref_ce_loss": 0.04217130318284035, "step": 18700 }, { "epoch": 6.237491661107405, "loss": 0.22289657592773438, "step": 18700 }, { "ce_loss": 0.016781846061348915, "epoch": 6.237491661107405, "step": 18700 }, { "distill_loss": 0.1689828336238861, "epoch": 6.237491661107405, "step": 18700 }, { "epoch": 6.237491661107405, "ref_ce_loss": 0.03697735071182251, "step": 18700 }, { "epoch": 6.237491661107405, "loss": 0.534015417098999, "step": 18700 }, { "ce_loss": 0.08496256917715073, "epoch": 6.237491661107405, "step": 18700 }, { "distill_loss": 0.2338809370994568, "epoch": 6.237491661107405, "step": 18700 }, { "epoch": 6.237491661107405, "ref_ce_loss": 0.06475655734539032, "step": 18700 }, { "epoch": 6.237491661107405, "loss": 0.27168723940849304, "step": 18700 }, { "ce_loss": 0.030710462480783463, "epoch": 6.237491661107405, "step": 18700 }, { "distill_loss": 0.17783647775650024, "epoch": 6.237491661107405, "step": 18700 }, { "epoch": 6.237491661107405, "ref_ce_loss": 0.05239542946219444, "step": 18700 }, { "epoch": 6.24082721814543, "loss": 0.3582, "step": 18710 }, { "epoch": 6.24082721814543, "grad_norm": 2.7943332195281982, "step": 18710 }, { "epoch": 6.24082721814543, "learning_rate": 3.64616839385707e-05, "step": 18710 }, { "epoch": 6.24082721814543, "loss": 0.3018385171890259, "step": 18710 }, { "ce_loss": 0.007486680056899786, "epoch": 6.24082721814543, "step": 18710 }, { "distill_loss": 0.13992555439472198, "epoch": 6.24082721814543, "step": 18710 }, { "epoch": 6.24082721814543, "ref_ce_loss": 0.057693563401699066, "step": 18710 }, { "epoch": 6.24082721814543, "loss": 0.48539677262306213, "step": 18710 }, { "ce_loss": 0.06905413419008255, "epoch": 6.24082721814543, "step": 18710 }, { "distill_loss": 0.18162912130355835, "epoch": 6.24082721814543, "step": 18710 }, { "epoch": 6.24082721814543, "ref_ce_loss": 0.05399453267455101, "step": 18710 }, { "epoch": 6.24082721814543, "loss": 0.24727633595466614, "step": 18710 }, { "ce_loss": 0.022812293842434883, "epoch": 6.24082721814543, "step": 18710 }, { "distill_loss": 0.11789606511592865, "epoch": 6.24082721814543, "step": 18710 }, { "epoch": 6.24082721814543, "ref_ce_loss": 0.029683204367756844, "step": 18710 }, { "epoch": 6.24082721814543, "loss": 0.30093979835510254, "step": 18710 }, { "ce_loss": 0.022619565948843956, "epoch": 6.24082721814543, "step": 18710 }, { "distill_loss": 0.20535236597061157, "epoch": 6.24082721814543, "step": 18710 }, { "epoch": 6.24082721814543, "ref_ce_loss": 0.05883384123444557, "step": 18710 }, { "epoch": 6.244162775183455, "loss": 0.323, "step": 18720 }, { "epoch": 6.244162775183455, "grad_norm": 2.1610376834869385, "step": 18720 }, { "epoch": 6.244162775183455, "learning_rate": 3.6329412578103386e-05, "step": 18720 }, { "epoch": 6.244162775183455, "loss": 0.2980462312698364, "step": 18720 }, { "ce_loss": 0.02790931798517704, "epoch": 6.244162775183455, "step": 18720 }, { "distill_loss": 0.15959838032722473, "epoch": 6.244162775183455, "step": 18720 }, { "epoch": 6.244162775183455, "ref_ce_loss": 0.08582749217748642, "step": 18720 }, { "epoch": 6.244162775183455, "loss": 0.28372853994369507, "step": 18720 }, { "ce_loss": 0.05045516416430473, "epoch": 6.244162775183455, "step": 18720 }, { "distill_loss": 0.15782198309898376, "epoch": 6.244162775183455, "step": 18720 }, { "epoch": 6.244162775183455, "ref_ce_loss": 0.058288849890232086, "step": 18720 }, { "epoch": 6.244162775183455, "loss": 0.3082246482372284, "step": 18720 }, { "ce_loss": 0.04321841150522232, "epoch": 6.244162775183455, "step": 18720 }, { "distill_loss": 0.15502133965492249, "epoch": 6.244162775183455, "step": 18720 }, { "epoch": 6.244162775183455, "ref_ce_loss": 0.04743940383195877, "step": 18720 }, { "epoch": 6.244162775183455, "loss": 0.3014211058616638, "step": 18720 }, { "ce_loss": 0.05305873975157738, "epoch": 6.244162775183455, "step": 18720 }, { "distill_loss": 0.14042118191719055, "epoch": 6.244162775183455, "step": 18720 }, { "epoch": 6.244162775183455, "ref_ce_loss": 0.06918078660964966, "step": 18720 }, { "epoch": 6.247498332221481, "loss": 0.3093, "step": 18730 }, { "epoch": 6.247498332221481, "grad_norm": 2.835282325744629, "step": 18730 }, { "epoch": 6.247498332221481, "learning_rate": 3.6197348507948085e-05, "step": 18730 }, { "epoch": 6.247498332221481, "loss": 0.389096736907959, "step": 18730 }, { "ce_loss": 0.11738903820514679, "epoch": 6.247498332221481, "step": 18730 }, { "distill_loss": 0.1438477784395218, "epoch": 6.247498332221481, "step": 18730 }, { "epoch": 6.247498332221481, "ref_ce_loss": 0.06399208307266235, "step": 18730 }, { "epoch": 6.247498332221481, "loss": 0.2826199531555176, "step": 18730 }, { "ce_loss": 0.020239872857928276, "epoch": 6.247498332221481, "step": 18730 }, { "distill_loss": 0.1740793138742447, "epoch": 6.247498332221481, "step": 18730 }, { "epoch": 6.247498332221481, "ref_ce_loss": 0.06345031410455704, "step": 18730 }, { "epoch": 6.247498332221481, "loss": 0.49008870124816895, "step": 18730 }, { "ce_loss": 0.10059117525815964, "epoch": 6.247498332221481, "step": 18730 }, { "distill_loss": 0.13844041526317596, "epoch": 6.247498332221481, "step": 18730 }, { "epoch": 6.247498332221481, "ref_ce_loss": 0.07132607698440552, "step": 18730 }, { "epoch": 6.247498332221481, "loss": 0.2719220817089081, "step": 18730 }, { "ce_loss": 0.05062330514192581, "epoch": 6.247498332221481, "step": 18730 }, { "distill_loss": 0.1395432949066162, "epoch": 6.247498332221481, "step": 18730 }, { "epoch": 6.247498332221481, "ref_ce_loss": 0.05693845823407173, "step": 18730 }, { "epoch": 6.250833889259506, "loss": 0.3179, "step": 18740 }, { "epoch": 6.250833889259506, "grad_norm": 2.3968071937561035, "step": 18740 }, { "epoch": 6.250833889259506, "learning_rate": 3.606549196893764e-05, "step": 18740 }, { "epoch": 6.250833889259506, "loss": 0.35895127058029175, "step": 18740 }, { "ce_loss": 0.03985409811139107, "epoch": 6.250833889259506, "step": 18740 }, { "distill_loss": 0.12473776191473007, "epoch": 6.250833889259506, "step": 18740 }, { "epoch": 6.250833889259506, "ref_ce_loss": 0.03933054581284523, "step": 18740 }, { "epoch": 6.250833889259506, "loss": 0.4552100896835327, "step": 18740 }, { "ce_loss": 0.0754159539937973, "epoch": 6.250833889259506, "step": 18740 }, { "distill_loss": 0.16453154385089874, "epoch": 6.250833889259506, "step": 18740 }, { "epoch": 6.250833889259506, "ref_ce_loss": 0.06217661872506142, "step": 18740 }, { "epoch": 6.250833889259506, "loss": 0.2528969347476959, "step": 18740 }, { "ce_loss": 0.049444712698459625, "epoch": 6.250833889259506, "step": 18740 }, { "distill_loss": 0.12250962853431702, "epoch": 6.250833889259506, "step": 18740 }, { "epoch": 6.250833889259506, "ref_ce_loss": 0.05724600329995155, "step": 18740 }, { "epoch": 6.250833889259506, "loss": 0.23986639082431793, "step": 18740 }, { "ce_loss": 0.02679796703159809, "epoch": 6.250833889259506, "step": 18740 }, { "distill_loss": 0.15213319659233093, "epoch": 6.250833889259506, "step": 18740 }, { "epoch": 6.250833889259506, "ref_ce_loss": 0.06067092344164848, "step": 18740 }, { "epoch": 6.2541694462975315, "loss": 0.35, "step": 18750 }, { "epoch": 6.2541694462975315, "grad_norm": 3.7008721828460693, "step": 18750 }, { "epoch": 6.2541694462975315, "learning_rate": 3.593384320152636e-05, "step": 18750 }, { "epoch": 6.2541694462975315, "loss": 0.25133639574050903, "step": 18750 }, { "ce_loss": 0.040532004088163376, "epoch": 6.2541694462975315, "step": 18750 }, { "distill_loss": 0.13248834013938904, "epoch": 6.2541694462975315, "step": 18750 }, { "epoch": 6.2541694462975315, "ref_ce_loss": 0.04700193554162979, "step": 18750 }, { "epoch": 6.2541694462975315, "loss": 0.3965201675891876, "step": 18750 }, { "ce_loss": 0.07706707715988159, "epoch": 6.2541694462975315, "step": 18750 }, { "distill_loss": 0.21770936250686646, "epoch": 6.2541694462975315, "step": 18750 }, { "epoch": 6.2541694462975315, "ref_ce_loss": 0.05477938801050186, "step": 18750 }, { "epoch": 6.2541694462975315, "loss": 0.5275173187255859, "step": 18750 }, { "ce_loss": 0.10376562178134918, "epoch": 6.2541694462975315, "step": 18750 }, { "distill_loss": 0.18406124413013458, "epoch": 6.2541694462975315, "step": 18750 }, { "epoch": 6.2541694462975315, "ref_ce_loss": 0.11001072824001312, "step": 18750 }, { "epoch": 6.2541694462975315, "loss": 0.3048272728919983, "step": 18750 }, { "ce_loss": 0.02338281460106373, "epoch": 6.2541694462975315, "step": 18750 }, { "distill_loss": 0.15202832221984863, "epoch": 6.2541694462975315, "step": 18750 }, { "epoch": 6.2541694462975315, "ref_ce_loss": 0.08240233361721039, "step": 18750 }, { "epoch": 6.257505003335557, "loss": 0.3237, "step": 18760 }, { "epoch": 6.257505003335557, "grad_norm": 2.6808176040649414, "step": 18760 }, { "epoch": 6.257505003335557, "learning_rate": 3.5802402445789625e-05, "step": 18760 }, { "epoch": 6.257505003335557, "loss": 0.19271326065063477, "step": 18760 }, { "ce_loss": 0.020790955051779747, "epoch": 6.257505003335557, "step": 18760 }, { "distill_loss": 0.11469073593616486, "epoch": 6.257505003335557, "step": 18760 }, { "epoch": 6.257505003335557, "ref_ce_loss": 0.03989434987306595, "step": 18760 }, { "epoch": 6.257505003335557, "loss": 0.43920618295669556, "step": 18760 }, { "ce_loss": 0.061031486839056015, "epoch": 6.257505003335557, "step": 18760 }, { "distill_loss": 0.16793349385261536, "epoch": 6.257505003335557, "step": 18760 }, { "epoch": 6.257505003335557, "ref_ce_loss": 0.08771173655986786, "step": 18760 }, { "epoch": 6.257505003335557, "loss": 0.33124980330467224, "step": 18760 }, { "ce_loss": 0.042613252997398376, "epoch": 6.257505003335557, "step": 18760 }, { "distill_loss": 0.1423777937889099, "epoch": 6.257505003335557, "step": 18760 }, { "epoch": 6.257505003335557, "ref_ce_loss": 0.09082692116498947, "step": 18760 }, { "epoch": 6.257505003335557, "loss": 0.47902119159698486, "step": 18760 }, { "ce_loss": 0.044841740280389786, "epoch": 6.257505003335557, "step": 18760 }, { "distill_loss": 0.1492442935705185, "epoch": 6.257505003335557, "step": 18760 }, { "epoch": 6.257505003335557, "ref_ce_loss": 0.07554224133491516, "step": 18760 }, { "epoch": 6.260840560373582, "loss": 0.3518, "step": 18770 }, { "epoch": 6.260840560373582, "grad_norm": 3.183990716934204, "step": 18770 }, { "epoch": 6.260840560373582, "learning_rate": 3.567116994142362e-05, "step": 18770 }, { "epoch": 6.260840560373582, "loss": 0.20660609006881714, "step": 18770 }, { "ce_loss": 0.015359489247202873, "epoch": 6.260840560373582, "step": 18770 }, { "distill_loss": 0.1334400177001953, "epoch": 6.260840560373582, "step": 18770 }, { "epoch": 6.260840560373582, "ref_ce_loss": 0.05756957083940506, "step": 18770 }, { "epoch": 6.260840560373582, "loss": 0.26321983337402344, "step": 18770 }, { "ce_loss": 0.022753337398171425, "epoch": 6.260840560373582, "step": 18770 }, { "distill_loss": 0.1181974858045578, "epoch": 6.260840560373582, "step": 18770 }, { "epoch": 6.260840560373582, "ref_ce_loss": 0.05340166762471199, "step": 18770 }, { "epoch": 6.260840560373582, "loss": 0.24762709438800812, "step": 18770 }, { "ce_loss": 0.016061890870332718, "epoch": 6.260840560373582, "step": 18770 }, { "distill_loss": 0.17953771352767944, "epoch": 6.260840560373582, "step": 18770 }, { "epoch": 6.260840560373582, "ref_ce_loss": 0.051859185099601746, "step": 18770 }, { "epoch": 6.260840560373582, "loss": 0.2978662848472595, "step": 18770 }, { "ce_loss": 0.056625351309776306, "epoch": 6.260840560373582, "step": 18770 }, { "distill_loss": 0.13011780381202698, "epoch": 6.260840560373582, "step": 18770 }, { "epoch": 6.260840560373582, "ref_ce_loss": 0.0658249706029892, "step": 18770 }, { "epoch": 6.2641761174116075, "loss": 0.331, "step": 18780 }, { "epoch": 6.2641761174116075, "grad_norm": 2.6765992641448975, "step": 18780 }, { "epoch": 6.2641761174116075, "learning_rate": 3.5540145927744554e-05, "step": 18780 }, { "epoch": 6.2641761174116075, "loss": 0.3210102617740631, "step": 18780 }, { "ce_loss": 0.05250520259141922, "epoch": 6.2641761174116075, "step": 18780 }, { "distill_loss": 0.17485934495925903, "epoch": 6.2641761174116075, "step": 18780 }, { "epoch": 6.2641761174116075, "ref_ce_loss": 0.0640835240483284, "step": 18780 }, { "epoch": 6.2641761174116075, "loss": 0.2694230377674103, "step": 18780 }, { "ce_loss": 0.06711471080780029, "epoch": 6.2641761174116075, "step": 18780 }, { "distill_loss": 0.1430618166923523, "epoch": 6.2641761174116075, "step": 18780 }, { "epoch": 6.2641761174116075, "ref_ce_loss": 0.059164922684431076, "step": 18780 }, { "epoch": 6.2641761174116075, "loss": 0.24888139963150024, "step": 18780 }, { "ce_loss": 0.02815820463001728, "epoch": 6.2641761174116075, "step": 18780 }, { "distill_loss": 0.13266366720199585, "epoch": 6.2641761174116075, "step": 18780 }, { "epoch": 6.2641761174116075, "ref_ce_loss": 0.04049038141965866, "step": 18780 }, { "epoch": 6.2641761174116075, "loss": 0.36241692304611206, "step": 18780 }, { "ce_loss": 0.05183975026011467, "epoch": 6.2641761174116075, "step": 18780 }, { "distill_loss": 0.17315593361854553, "epoch": 6.2641761174116075, "step": 18780 }, { "epoch": 6.2641761174116075, "ref_ce_loss": 0.10023162513971329, "step": 18780 }, { "epoch": 6.267511674449633, "loss": 0.3397, "step": 18790 }, { "epoch": 6.267511674449633, "grad_norm": 2.6068735122680664, "step": 18790 }, { "epoch": 6.267511674449633, "learning_rate": 3.540933064368857e-05, "step": 18790 }, { "epoch": 6.267511674449633, "loss": 0.27424922585487366, "step": 18790 }, { "ce_loss": 0.02315659075975418, "epoch": 6.267511674449633, "step": 18790 }, { "distill_loss": 0.14970842003822327, "epoch": 6.267511674449633, "step": 18790 }, { "epoch": 6.267511674449633, "ref_ce_loss": 0.06472202390432358, "step": 18790 }, { "epoch": 6.267511674449633, "loss": 0.272493839263916, "step": 18790 }, { "ce_loss": 0.029809942469000816, "epoch": 6.267511674449633, "step": 18790 }, { "distill_loss": 0.13639044761657715, "epoch": 6.267511674449633, "step": 18790 }, { "epoch": 6.267511674449633, "ref_ce_loss": 0.052156079560518265, "step": 18790 }, { "epoch": 6.267511674449633, "loss": 0.5869401097297668, "step": 18790 }, { "ce_loss": 0.05751309543848038, "epoch": 6.267511674449633, "step": 18790 }, { "distill_loss": 0.1983068883419037, "epoch": 6.267511674449633, "step": 18790 }, { "epoch": 6.267511674449633, "ref_ce_loss": 0.08109751343727112, "step": 18790 }, { "epoch": 6.267511674449633, "loss": 0.22548475861549377, "step": 18790 }, { "ce_loss": 0.05232026055455208, "epoch": 6.267511674449633, "step": 18790 }, { "distill_loss": 0.1157417818903923, "epoch": 6.267511674449633, "step": 18790 }, { "epoch": 6.267511674449633, "ref_ce_loss": 0.03327852487564087, "step": 18790 }, { "epoch": 6.270847231487658, "loss": 0.3617, "step": 18800 }, { "epoch": 6.270847231487658, "grad_norm": 6.989676475524902, "step": 18800 }, { "epoch": 6.270847231487658, "learning_rate": 3.5278724327811174e-05, "step": 18800 }, { "epoch": 6.270847231487658, "loss": 0.23098401725292206, "step": 18800 }, { "ce_loss": 0.013874993659555912, "epoch": 6.270847231487658, "step": 18800 }, { "distill_loss": 0.15344513952732086, "epoch": 6.270847231487658, "step": 18800 }, { "epoch": 6.270847231487658, "ref_ce_loss": 0.03826308622956276, "step": 18800 }, { "epoch": 6.270847231487658, "loss": 0.39530304074287415, "step": 18800 }, { "ce_loss": 0.05913930758833885, "epoch": 6.270847231487658, "step": 18800 }, { "distill_loss": 0.2634964883327484, "epoch": 6.270847231487658, "step": 18800 }, { "epoch": 6.270847231487658, "ref_ce_loss": 0.05968732014298439, "step": 18800 }, { "epoch": 6.270847231487658, "loss": 0.4437573254108429, "step": 18800 }, { "ce_loss": 0.015780942514538765, "epoch": 6.270847231487658, "step": 18800 }, { "distill_loss": 0.15896551311016083, "epoch": 6.270847231487658, "step": 18800 }, { "epoch": 6.270847231487658, "ref_ce_loss": 0.050289690494537354, "step": 18800 }, { "epoch": 6.270847231487658, "loss": 0.26852864027023315, "step": 18800 }, { "ce_loss": 0.03627710044384003, "epoch": 6.270847231487658, "step": 18800 }, { "distill_loss": 0.11065933853387833, "epoch": 6.270847231487658, "step": 18800 }, { "epoch": 6.270847231487658, "ref_ce_loss": 0.07877654582262039, "step": 18800 }, { "epoch": 6.274182788525684, "loss": 0.3798, "step": 18810 }, { "epoch": 6.274182788525684, "grad_norm": 3.7645487785339355, "step": 18810 }, { "epoch": 6.274182788525684, "learning_rate": 3.514832721828676e-05, "step": 18810 }, { "epoch": 6.274182788525684, "loss": 0.46515122056007385, "step": 18810 }, { "ce_loss": 0.08517283201217651, "epoch": 6.274182788525684, "step": 18810 }, { "distill_loss": 0.14585131406784058, "epoch": 6.274182788525684, "step": 18810 }, { "epoch": 6.274182788525684, "ref_ce_loss": 0.055102892220020294, "step": 18810 }, { "epoch": 6.274182788525684, "loss": 0.28196051716804504, "step": 18810 }, { "ce_loss": 0.07954999804496765, "epoch": 6.274182788525684, "step": 18810 }, { "distill_loss": 0.10934408754110336, "epoch": 6.274182788525684, "step": 18810 }, { "epoch": 6.274182788525684, "ref_ce_loss": 0.05305926129221916, "step": 18810 }, { "epoch": 6.274182788525684, "loss": 0.34345611929893494, "step": 18810 }, { "ce_loss": 0.0444522500038147, "epoch": 6.274182788525684, "step": 18810 }, { "distill_loss": 0.20444200932979584, "epoch": 6.274182788525684, "step": 18810 }, { "epoch": 6.274182788525684, "ref_ce_loss": 0.0704997181892395, "step": 18810 }, { "epoch": 6.274182788525684, "loss": 0.47482192516326904, "step": 18810 }, { "ce_loss": 0.0612056702375412, "epoch": 6.274182788525684, "step": 18810 }, { "distill_loss": 0.15192171931266785, "epoch": 6.274182788525684, "step": 18810 }, { "epoch": 6.274182788525684, "ref_ce_loss": 0.050350818783044815, "step": 18810 }, { "epoch": 6.277518345563709, "loss": 0.3319, "step": 18820 }, { "epoch": 6.277518345563709, "grad_norm": 3.944410800933838, "step": 18820 }, { "epoch": 6.277518345563709, "learning_rate": 3.501813955290823e-05, "step": 18820 }, { "epoch": 6.277518345563709, "loss": 0.23156653344631195, "step": 18820 }, { "ce_loss": 0.025693096220493317, "epoch": 6.277518345563709, "step": 18820 }, { "distill_loss": 0.12437605857849121, "epoch": 6.277518345563709, "step": 18820 }, { "epoch": 6.277518345563709, "ref_ce_loss": 0.06192917004227638, "step": 18820 }, { "epoch": 6.277518345563709, "loss": 0.34078988432884216, "step": 18820 }, { "ce_loss": 0.07537341117858887, "epoch": 6.277518345563709, "step": 18820 }, { "distill_loss": 0.1982569396495819, "epoch": 6.277518345563709, "step": 18820 }, { "epoch": 6.277518345563709, "ref_ce_loss": 0.06689344346523285, "step": 18820 }, { "epoch": 6.277518345563709, "loss": 0.30715587735176086, "step": 18820 }, { "ce_loss": 0.028312131762504578, "epoch": 6.277518345563709, "step": 18820 }, { "distill_loss": 0.19388048350811005, "epoch": 6.277518345563709, "step": 18820 }, { "epoch": 6.277518345563709, "ref_ce_loss": 0.04899001121520996, "step": 18820 }, { "epoch": 6.277518345563709, "loss": 0.26771223545074463, "step": 18820 }, { "ce_loss": 0.0458672009408474, "epoch": 6.277518345563709, "step": 18820 }, { "distill_loss": 0.12816867232322693, "epoch": 6.277518345563709, "step": 18820 }, { "epoch": 6.277518345563709, "ref_ce_loss": 0.05818053334951401, "step": 18820 }, { "epoch": 6.280853902601734, "loss": 0.3275, "step": 18830 }, { "epoch": 6.280853902601734, "grad_norm": 2.262253522872925, "step": 18830 }, { "epoch": 6.280853902601734, "learning_rate": 3.48881615690865e-05, "step": 18830 }, { "epoch": 6.280853902601734, "loss": 0.640660285949707, "step": 18830 }, { "ce_loss": 0.04574522003531456, "epoch": 6.280853902601734, "step": 18830 }, { "distill_loss": 0.18484735488891602, "epoch": 6.280853902601734, "step": 18830 }, { "epoch": 6.280853902601734, "ref_ce_loss": 0.0488605722784996, "step": 18830 }, { "epoch": 6.280853902601734, "loss": 0.3685927391052246, "step": 18830 }, { "ce_loss": 0.05379222705960274, "epoch": 6.280853902601734, "step": 18830 }, { "distill_loss": 0.12785358726978302, "epoch": 6.280853902601734, "step": 18830 }, { "epoch": 6.280853902601734, "ref_ce_loss": 0.07150807976722717, "step": 18830 }, { "epoch": 6.280853902601734, "loss": 0.5330727696418762, "step": 18830 }, { "ce_loss": 0.07434044033288956, "epoch": 6.280853902601734, "step": 18830 }, { "distill_loss": 0.19990497827529907, "epoch": 6.280853902601734, "step": 18830 }, { "epoch": 6.280853902601734, "ref_ce_loss": 0.06825008243322372, "step": 18830 }, { "epoch": 6.280853902601734, "loss": 0.5475145578384399, "step": 18830 }, { "ce_loss": 0.05228644236922264, "epoch": 6.280853902601734, "step": 18830 }, { "distill_loss": 0.15708759427070618, "epoch": 6.280853902601734, "step": 18830 }, { "epoch": 6.280853902601734, "ref_ce_loss": 0.03966747596859932, "step": 18830 }, { "epoch": 6.28418945963976, "loss": 0.3523, "step": 18840 }, { "epoch": 6.28418945963976, "grad_norm": 2.7833263874053955, "step": 18840 }, { "epoch": 6.28418945963976, "learning_rate": 3.475839350385014e-05, "step": 18840 }, { "epoch": 6.28418945963976, "loss": 0.23492398858070374, "step": 18840 }, { "ce_loss": 0.006972161587327719, "epoch": 6.28418945963976, "step": 18840 }, { "distill_loss": 0.10143738985061646, "epoch": 6.28418945963976, "step": 18840 }, { "epoch": 6.28418945963976, "ref_ce_loss": 0.04588712379336357, "step": 18840 }, { "epoch": 6.28418945963976, "loss": 0.3233441412448883, "step": 18840 }, { "ce_loss": 0.048174165189266205, "epoch": 6.28418945963976, "step": 18840 }, { "distill_loss": 0.19731523096561432, "epoch": 6.28418945963976, "step": 18840 }, { "epoch": 6.28418945963976, "ref_ce_loss": 0.059398673474788666, "step": 18840 }, { "epoch": 6.28418945963976, "loss": 0.43290847539901733, "step": 18840 }, { "ce_loss": 0.07412654906511307, "epoch": 6.28418945963976, "step": 18840 }, { "distill_loss": 0.226647287607193, "epoch": 6.28418945963976, "step": 18840 }, { "epoch": 6.28418945963976, "ref_ce_loss": 0.05764947459101677, "step": 18840 }, { "epoch": 6.28418945963976, "loss": 0.35213813185691833, "step": 18840 }, { "ce_loss": 0.03941421955823898, "epoch": 6.28418945963976, "step": 18840 }, { "distill_loss": 0.20007504522800446, "epoch": 6.28418945963976, "step": 18840 }, { "epoch": 6.28418945963976, "ref_ce_loss": 0.05939985811710358, "step": 18840 }, { "epoch": 6.287525016677785, "loss": 0.3877, "step": 18850 }, { "epoch": 6.287525016677785, "grad_norm": 2.940953016281128, "step": 18850 }, { "epoch": 6.287525016677785, "learning_rate": 3.462883559384492e-05, "step": 18850 }, { "epoch": 6.287525016677785, "loss": 0.2711646556854248, "step": 18850 }, { "ce_loss": 0.06784234195947647, "epoch": 6.287525016677785, "step": 18850 }, { "distill_loss": 0.10876431316137314, "epoch": 6.287525016677785, "step": 18850 }, { "epoch": 6.287525016677785, "ref_ce_loss": 0.058501046150922775, "step": 18850 }, { "epoch": 6.287525016677785, "loss": 0.19347287714481354, "step": 18850 }, { "ce_loss": 0.026580985635519028, "epoch": 6.287525016677785, "step": 18850 }, { "distill_loss": 0.12169019132852554, "epoch": 6.287525016677785, "step": 18850 }, { "epoch": 6.287525016677785, "ref_ce_loss": 0.04512413963675499, "step": 18850 }, { "epoch": 6.287525016677785, "loss": 0.2864726483821869, "step": 18850 }, { "ce_loss": 0.03934826701879501, "epoch": 6.287525016677785, "step": 18850 }, { "distill_loss": 0.12577557563781738, "epoch": 6.287525016677785, "step": 18850 }, { "epoch": 6.287525016677785, "ref_ce_loss": 0.07803640514612198, "step": 18850 }, { "epoch": 6.287525016677785, "loss": 0.16455747187137604, "step": 18850 }, { "ce_loss": 0.02695547230541706, "epoch": 6.287525016677785, "step": 18850 }, { "distill_loss": 0.0886109471321106, "epoch": 6.287525016677785, "step": 18850 }, { "epoch": 6.287525016677785, "ref_ce_loss": 0.03705105558037758, "step": 18850 }, { "epoch": 6.29086057371581, "loss": 0.3807, "step": 18860 }, { "epoch": 6.29086057371581, "grad_norm": 4.630390167236328, "step": 18860 }, { "epoch": 6.29086057371581, "learning_rate": 3.449948807533337e-05, "step": 18860 }, { "epoch": 6.29086057371581, "loss": 0.21386204659938812, "step": 18860 }, { "ce_loss": 0.026135116815567017, "epoch": 6.29086057371581, "step": 18860 }, { "distill_loss": 0.13585272431373596, "epoch": 6.29086057371581, "step": 18860 }, { "epoch": 6.29086057371581, "ref_ce_loss": 0.034613024443387985, "step": 18860 }, { "epoch": 6.29086057371581, "loss": 0.31926730275154114, "step": 18860 }, { "ce_loss": 0.03757088631391525, "epoch": 6.29086057371581, "step": 18860 }, { "distill_loss": 0.1286323070526123, "epoch": 6.29086057371581, "step": 18860 }, { "epoch": 6.29086057371581, "ref_ce_loss": 0.046050816774368286, "step": 18860 }, { "epoch": 6.29086057371581, "loss": 0.37816765904426575, "step": 18860 }, { "ce_loss": 0.12475141882896423, "epoch": 6.29086057371581, "step": 18860 }, { "distill_loss": 0.16274906694889069, "epoch": 6.29086057371581, "step": 18860 }, { "epoch": 6.29086057371581, "ref_ce_loss": 0.08075873553752899, "step": 18860 }, { "epoch": 6.29086057371581, "loss": 0.24097076058387756, "step": 18860 }, { "ce_loss": 0.019521335139870644, "epoch": 6.29086057371581, "step": 18860 }, { "distill_loss": 0.12991663813591003, "epoch": 6.29086057371581, "step": 18860 }, { "epoch": 6.29086057371581, "ref_ce_loss": 0.052921973168849945, "step": 18860 }, { "epoch": 6.294196130753836, "loss": 0.351, "step": 18870 }, { "epoch": 6.294196130753836, "grad_norm": 6.742186546325684, "step": 18870 }, { "epoch": 6.294196130753836, "learning_rate": 3.437035118419439e-05, "step": 18870 }, { "epoch": 6.294196130753836, "loss": 0.21090252697467804, "step": 18870 }, { "ce_loss": 0.038921330124139786, "epoch": 6.294196130753836, "step": 18870 }, { "distill_loss": 0.12278036773204803, "epoch": 6.294196130753836, "step": 18870 }, { "epoch": 6.294196130753836, "ref_ce_loss": 0.03309723734855652, "step": 18870 }, { "epoch": 6.294196130753836, "loss": 0.25562670826911926, "step": 18870 }, { "ce_loss": 0.0049499752931296825, "epoch": 6.294196130753836, "step": 18870 }, { "distill_loss": 0.20920096337795258, "epoch": 6.294196130753836, "step": 18870 }, { "epoch": 6.294196130753836, "ref_ce_loss": 0.02717713825404644, "step": 18870 }, { "epoch": 6.294196130753836, "loss": 0.6847727298736572, "step": 18870 }, { "ce_loss": 0.06199228763580322, "epoch": 6.294196130753836, "step": 18870 }, { "distill_loss": 0.18934482336044312, "epoch": 6.294196130753836, "step": 18870 }, { "epoch": 6.294196130753836, "ref_ce_loss": 0.045317307114601135, "step": 18870 }, { "epoch": 6.294196130753836, "loss": 0.2877572178840637, "step": 18870 }, { "ce_loss": 0.05759180709719658, "epoch": 6.294196130753836, "step": 18870 }, { "distill_loss": 0.12327567487955093, "epoch": 6.294196130753836, "step": 18870 }, { "epoch": 6.294196130753836, "ref_ce_loss": 0.07539641857147217, "step": 18870 }, { "epoch": 6.297531687791861, "loss": 0.3752, "step": 18880 }, { "epoch": 6.297531687791861, "grad_norm": 3.315523386001587, "step": 18880 }, { "epoch": 6.297531687791861, "learning_rate": 3.4241425155922634e-05, "step": 18880 }, { "epoch": 6.297531687791861, "loss": 0.29335570335388184, "step": 18880 }, { "ce_loss": 0.06978707015514374, "epoch": 6.297531687791861, "step": 18880 }, { "distill_loss": 0.14715451002120972, "epoch": 6.297531687791861, "step": 18880 }, { "epoch": 6.297531687791861, "ref_ce_loss": 0.060952696949243546, "step": 18880 }, { "epoch": 6.297531687791861, "loss": 0.44177842140197754, "step": 18880 }, { "ce_loss": 0.04377702251076698, "epoch": 6.297531687791861, "step": 18880 }, { "distill_loss": 0.1425521969795227, "epoch": 6.297531687791861, "step": 18880 }, { "epoch": 6.297531687791861, "ref_ce_loss": 0.06627517193555832, "step": 18880 }, { "epoch": 6.297531687791861, "loss": 0.2356526106595993, "step": 18880 }, { "ce_loss": 0.03892301395535469, "epoch": 6.297531687791861, "step": 18880 }, { "distill_loss": 0.12812058627605438, "epoch": 6.297531687791861, "step": 18880 }, { "epoch": 6.297531687791861, "ref_ce_loss": 0.050661858171224594, "step": 18880 }, { "epoch": 6.297531687791861, "loss": 0.19576594233512878, "step": 18880 }, { "ce_loss": 0.022691428661346436, "epoch": 6.297531687791861, "step": 18880 }, { "distill_loss": 0.1026381254196167, "epoch": 6.297531687791861, "step": 18880 }, { "epoch": 6.297531687791861, "ref_ce_loss": 0.05652888864278793, "step": 18880 }, { "epoch": 6.300867244829886, "loss": 0.3281, "step": 18890 }, { "epoch": 6.300867244829886, "grad_norm": 3.531867027282715, "step": 18890 }, { "epoch": 6.300867244829886, "learning_rate": 3.4112710225628344e-05, "step": 18890 }, { "epoch": 6.300867244829886, "loss": 0.45936667919158936, "step": 18890 }, { "ce_loss": 0.06959517300128937, "epoch": 6.300867244829886, "step": 18890 }, { "distill_loss": 0.20052653551101685, "epoch": 6.300867244829886, "step": 18890 }, { "epoch": 6.300867244829886, "ref_ce_loss": 0.046475961804389954, "step": 18890 }, { "epoch": 6.300867244829886, "loss": 0.22902725636959076, "step": 18890 }, { "ce_loss": 0.030058017000555992, "epoch": 6.300867244829886, "step": 18890 }, { "distill_loss": 0.1144430935382843, "epoch": 6.300867244829886, "step": 18890 }, { "epoch": 6.300867244829886, "ref_ce_loss": 0.051945485174655914, "step": 18890 }, { "epoch": 6.300867244829886, "loss": 0.46311238408088684, "step": 18890 }, { "ce_loss": 0.042298149317502975, "epoch": 6.300867244829886, "step": 18890 }, { "distill_loss": 0.3087809383869171, "epoch": 6.300867244829886, "step": 18890 }, { "epoch": 6.300867244829886, "ref_ce_loss": 0.05733121559023857, "step": 18890 }, { "epoch": 6.300867244829886, "loss": 0.34530389308929443, "step": 18890 }, { "ce_loss": 0.030319813638925552, "epoch": 6.300867244829886, "step": 18890 }, { "distill_loss": 0.15547746419906616, "epoch": 6.300867244829886, "step": 18890 }, { "epoch": 6.300867244829886, "ref_ce_loss": 0.08323720842599869, "step": 18890 }, { "epoch": 6.304202801867912, "loss": 0.3448, "step": 18900 }, { "epoch": 6.304202801867912, "grad_norm": 2.434091091156006, "step": 18900 }, { "epoch": 6.304202801867912, "learning_rate": 3.398420662803684e-05, "step": 18900 }, { "epoch": 6.304202801867912, "loss": 0.2415168285369873, "step": 18900 }, { "ce_loss": 0.020679494366049767, "epoch": 6.304202801867912, "step": 18900 }, { "distill_loss": 0.16226181387901306, "epoch": 6.304202801867912, "step": 18900 }, { "epoch": 6.304202801867912, "ref_ce_loss": 0.024102669209241867, "step": 18900 }, { "epoch": 6.304202801867912, "loss": 0.2756112515926361, "step": 18900 }, { "ce_loss": 0.03828546032309532, "epoch": 6.304202801867912, "step": 18900 }, { "distill_loss": 0.14190034568309784, "epoch": 6.304202801867912, "step": 18900 }, { "epoch": 6.304202801867912, "ref_ce_loss": 0.044629864394664764, "step": 18900 }, { "epoch": 6.304202801867912, "loss": 0.2844380736351013, "step": 18900 }, { "ce_loss": 0.0461580716073513, "epoch": 6.304202801867912, "step": 18900 }, { "distill_loss": 0.12029999494552612, "epoch": 6.304202801867912, "step": 18900 }, { "epoch": 6.304202801867912, "ref_ce_loss": 0.07012398540973663, "step": 18900 }, { "epoch": 6.304202801867912, "loss": 0.2737322449684143, "step": 18900 }, { "ce_loss": 0.04904641583561897, "epoch": 6.304202801867912, "step": 18900 }, { "distill_loss": 0.12067732214927673, "epoch": 6.304202801867912, "step": 18900 }, { "epoch": 6.304202801867912, "ref_ce_loss": 0.05247277021408081, "step": 18900 }, { "epoch": 6.307538358905937, "loss": 0.3068, "step": 18910 }, { "epoch": 6.307538358905937, "grad_norm": 2.252596139907837, "step": 18910 }, { "epoch": 6.307538358905937, "learning_rate": 3.385591459748793e-05, "step": 18910 }, { "epoch": 6.307538358905937, "loss": 0.29819396138191223, "step": 18910 }, { "ce_loss": 0.05503129959106445, "epoch": 6.307538358905937, "step": 18910 }, { "distill_loss": 0.1623593419790268, "epoch": 6.307538358905937, "step": 18910 }, { "epoch": 6.307538358905937, "ref_ce_loss": 0.08060084283351898, "step": 18910 }, { "epoch": 6.307538358905937, "loss": 0.37681642174720764, "step": 18910 }, { "ce_loss": 0.05450444296002388, "epoch": 6.307538358905937, "step": 18910 }, { "distill_loss": 0.20877453684806824, "epoch": 6.307538358905937, "step": 18910 }, { "epoch": 6.307538358905937, "ref_ce_loss": 0.0601249523460865, "step": 18910 }, { "epoch": 6.307538358905937, "loss": 0.39602282643318176, "step": 18910 }, { "ce_loss": 0.08872856199741364, "epoch": 6.307538358905937, "step": 18910 }, { "distill_loss": 0.17702454328536987, "epoch": 6.307538358905937, "step": 18910 }, { "epoch": 6.307538358905937, "ref_ce_loss": 0.09893164783716202, "step": 18910 }, { "epoch": 6.307538358905937, "loss": 0.25537437200546265, "step": 18910 }, { "ce_loss": 0.016225768253207207, "epoch": 6.307538358905937, "step": 18910 }, { "distill_loss": 0.13233773410320282, "epoch": 6.307538358905937, "step": 18910 }, { "epoch": 6.307538358905937, "ref_ce_loss": 0.04172232374548912, "step": 18910 }, { "epoch": 6.310873915943962, "loss": 0.3138, "step": 18920 }, { "epoch": 6.310873915943962, "grad_norm": 2.0156707763671875, "step": 18920 }, { "epoch": 6.310873915943962, "learning_rate": 3.3727834367935634e-05, "step": 18920 }, { "epoch": 6.310873915943962, "loss": 0.4027228355407715, "step": 18920 }, { "ce_loss": 0.09059377759695053, "epoch": 6.310873915943962, "step": 18920 }, { "distill_loss": 0.16408628225326538, "epoch": 6.310873915943962, "step": 18920 }, { "epoch": 6.310873915943962, "ref_ce_loss": 0.046289198100566864, "step": 18920 }, { "epoch": 6.310873915943962, "loss": 0.4065344035625458, "step": 18920 }, { "ce_loss": 0.04324747249484062, "epoch": 6.310873915943962, "step": 18920 }, { "distill_loss": 0.21216806769371033, "epoch": 6.310873915943962, "step": 18920 }, { "epoch": 6.310873915943962, "ref_ce_loss": 0.08379409462213516, "step": 18920 }, { "epoch": 6.310873915943962, "loss": 0.28860020637512207, "step": 18920 }, { "ce_loss": 0.0696495994925499, "epoch": 6.310873915943962, "step": 18920 }, { "distill_loss": 0.15288947522640228, "epoch": 6.310873915943962, "step": 18920 }, { "epoch": 6.310873915943962, "ref_ce_loss": 0.06594584882259369, "step": 18920 }, { "epoch": 6.310873915943962, "loss": 0.5158238410949707, "step": 18920 }, { "ce_loss": 0.07455329596996307, "epoch": 6.310873915943962, "step": 18920 }, { "distill_loss": 0.33285900950431824, "epoch": 6.310873915943962, "step": 18920 }, { "epoch": 6.310873915943962, "ref_ce_loss": 0.07242995500564575, "step": 18920 }, { "epoch": 6.314209472981988, "loss": 0.3283, "step": 18930 }, { "epoch": 6.314209472981988, "grad_norm": 2.4332995414733887, "step": 18930 }, { "epoch": 6.314209472981988, "learning_rate": 3.35999661729479e-05, "step": 18930 }, { "epoch": 6.314209472981988, "loss": 0.4734187722206116, "step": 18930 }, { "ce_loss": 0.051276229321956635, "epoch": 6.314209472981988, "step": 18930 }, { "distill_loss": 0.21224486827850342, "epoch": 6.314209472981988, "step": 18930 }, { "epoch": 6.314209472981988, "ref_ce_loss": 0.05212824419140816, "step": 18930 }, { "epoch": 6.314209472981988, "loss": 0.2177160680294037, "step": 18930 }, { "ce_loss": 0.033990684896707535, "epoch": 6.314209472981988, "step": 18930 }, { "distill_loss": 0.15057498216629028, "epoch": 6.314209472981988, "step": 18930 }, { "epoch": 6.314209472981988, "ref_ce_loss": 0.03305617719888687, "step": 18930 }, { "epoch": 6.314209472981988, "loss": 0.4837043881416321, "step": 18930 }, { "ce_loss": 0.06694763898849487, "epoch": 6.314209472981988, "step": 18930 }, { "distill_loss": 0.2243620753288269, "epoch": 6.314209472981988, "step": 18930 }, { "epoch": 6.314209472981988, "ref_ce_loss": 0.07067155838012695, "step": 18930 }, { "epoch": 6.314209472981988, "loss": 0.34912294149398804, "step": 18930 }, { "ce_loss": 0.06255176663398743, "epoch": 6.314209472981988, "step": 18930 }, { "distill_loss": 0.1636684238910675, "epoch": 6.314209472981988, "step": 18930 }, { "epoch": 6.314209472981988, "ref_ce_loss": 0.06407450139522552, "step": 18930 }, { "epoch": 6.317545030020013, "loss": 0.3386, "step": 18940 }, { "epoch": 6.317545030020013, "grad_norm": 3.589062213897705, "step": 18940 }, { "epoch": 6.317545030020013, "learning_rate": 3.347231024570578e-05, "step": 18940 }, { "epoch": 6.317545030020013, "loss": 0.2020236700773239, "step": 18940 }, { "ce_loss": 0.031198319047689438, "epoch": 6.317545030020013, "step": 18940 }, { "distill_loss": 0.11204557120800018, "epoch": 6.317545030020013, "step": 18940 }, { "epoch": 6.317545030020013, "ref_ce_loss": 0.05850578099489212, "step": 18940 }, { "epoch": 6.317545030020013, "loss": 0.2590986490249634, "step": 18940 }, { "ce_loss": 0.03589296713471413, "epoch": 6.317545030020013, "step": 18940 }, { "distill_loss": 0.17840811610221863, "epoch": 6.317545030020013, "step": 18940 }, { "epoch": 6.317545030020013, "ref_ce_loss": 0.04469051584601402, "step": 18940 }, { "epoch": 6.317545030020013, "loss": 0.25700971484184265, "step": 18940 }, { "ce_loss": 0.018838005140423775, "epoch": 6.317545030020013, "step": 18940 }, { "distill_loss": 0.18496237695217133, "epoch": 6.317545030020013, "step": 18940 }, { "epoch": 6.317545030020013, "ref_ce_loss": 0.03536644205451012, "step": 18940 }, { "epoch": 6.317545030020013, "loss": 0.19318623840808868, "step": 18940 }, { "ce_loss": 0.011522611603140831, "epoch": 6.317545030020013, "step": 18940 }, { "distill_loss": 0.10571037232875824, "epoch": 6.317545030020013, "step": 18940 }, { "epoch": 6.317545030020013, "ref_ce_loss": 0.04865308105945587, "step": 18940 }, { "epoch": 6.3208805870580385, "loss": 0.3627, "step": 18950 }, { "epoch": 6.3208805870580385, "grad_norm": 2.7662694454193115, "step": 18950 }, { "epoch": 6.3208805870580385, "learning_rate": 3.3344866819003374e-05, "step": 18950 }, { "epoch": 6.3208805870580385, "loss": 0.35670697689056396, "step": 18950 }, { "ce_loss": 0.031022926792502403, "epoch": 6.3208805870580385, "step": 18950 }, { "distill_loss": 0.1557820737361908, "epoch": 6.3208805870580385, "step": 18950 }, { "epoch": 6.3208805870580385, "ref_ce_loss": 0.04982922598719597, "step": 18950 }, { "epoch": 6.3208805870580385, "loss": 0.2733045220375061, "step": 18950 }, { "ce_loss": 0.04908668249845505, "epoch": 6.3208805870580385, "step": 18950 }, { "distill_loss": 0.11684387177228928, "epoch": 6.3208805870580385, "step": 18950 }, { "epoch": 6.3208805870580385, "ref_ce_loss": 0.055443841964006424, "step": 18950 }, { "epoch": 6.3208805870580385, "loss": 0.2913722097873688, "step": 18950 }, { "ce_loss": 0.06559805572032928, "epoch": 6.3208805870580385, "step": 18950 }, { "distill_loss": 0.1549169272184372, "epoch": 6.3208805870580385, "step": 18950 }, { "epoch": 6.3208805870580385, "ref_ce_loss": 0.0595548078417778, "step": 18950 }, { "epoch": 6.3208805870580385, "loss": 0.2763633131980896, "step": 18950 }, { "ce_loss": 0.05452756956219673, "epoch": 6.3208805870580385, "step": 18950 }, { "distill_loss": 0.1420384794473648, "epoch": 6.3208805870580385, "step": 18950 }, { "epoch": 6.3208805870580385, "ref_ce_loss": 0.05865325778722763, "step": 18950 }, { "epoch": 6.324216144096064, "loss": 0.3197, "step": 18960 }, { "epoch": 6.324216144096064, "grad_norm": 3.9764132499694824, "step": 18960 }, { "epoch": 6.324216144096064, "learning_rate": 3.321763612524716e-05, "step": 18960 }, { "epoch": 6.324216144096064, "loss": 0.2885870039463043, "step": 18960 }, { "ce_loss": 0.019036101177334785, "epoch": 6.324216144096064, "step": 18960 }, { "distill_loss": 0.14021548628807068, "epoch": 6.324216144096064, "step": 18960 }, { "epoch": 6.324216144096064, "ref_ce_loss": 0.06481176614761353, "step": 18960 }, { "epoch": 6.324216144096064, "loss": 0.3427791893482208, "step": 18960 }, { "ce_loss": 0.039608050137758255, "epoch": 6.324216144096064, "step": 18960 }, { "distill_loss": 0.14981134235858917, "epoch": 6.324216144096064, "step": 18960 }, { "epoch": 6.324216144096064, "ref_ce_loss": 0.08245816081762314, "step": 18960 }, { "epoch": 6.324216144096064, "loss": 0.42633309960365295, "step": 18960 }, { "ce_loss": 0.05362397059798241, "epoch": 6.324216144096064, "step": 18960 }, { "distill_loss": 0.24959130585193634, "epoch": 6.324216144096064, "step": 18960 }, { "epoch": 6.324216144096064, "ref_ce_loss": 0.04351172223687172, "step": 18960 }, { "epoch": 6.324216144096064, "loss": 0.43009763956069946, "step": 18960 }, { "ce_loss": 0.11033513396978378, "epoch": 6.324216144096064, "step": 18960 }, { "distill_loss": 0.16854149103164673, "epoch": 6.324216144096064, "step": 18960 }, { "epoch": 6.324216144096064, "ref_ce_loss": 0.08382659405469894, "step": 18960 }, { "epoch": 6.327551701134089, "loss": 0.3519, "step": 18970 }, { "epoch": 6.327551701134089, "grad_norm": 3.6981232166290283, "step": 18970 }, { "epoch": 6.327551701134089, "learning_rate": 3.309061839645578e-05, "step": 18970 }, { "epoch": 6.327551701134089, "loss": 0.37140506505966187, "step": 18970 }, { "ce_loss": 0.0781472697854042, "epoch": 6.327551701134089, "step": 18970 }, { "distill_loss": 0.17404945194721222, "epoch": 6.327551701134089, "step": 18970 }, { "epoch": 6.327551701134089, "ref_ce_loss": 0.07811552286148071, "step": 18970 }, { "epoch": 6.327551701134089, "loss": 0.27958112955093384, "step": 18970 }, { "ce_loss": 0.04201284795999527, "epoch": 6.327551701134089, "step": 18970 }, { "distill_loss": 0.13044360280036926, "epoch": 6.327551701134089, "step": 18970 }, { "epoch": 6.327551701134089, "ref_ce_loss": 0.048841409385204315, "step": 18970 }, { "epoch": 6.327551701134089, "loss": 0.3533022999763489, "step": 18970 }, { "ce_loss": 0.05124984681606293, "epoch": 6.327551701134089, "step": 18970 }, { "distill_loss": 0.23570401966571808, "epoch": 6.327551701134089, "step": 18970 }, { "epoch": 6.327551701134089, "ref_ce_loss": 0.05524509772658348, "step": 18970 }, { "epoch": 6.327551701134089, "loss": 0.41169291734695435, "step": 18970 }, { "ce_loss": 0.07361854612827301, "epoch": 6.327551701134089, "step": 18970 }, { "distill_loss": 0.22925794124603271, "epoch": 6.327551701134089, "step": 18970 }, { "epoch": 6.327551701134089, "ref_ce_loss": 0.07110333442687988, "step": 18970 }, { "epoch": 6.3308872581721145, "loss": 0.3328, "step": 18980 }, { "epoch": 6.3308872581721145, "grad_norm": 4.419796943664551, "step": 18980 }, { "epoch": 6.3308872581721145, "learning_rate": 3.2963813864259436e-05, "step": 18980 }, { "epoch": 6.3308872581721145, "loss": 0.3489697575569153, "step": 18980 }, { "ce_loss": 0.019743165001273155, "epoch": 6.3308872581721145, "step": 18980 }, { "distill_loss": 0.180571511387825, "epoch": 6.3308872581721145, "step": 18980 }, { "epoch": 6.3308872581721145, "ref_ce_loss": 0.047717832028865814, "step": 18980 }, { "epoch": 6.3308872581721145, "loss": 0.2253534197807312, "step": 18980 }, { "ce_loss": 0.03531552106142044, "epoch": 6.3308872581721145, "step": 18980 }, { "distill_loss": 0.1357894390821457, "epoch": 6.3308872581721145, "step": 18980 }, { "epoch": 6.3308872581721145, "ref_ce_loss": 0.054144252091646194, "step": 18980 }, { "epoch": 6.3308872581721145, "loss": 0.30089667439460754, "step": 18980 }, { "ce_loss": 0.025029117241501808, "epoch": 6.3308872581721145, "step": 18980 }, { "distill_loss": 0.14975345134735107, "epoch": 6.3308872581721145, "step": 18980 }, { "epoch": 6.3308872581721145, "ref_ce_loss": 0.05282760038971901, "step": 18980 }, { "epoch": 6.3308872581721145, "loss": 0.3071041703224182, "step": 18980 }, { "ce_loss": 0.035489119589328766, "epoch": 6.3308872581721145, "step": 18980 }, { "distill_loss": 0.1634751409292221, "epoch": 6.3308872581721145, "step": 18980 }, { "epoch": 6.3308872581721145, "ref_ce_loss": 0.04332876205444336, "step": 18980 }, { "epoch": 6.33422281521014, "loss": 0.3604, "step": 18990 }, { "epoch": 6.33422281521014, "grad_norm": 3.840822458267212, "step": 18990 }, { "epoch": 6.33422281521014, "learning_rate": 3.2837222759899615e-05, "step": 18990 }, { "epoch": 6.33422281521014, "loss": 0.26387926936149597, "step": 18990 }, { "ce_loss": 0.033305756747722626, "epoch": 6.33422281521014, "step": 18990 }, { "distill_loss": 0.17537467181682587, "epoch": 6.33422281521014, "step": 18990 }, { "epoch": 6.33422281521014, "ref_ce_loss": 0.04105931892991066, "step": 18990 }, { "epoch": 6.33422281521014, "loss": 0.4222206771373749, "step": 18990 }, { "ce_loss": 0.04392791911959648, "epoch": 6.33422281521014, "step": 18990 }, { "distill_loss": 0.1172325387597084, "epoch": 6.33422281521014, "step": 18990 }, { "epoch": 6.33422281521014, "ref_ce_loss": 0.0700860545039177, "step": 18990 }, { "epoch": 6.33422281521014, "loss": 0.22235901653766632, "step": 18990 }, { "ce_loss": 0.019134419038891792, "epoch": 6.33422281521014, "step": 18990 }, { "distill_loss": 0.13299764692783356, "epoch": 6.33422281521014, "step": 18990 }, { "epoch": 6.33422281521014, "ref_ce_loss": 0.05059622600674629, "step": 18990 }, { "epoch": 6.33422281521014, "loss": 0.3985365629196167, "step": 18990 }, { "ce_loss": 0.029248295351862907, "epoch": 6.33422281521014, "step": 18990 }, { "distill_loss": 0.19520241022109985, "epoch": 6.33422281521014, "step": 18990 }, { "epoch": 6.33422281521014, "ref_ce_loss": 0.048385899513959885, "step": 18990 }, { "epoch": 6.337558372248165, "loss": 0.3584, "step": 19000 }, { "epoch": 6.337558372248165, "grad_norm": 4.76231575012207, "step": 19000 }, { "epoch": 6.337558372248165, "learning_rate": 3.271084531422857e-05, "step": 19000 }, { "epoch": 6.337558372248165, "loss": 0.36909541487693787, "step": 19000 }, { "ce_loss": 0.05047956109046936, "epoch": 6.337558372248165, "step": 19000 }, { "distill_loss": 0.12576617300510406, "epoch": 6.337558372248165, "step": 19000 }, { "epoch": 6.337558372248165, "ref_ce_loss": 0.08630871772766113, "step": 19000 }, { "epoch": 6.337558372248165, "loss": 0.346733421087265, "step": 19000 }, { "ce_loss": 0.08098523318767548, "epoch": 6.337558372248165, "step": 19000 }, { "distill_loss": 0.1754927784204483, "epoch": 6.337558372248165, "step": 19000 }, { "epoch": 6.337558372248165, "ref_ce_loss": 0.09000211209058762, "step": 19000 }, { "epoch": 6.337558372248165, "loss": 0.2860293984413147, "step": 19000 }, { "ce_loss": 0.027446920052170753, "epoch": 6.337558372248165, "step": 19000 }, { "distill_loss": 0.1255069524049759, "epoch": 6.337558372248165, "step": 19000 }, { "epoch": 6.337558372248165, "ref_ce_loss": 0.05317498371005058, "step": 19000 }, { "epoch": 6.337558372248165, "loss": 0.24360382556915283, "step": 19000 }, { "ce_loss": 0.028923992067575455, "epoch": 6.337558372248165, "step": 19000 }, { "distill_loss": 0.1311366856098175, "epoch": 6.337558372248165, "step": 19000 }, { "epoch": 6.337558372248165, "ref_ce_loss": 0.03659401834011078, "step": 19000 }, { "epoch": 6.3408939292861906, "loss": 0.2933, "step": 19010 }, { "epoch": 6.3408939292861906, "grad_norm": 2.842592477798462, "step": 19010 }, { "epoch": 6.3408939292861906, "learning_rate": 3.258468175770884e-05, "step": 19010 }, { "epoch": 6.3408939292861906, "loss": 0.2973235845565796, "step": 19010 }, { "ce_loss": 0.03437792509794235, "epoch": 6.3408939292861906, "step": 19010 }, { "distill_loss": 0.10890894383192062, "epoch": 6.3408939292861906, "step": 19010 }, { "epoch": 6.3408939292861906, "ref_ce_loss": 0.08101943135261536, "step": 19010 }, { "epoch": 6.3408939292861906, "loss": 0.25962209701538086, "step": 19010 }, { "ce_loss": 0.02973208576440811, "epoch": 6.3408939292861906, "step": 19010 }, { "distill_loss": 0.10564002394676208, "epoch": 6.3408939292861906, "step": 19010 }, { "epoch": 6.3408939292861906, "ref_ce_loss": 0.04377582296729088, "step": 19010 }, { "epoch": 6.3408939292861906, "loss": 0.4331674575805664, "step": 19010 }, { "ce_loss": 0.053108036518096924, "epoch": 6.3408939292861906, "step": 19010 }, { "distill_loss": 0.14030146598815918, "epoch": 6.3408939292861906, "step": 19010 }, { "epoch": 6.3408939292861906, "ref_ce_loss": 0.09056838601827621, "step": 19010 }, { "epoch": 6.3408939292861906, "loss": 0.2938518524169922, "step": 19010 }, { "ce_loss": 0.045009806752204895, "epoch": 6.3408939292861906, "step": 19010 }, { "distill_loss": 0.1620580404996872, "epoch": 6.3408939292861906, "step": 19010 }, { "epoch": 6.3408939292861906, "ref_ce_loss": 0.049979884177446365, "step": 19010 }, { "epoch": 6.344229486324216, "loss": 0.2939, "step": 19020 }, { "epoch": 6.344229486324216, "grad_norm": 8.198354721069336, "step": 19020 }, { "epoch": 6.344229486324216, "learning_rate": 3.245873232041302e-05, "step": 19020 }, { "epoch": 6.344229486324216, "loss": 0.3113076090812683, "step": 19020 }, { "ce_loss": 0.0427481047809124, "epoch": 6.344229486324216, "step": 19020 }, { "distill_loss": 0.1602908819913864, "epoch": 6.344229486324216, "step": 19020 }, { "epoch": 6.344229486324216, "ref_ce_loss": 0.08124813437461853, "step": 19020 }, { "epoch": 6.344229486324216, "loss": 0.3438887894153595, "step": 19020 }, { "ce_loss": 0.031248973682522774, "epoch": 6.344229486324216, "step": 19020 }, { "distill_loss": 0.20599618554115295, "epoch": 6.344229486324216, "step": 19020 }, { "epoch": 6.344229486324216, "ref_ce_loss": 0.07038242369890213, "step": 19020 }, { "epoch": 6.344229486324216, "loss": 0.43153202533721924, "step": 19020 }, { "ce_loss": 0.021240759640932083, "epoch": 6.344229486324216, "step": 19020 }, { "distill_loss": 0.13821086287498474, "epoch": 6.344229486324216, "step": 19020 }, { "epoch": 6.344229486324216, "ref_ce_loss": 0.05561009421944618, "step": 19020 }, { "epoch": 6.344229486324216, "loss": 0.4814144968986511, "step": 19020 }, { "ce_loss": 0.043110962957143784, "epoch": 6.344229486324216, "step": 19020 }, { "distill_loss": 0.3201991617679596, "epoch": 6.344229486324216, "step": 19020 }, { "epoch": 6.344229486324216, "ref_ce_loss": 0.09728872776031494, "step": 19020 }, { "epoch": 6.347565043362241, "loss": 0.3491, "step": 19030 }, { "epoch": 6.347565043362241, "grad_norm": 2.9248974323272705, "step": 19030 }, { "epoch": 6.347565043362241, "learning_rate": 3.233299723202319e-05, "step": 19030 }, { "epoch": 6.347565043362241, "loss": 0.32620903849601746, "step": 19030 }, { "ce_loss": 0.0165343526750803, "epoch": 6.347565043362241, "step": 19030 }, { "distill_loss": 0.09083996713161469, "epoch": 6.347565043362241, "step": 19030 }, { "epoch": 6.347565043362241, "ref_ce_loss": 0.05633927881717682, "step": 19030 }, { "epoch": 6.347565043362241, "loss": 0.28984490036964417, "step": 19030 }, { "ce_loss": 0.04113488271832466, "epoch": 6.347565043362241, "step": 19030 }, { "distill_loss": 0.20167671144008636, "epoch": 6.347565043362241, "step": 19030 }, { "epoch": 6.347565043362241, "ref_ce_loss": 0.04684454947710037, "step": 19030 }, { "epoch": 6.347565043362241, "loss": 0.32723718881607056, "step": 19030 }, { "ce_loss": 0.022285494953393936, "epoch": 6.347565043362241, "step": 19030 }, { "distill_loss": 0.15769782662391663, "epoch": 6.347565043362241, "step": 19030 }, { "epoch": 6.347565043362241, "ref_ce_loss": 0.0603957436978817, "step": 19030 }, { "epoch": 6.347565043362241, "loss": 0.40456095337867737, "step": 19030 }, { "ce_loss": 0.012262187898159027, "epoch": 6.347565043362241, "step": 19030 }, { "distill_loss": 0.17209021747112274, "epoch": 6.347565043362241, "step": 19030 }, { "epoch": 6.347565043362241, "ref_ce_loss": 0.06936907768249512, "step": 19030 }, { "epoch": 6.350900600400267, "loss": 0.3313, "step": 19040 }, { "epoch": 6.350900600400267, "grad_norm": 4.83967924118042, "step": 19040 }, { "epoch": 6.350900600400267, "learning_rate": 3.2207476721830575e-05, "step": 19040 }, { "epoch": 6.350900600400267, "loss": 0.34516626596450806, "step": 19040 }, { "ce_loss": 0.02484416589140892, "epoch": 6.350900600400267, "step": 19040 }, { "distill_loss": 0.20764781534671783, "epoch": 6.350900600400267, "step": 19040 }, { "epoch": 6.350900600400267, "ref_ce_loss": 0.11241208761930466, "step": 19040 }, { "epoch": 6.350900600400267, "loss": 0.314484566450119, "step": 19040 }, { "ce_loss": 0.07472291588783264, "epoch": 6.350900600400267, "step": 19040 }, { "distill_loss": 0.1334226131439209, "epoch": 6.350900600400267, "step": 19040 }, { "epoch": 6.350900600400267, "ref_ce_loss": 0.06843020021915436, "step": 19040 }, { "epoch": 6.350900600400267, "loss": 0.2526988387107849, "step": 19040 }, { "ce_loss": 0.08345230668783188, "epoch": 6.350900600400267, "step": 19040 }, { "distill_loss": 0.12088726460933685, "epoch": 6.350900600400267, "step": 19040 }, { "epoch": 6.350900600400267, "ref_ce_loss": 0.03264693170785904, "step": 19040 }, { "epoch": 6.350900600400267, "loss": 0.22657568752765656, "step": 19040 }, { "ce_loss": 0.023793227970600128, "epoch": 6.350900600400267, "step": 19040 }, { "distill_loss": 0.13206267356872559, "epoch": 6.350900600400267, "step": 19040 }, { "epoch": 6.350900600400267, "ref_ce_loss": 0.07039143890142441, "step": 19040 }, { "epoch": 6.354236157438292, "loss": 0.3438, "step": 19050 }, { "epoch": 6.354236157438292, "grad_norm": 5.042229652404785, "step": 19050 }, { "epoch": 6.354236157438292, "learning_rate": 3.208217101873505e-05, "step": 19050 }, { "epoch": 6.354236157438292, "loss": 0.22500789165496826, "step": 19050 }, { "ce_loss": 0.004221898503601551, "epoch": 6.354236157438292, "step": 19050 }, { "distill_loss": 0.1627054512500763, "epoch": 6.354236157438292, "step": 19050 }, { "epoch": 6.354236157438292, "ref_ce_loss": 0.05779343843460083, "step": 19050 }, { "epoch": 6.354236157438292, "loss": 0.19450661540031433, "step": 19050 }, { "ce_loss": 0.008154689334332943, "epoch": 6.354236157438292, "step": 19050 }, { "distill_loss": 0.10214529931545258, "epoch": 6.354236157438292, "step": 19050 }, { "epoch": 6.354236157438292, "ref_ce_loss": 0.04879332706332207, "step": 19050 }, { "epoch": 6.354236157438292, "loss": 0.9522825479507446, "step": 19050 }, { "ce_loss": 0.027610354125499725, "epoch": 6.354236157438292, "step": 19050 }, { "distill_loss": 0.1594891995191574, "epoch": 6.354236157438292, "step": 19050 }, { "epoch": 6.354236157438292, "ref_ce_loss": 0.0504218228161335, "step": 19050 }, { "epoch": 6.354236157438292, "loss": 0.5254677534103394, "step": 19050 }, { "ce_loss": 0.07183732837438583, "epoch": 6.354236157438292, "step": 19050 }, { "distill_loss": 0.17535251379013062, "epoch": 6.354236157438292, "step": 19050 }, { "epoch": 6.354236157438292, "ref_ce_loss": 0.05224858969449997, "step": 19050 }, { "epoch": 6.357571714476317, "loss": 0.342, "step": 19060 }, { "epoch": 6.357571714476317, "grad_norm": 2.714850664138794, "step": 19060 }, { "epoch": 6.357571714476317, "learning_rate": 3.195708035124485e-05, "step": 19060 }, { "epoch": 6.357571714476317, "loss": 0.28869467973709106, "step": 19060 }, { "ce_loss": 0.009162982925772667, "epoch": 6.357571714476317, "step": 19060 }, { "distill_loss": 0.15321744978427887, "epoch": 6.357571714476317, "step": 19060 }, { "epoch": 6.357571714476317, "ref_ce_loss": 0.04659789428114891, "step": 19060 }, { "epoch": 6.357571714476317, "loss": 0.4857400059700012, "step": 19060 }, { "ce_loss": 0.0692988932132721, "epoch": 6.357571714476317, "step": 19060 }, { "distill_loss": 0.16876783967018127, "epoch": 6.357571714476317, "step": 19060 }, { "epoch": 6.357571714476317, "ref_ce_loss": 0.07196694612503052, "step": 19060 }, { "epoch": 6.357571714476317, "loss": 0.2311512678861618, "step": 19060 }, { "ce_loss": 0.002413057256489992, "epoch": 6.357571714476317, "step": 19060 }, { "distill_loss": 0.1793450266122818, "epoch": 6.357571714476317, "step": 19060 }, { "epoch": 6.357571714476317, "ref_ce_loss": 0.04919705539941788, "step": 19060 }, { "epoch": 6.357571714476317, "loss": 0.4767001271247864, "step": 19060 }, { "ce_loss": 0.0869223102927208, "epoch": 6.357571714476317, "step": 19060 }, { "distill_loss": 0.2395099699497223, "epoch": 6.357571714476317, "step": 19060 }, { "epoch": 6.357571714476317, "ref_ce_loss": 0.06657543778419495, "step": 19060 }, { "epoch": 6.360907271514343, "loss": 0.328, "step": 19070 }, { "epoch": 6.360907271514343, "grad_norm": 2.230041265487671, "step": 19070 }, { "epoch": 6.360907271514343, "learning_rate": 3.183220494747591e-05, "step": 19070 }, { "epoch": 6.360907271514343, "loss": 0.3135024607181549, "step": 19070 }, { "ce_loss": 0.05656009167432785, "epoch": 6.360907271514343, "step": 19070 }, { "distill_loss": 0.1409039944410324, "epoch": 6.360907271514343, "step": 19070 }, { "epoch": 6.360907271514343, "ref_ce_loss": 0.08445709198713303, "step": 19070 }, { "epoch": 6.360907271514343, "loss": 0.3105989992618561, "step": 19070 }, { "ce_loss": 0.055232059210538864, "epoch": 6.360907271514343, "step": 19070 }, { "distill_loss": 0.17802393436431885, "epoch": 6.360907271514343, "step": 19070 }, { "epoch": 6.360907271514343, "ref_ce_loss": 0.058957234025001526, "step": 19070 }, { "epoch": 6.360907271514343, "loss": 0.2731439471244812, "step": 19070 }, { "ce_loss": 0.02287127636373043, "epoch": 6.360907271514343, "step": 19070 }, { "distill_loss": 0.1520075649023056, "epoch": 6.360907271514343, "step": 19070 }, { "epoch": 6.360907271514343, "ref_ce_loss": 0.05730840191245079, "step": 19070 }, { "epoch": 6.360907271514343, "loss": 0.21526774764060974, "step": 19070 }, { "ce_loss": 0.015572651289403439, "epoch": 6.360907271514343, "step": 19070 }, { "distill_loss": 0.10528266429901123, "epoch": 6.360907271514343, "step": 19070 }, { "epoch": 6.360907271514343, "ref_ce_loss": 0.0473422110080719, "step": 19070 }, { "epoch": 6.364242828552368, "loss": 0.3333, "step": 19080 }, { "epoch": 6.364242828552368, "grad_norm": 2.177327871322632, "step": 19080 }, { "epoch": 6.364242828552368, "learning_rate": 3.170754503515176e-05, "step": 19080 }, { "epoch": 6.364242828552368, "loss": 0.21960368752479553, "step": 19080 }, { "ce_loss": 0.004566080868244171, "epoch": 6.364242828552368, "step": 19080 }, { "distill_loss": 0.12495573610067368, "epoch": 6.364242828552368, "step": 19080 }, { "epoch": 6.364242828552368, "ref_ce_loss": 0.051851414144039154, "step": 19080 }, { "epoch": 6.364242828552368, "loss": 0.23354806005954742, "step": 19080 }, { "ce_loss": 0.030264711007475853, "epoch": 6.364242828552368, "step": 19080 }, { "distill_loss": 0.09896160662174225, "epoch": 6.364242828552368, "step": 19080 }, { "epoch": 6.364242828552368, "ref_ce_loss": 0.05575990676879883, "step": 19080 }, { "epoch": 6.364242828552368, "loss": 0.353571355342865, "step": 19080 }, { "ce_loss": 0.10856593400239944, "epoch": 6.364242828552368, "step": 19080 }, { "distill_loss": 0.1839468628168106, "epoch": 6.364242828552368, "step": 19080 }, { "epoch": 6.364242828552368, "ref_ce_loss": 0.060834918171167374, "step": 19080 }, { "epoch": 6.364242828552368, "loss": 0.3653889298439026, "step": 19080 }, { "ce_loss": 0.1770656853914261, "epoch": 6.364242828552368, "step": 19080 }, { "distill_loss": 0.09319852292537689, "epoch": 6.364242828552368, "step": 19080 }, { "epoch": 6.364242828552368, "ref_ce_loss": 0.07829434424638748, "step": 19080 }, { "epoch": 6.367578385590393, "loss": 0.3147, "step": 19090 }, { "epoch": 6.367578385590393, "grad_norm": 2.2435762882232666, "step": 19090 }, { "epoch": 6.367578385590393, "learning_rate": 3.15831008416029e-05, "step": 19090 }, { "epoch": 6.367578385590393, "loss": 0.1861688494682312, "step": 19090 }, { "ce_loss": 0.04676009714603424, "epoch": 6.367578385590393, "step": 19090 }, { "distill_loss": 0.09529206156730652, "epoch": 6.367578385590393, "step": 19090 }, { "epoch": 6.367578385590393, "ref_ce_loss": 0.04402006417512894, "step": 19090 }, { "epoch": 6.367578385590393, "loss": 0.2757784426212311, "step": 19090 }, { "ce_loss": 0.027549242600798607, "epoch": 6.367578385590393, "step": 19090 }, { "distill_loss": 0.1115616038441658, "epoch": 6.367578385590393, "step": 19090 }, { "epoch": 6.367578385590393, "ref_ce_loss": 0.0614810436964035, "step": 19090 }, { "epoch": 6.367578385590393, "loss": 0.2042546570301056, "step": 19090 }, { "ce_loss": 0.008537779562175274, "epoch": 6.367578385590393, "step": 19090 }, { "distill_loss": 0.09340881556272507, "epoch": 6.367578385590393, "step": 19090 }, { "epoch": 6.367578385590393, "ref_ce_loss": 0.03826262801885605, "step": 19090 }, { "epoch": 6.367578385590393, "loss": 0.27964353561401367, "step": 19090 }, { "ce_loss": 0.00564216123893857, "epoch": 6.367578385590393, "step": 19090 }, { "distill_loss": 0.12422443181276321, "epoch": 6.367578385590393, "step": 19090 }, { "epoch": 6.367578385590393, "ref_ce_loss": 0.03275144472718239, "step": 19090 }, { "epoch": 6.370913942628419, "loss": 0.3159, "step": 19100 }, { "epoch": 6.370913942628419, "grad_norm": 2.6322593688964844, "step": 19100 }, { "epoch": 6.370913942628419, "learning_rate": 3.1458872593766445e-05, "step": 19100 }, { "epoch": 6.370913942628419, "loss": 0.2862546741962433, "step": 19100 }, { "ce_loss": 0.02663441374897957, "epoch": 6.370913942628419, "step": 19100 }, { "distill_loss": 0.14039082825183868, "epoch": 6.370913942628419, "step": 19100 }, { "epoch": 6.370913942628419, "ref_ce_loss": 0.053199347108602524, "step": 19100 }, { "epoch": 6.370913942628419, "loss": 0.3178987503051758, "step": 19100 }, { "ce_loss": 0.04347086697816849, "epoch": 6.370913942628419, "step": 19100 }, { "distill_loss": 0.20151656866073608, "epoch": 6.370913942628419, "step": 19100 }, { "epoch": 6.370913942628419, "ref_ce_loss": 0.07276992499828339, "step": 19100 }, { "epoch": 6.370913942628419, "loss": 0.32933807373046875, "step": 19100 }, { "ce_loss": 0.07511289417743683, "epoch": 6.370913942628419, "step": 19100 }, { "distill_loss": 0.16823294758796692, "epoch": 6.370913942628419, "step": 19100 }, { "epoch": 6.370913942628419, "ref_ce_loss": 0.08579640090465546, "step": 19100 }, { "epoch": 6.370913942628419, "loss": 0.3977130353450775, "step": 19100 }, { "ce_loss": 0.013505751267075539, "epoch": 6.370913942628419, "step": 19100 }, { "distill_loss": 0.25070807337760925, "epoch": 6.370913942628419, "step": 19100 }, { "epoch": 6.370913942628419, "ref_ce_loss": 0.0833851620554924, "step": 19100 }, { "epoch": 6.374249499666444, "loss": 0.3429, "step": 19110 }, { "epoch": 6.374249499666444, "grad_norm": 3.5291786193847656, "step": 19110 }, { "epoch": 6.374249499666444, "learning_rate": 3.133486051818576e-05, "step": 19110 }, { "epoch": 6.374249499666444, "loss": 0.3006848692893982, "step": 19110 }, { "ce_loss": 0.07178663462400436, "epoch": 6.374249499666444, "step": 19110 }, { "distill_loss": 0.16004428267478943, "epoch": 6.374249499666444, "step": 19110 }, { "epoch": 6.374249499666444, "ref_ce_loss": 0.05003441497683525, "step": 19110 }, { "epoch": 6.374249499666444, "loss": 0.37611380219459534, "step": 19110 }, { "ce_loss": 0.06637578457593918, "epoch": 6.374249499666444, "step": 19110 }, { "distill_loss": 0.1888526827096939, "epoch": 6.374249499666444, "step": 19110 }, { "epoch": 6.374249499666444, "ref_ce_loss": 0.07265367358922958, "step": 19110 }, { "epoch": 6.374249499666444, "loss": 0.31075137853622437, "step": 19110 }, { "ce_loss": 0.048460204154253006, "epoch": 6.374249499666444, "step": 19110 }, { "distill_loss": 0.1777610331773758, "epoch": 6.374249499666444, "step": 19110 }, { "epoch": 6.374249499666444, "ref_ce_loss": 0.06488906592130661, "step": 19110 }, { "epoch": 6.374249499666444, "loss": 0.2050069272518158, "step": 19110 }, { "ce_loss": 0.012430096045136452, "epoch": 6.374249499666444, "step": 19110 }, { "distill_loss": 0.11187323927879333, "epoch": 6.374249499666444, "step": 19110 }, { "epoch": 6.374249499666444, "ref_ce_loss": 0.061666443943977356, "step": 19110 }, { "epoch": 6.377585056704469, "loss": 0.3316, "step": 19120 }, { "epoch": 6.377585056704469, "grad_norm": 2.4090237617492676, "step": 19120 }, { "epoch": 6.377585056704469, "learning_rate": 3.121106484100988e-05, "step": 19120 }, { "epoch": 6.377585056704469, "loss": 0.44150298833847046, "step": 19120 }, { "ce_loss": 0.1467040777206421, "epoch": 6.377585056704469, "step": 19120 }, { "distill_loss": 0.17174892127513885, "epoch": 6.377585056704469, "step": 19120 }, { "epoch": 6.377585056704469, "ref_ce_loss": 0.1003832221031189, "step": 19120 }, { "epoch": 6.377585056704469, "loss": 0.22515568137168884, "step": 19120 }, { "ce_loss": 0.017367742955684662, "epoch": 6.377585056704469, "step": 19120 }, { "distill_loss": 0.15258149802684784, "epoch": 6.377585056704469, "step": 19120 }, { "epoch": 6.377585056704469, "ref_ce_loss": 0.05509953573346138, "step": 19120 }, { "epoch": 6.377585056704469, "loss": 0.4535076320171356, "step": 19120 }, { "ce_loss": 0.03954509273171425, "epoch": 6.377585056704469, "step": 19120 }, { "distill_loss": 0.2209874838590622, "epoch": 6.377585056704469, "step": 19120 }, { "epoch": 6.377585056704469, "ref_ce_loss": 0.10431575775146484, "step": 19120 }, { "epoch": 6.377585056704469, "loss": 0.2742467522621155, "step": 19120 }, { "ce_loss": 0.029925430193543434, "epoch": 6.377585056704469, "step": 19120 }, { "distill_loss": 0.1687852144241333, "epoch": 6.377585056704469, "step": 19120 }, { "epoch": 6.377585056704469, "ref_ce_loss": 0.04623578116297722, "step": 19120 }, { "epoch": 6.380920613742495, "loss": 0.3146, "step": 19130 }, { "epoch": 6.380920613742495, "grad_norm": 2.218641996383667, "step": 19130 }, { "epoch": 6.380920613742495, "learning_rate": 3.1087485787993364e-05, "step": 19130 }, { "epoch": 6.380920613742495, "loss": 0.4525134563446045, "step": 19130 }, { "ce_loss": 0.06234154850244522, "epoch": 6.380920613742495, "step": 19130 }, { "distill_loss": 0.21077755093574524, "epoch": 6.380920613742495, "step": 19130 }, { "epoch": 6.380920613742495, "ref_ce_loss": 0.04453861340880394, "step": 19130 }, { "epoch": 6.380920613742495, "loss": 0.5359585285186768, "step": 19130 }, { "ce_loss": 0.05334095284342766, "epoch": 6.380920613742495, "step": 19130 }, { "distill_loss": 0.15786990523338318, "epoch": 6.380920613742495, "step": 19130 }, { "epoch": 6.380920613742495, "ref_ce_loss": 0.06534440070390701, "step": 19130 }, { "epoch": 6.380920613742495, "loss": 0.2719331383705139, "step": 19130 }, { "ce_loss": 0.03220943361520767, "epoch": 6.380920613742495, "step": 19130 }, { "distill_loss": 0.15897700190544128, "epoch": 6.380920613742495, "step": 19130 }, { "epoch": 6.380920613742495, "ref_ce_loss": 0.059988368302583694, "step": 19130 }, { "epoch": 6.380920613742495, "loss": 0.31654953956604004, "step": 19130 }, { "ce_loss": 0.07118566334247589, "epoch": 6.380920613742495, "step": 19130 }, { "distill_loss": 0.1746058613061905, "epoch": 6.380920613742495, "step": 19130 }, { "epoch": 6.380920613742495, "ref_ce_loss": 0.06019550561904907, "step": 19130 }, { "epoch": 6.38425617078052, "loss": 0.3493, "step": 19140 }, { "epoch": 6.38425617078052, "grad_norm": 2.842442750930786, "step": 19140 }, { "epoch": 6.38425617078052, "learning_rate": 3.096412358449551e-05, "step": 19140 }, { "epoch": 6.38425617078052, "loss": 0.3088509440422058, "step": 19140 }, { "ce_loss": 0.013124763034284115, "epoch": 6.38425617078052, "step": 19140 }, { "distill_loss": 0.20202720165252686, "epoch": 6.38425617078052, "step": 19140 }, { "epoch": 6.38425617078052, "ref_ce_loss": 0.041209910064935684, "step": 19140 }, { "epoch": 6.38425617078052, "loss": 0.28975409269332886, "step": 19140 }, { "ce_loss": 0.0419902503490448, "epoch": 6.38425617078052, "step": 19140 }, { "distill_loss": 0.15306031703948975, "epoch": 6.38425617078052, "step": 19140 }, { "epoch": 6.38425617078052, "ref_ce_loss": 0.04372680187225342, "step": 19140 }, { "epoch": 6.38425617078052, "loss": 0.264797568321228, "step": 19140 }, { "ce_loss": 0.009839768521487713, "epoch": 6.38425617078052, "step": 19140 }, { "distill_loss": 0.1636621654033661, "epoch": 6.38425617078052, "step": 19140 }, { "epoch": 6.38425617078052, "ref_ce_loss": 0.046388011425733566, "step": 19140 }, { "epoch": 6.38425617078052, "loss": 0.45161283016204834, "step": 19140 }, { "ce_loss": 0.033660754561424255, "epoch": 6.38425617078052, "step": 19140 }, { "distill_loss": 0.30521446466445923, "epoch": 6.38425617078052, "step": 19140 }, { "epoch": 6.38425617078052, "ref_ce_loss": 0.07519791275262833, "step": 19140 }, { "epoch": 6.3875917278185455, "loss": 0.3443, "step": 19150 }, { "epoch": 6.3875917278185455, "grad_norm": 2.7326242923736572, "step": 19150 }, { "epoch": 6.3875917278185455, "learning_rate": 3.0840978455480466e-05, "step": 19150 }, { "epoch": 6.3875917278185455, "loss": 0.23642776906490326, "step": 19150 }, { "ce_loss": 0.02581688016653061, "epoch": 6.3875917278185455, "step": 19150 }, { "distill_loss": 0.1117728054523468, "epoch": 6.3875917278185455, "step": 19150 }, { "epoch": 6.3875917278185455, "ref_ce_loss": 0.028648825362324715, "step": 19150 }, { "epoch": 6.3875917278185455, "loss": 0.29809409379959106, "step": 19150 }, { "ce_loss": 0.050406236201524734, "epoch": 6.3875917278185455, "step": 19150 }, { "distill_loss": 0.17046400904655457, "epoch": 6.3875917278185455, "step": 19150 }, { "epoch": 6.3875917278185455, "ref_ce_loss": 0.05761267989873886, "step": 19150 }, { "epoch": 6.3875917278185455, "loss": 0.2136099636554718, "step": 19150 }, { "ce_loss": 0.008947102352976799, "epoch": 6.3875917278185455, "step": 19150 }, { "distill_loss": 0.16757823526859283, "epoch": 6.3875917278185455, "step": 19150 }, { "epoch": 6.3875917278185455, "ref_ce_loss": 0.036926738917827606, "step": 19150 }, { "epoch": 6.3875917278185455, "loss": 0.420158714056015, "step": 19150 }, { "ce_loss": 0.026515640318393707, "epoch": 6.3875917278185455, "step": 19150 }, { "distill_loss": 0.22909250855445862, "epoch": 6.3875917278185455, "step": 19150 }, { "epoch": 6.3875917278185455, "ref_ce_loss": 0.03809387609362602, "step": 19150 }, { "epoch": 6.390927284856571, "loss": 0.3354, "step": 19160 }, { "epoch": 6.390927284856571, "grad_norm": 1.891335368156433, "step": 19160 }, { "epoch": 6.390927284856571, "learning_rate": 3.071805062551638e-05, "step": 19160 }, { "epoch": 6.390927284856571, "loss": 0.6429652571678162, "step": 19160 }, { "ce_loss": 0.08628448098897934, "epoch": 6.390927284856571, "step": 19160 }, { "distill_loss": 0.21496467292308807, "epoch": 6.390927284856571, "step": 19160 }, { "epoch": 6.390927284856571, "ref_ce_loss": 0.0815964788198471, "step": 19160 }, { "epoch": 6.390927284856571, "loss": 0.3746568560600281, "step": 19160 }, { "ce_loss": 0.09126932173967361, "epoch": 6.390927284856571, "step": 19160 }, { "distill_loss": 0.1949469894170761, "epoch": 6.390927284856571, "step": 19160 }, { "epoch": 6.390927284856571, "ref_ce_loss": 0.0748150646686554, "step": 19160 }, { "epoch": 6.390927284856571, "loss": 0.18672306835651398, "step": 19160 }, { "ce_loss": 0.041305914521217346, "epoch": 6.390927284856571, "step": 19160 }, { "distill_loss": 0.09442579746246338, "epoch": 6.390927284856571, "step": 19160 }, { "epoch": 6.390927284856571, "ref_ce_loss": 0.05088605359196663, "step": 19160 }, { "epoch": 6.390927284856571, "loss": 0.43783873319625854, "step": 19160 }, { "ce_loss": 0.05301617830991745, "epoch": 6.390927284856571, "step": 19160 }, { "distill_loss": 0.18607942759990692, "epoch": 6.390927284856571, "step": 19160 }, { "epoch": 6.390927284856571, "ref_ce_loss": 0.07295983284711838, "step": 19160 }, { "epoch": 6.394262841894596, "loss": 0.3414, "step": 19170 }, { "epoch": 6.394262841894596, "grad_norm": 4.8256096839904785, "step": 19170 }, { "epoch": 6.394262841894596, "learning_rate": 3.0595340318775e-05, "step": 19170 }, { "epoch": 6.394262841894596, "loss": 0.3920935094356537, "step": 19170 }, { "ce_loss": 0.0550379641354084, "epoch": 6.394262841894596, "step": 19170 }, { "distill_loss": 0.22485697269439697, "epoch": 6.394262841894596, "step": 19170 }, { "epoch": 6.394262841894596, "ref_ce_loss": 0.07897008210420609, "step": 19170 }, { "epoch": 6.394262841894596, "loss": 0.2931138873100281, "step": 19170 }, { "ce_loss": 0.023997221142053604, "epoch": 6.394262841894596, "step": 19170 }, { "distill_loss": 0.15474891662597656, "epoch": 6.394262841894596, "step": 19170 }, { "epoch": 6.394262841894596, "ref_ce_loss": 0.04679024592041969, "step": 19170 }, { "epoch": 6.394262841894596, "loss": 0.5692634582519531, "step": 19170 }, { "ce_loss": 0.07035572826862335, "epoch": 6.394262841894596, "step": 19170 }, { "distill_loss": 0.14807996153831482, "epoch": 6.394262841894596, "step": 19170 }, { "epoch": 6.394262841894596, "ref_ce_loss": 0.0663730576634407, "step": 19170 }, { "epoch": 6.394262841894596, "loss": 0.34553074836730957, "step": 19170 }, { "ce_loss": 0.04271988570690155, "epoch": 6.394262841894596, "step": 19170 }, { "distill_loss": 0.09763485938310623, "epoch": 6.394262841894596, "step": 19170 }, { "epoch": 6.394262841894596, "ref_ce_loss": 0.07433194667100906, "step": 19170 }, { "epoch": 6.3975983989326215, "loss": 0.3648, "step": 19180 }, { "epoch": 6.3975983989326215, "grad_norm": 2.8879714012145996, "step": 19180 }, { "epoch": 6.3975983989326215, "learning_rate": 3.0472847759031644e-05, "step": 19180 }, { "epoch": 6.3975983989326215, "loss": 0.3636305630207062, "step": 19180 }, { "ce_loss": 0.028095953166484833, "epoch": 6.3975983989326215, "step": 19180 }, { "distill_loss": 0.23860491812229156, "epoch": 6.3975983989326215, "step": 19180 }, { "epoch": 6.3975983989326215, "ref_ce_loss": 0.05020011588931084, "step": 19180 }, { "epoch": 6.3975983989326215, "loss": 0.6124520301818848, "step": 19180 }, { "ce_loss": 0.0461311973631382, "epoch": 6.3975983989326215, "step": 19180 }, { "distill_loss": 0.16430391371250153, "epoch": 6.3975983989326215, "step": 19180 }, { "epoch": 6.3975983989326215, "ref_ce_loss": 0.059981171041727066, "step": 19180 }, { "epoch": 6.3975983989326215, "loss": 0.21800105273723602, "step": 19180 }, { "ce_loss": 0.02115524932742119, "epoch": 6.3975983989326215, "step": 19180 }, { "distill_loss": 0.12055207043886185, "epoch": 6.3975983989326215, "step": 19180 }, { "epoch": 6.3975983989326215, "ref_ce_loss": 0.05713411420583725, "step": 19180 }, { "epoch": 6.3975983989326215, "loss": 0.16409829258918762, "step": 19180 }, { "ce_loss": 0.011136075481772423, "epoch": 6.3975983989326215, "step": 19180 }, { "distill_loss": 0.09124863147735596, "epoch": 6.3975983989326215, "step": 19180 }, { "epoch": 6.3975983989326215, "ref_ce_loss": 0.02289462462067604, "step": 19180 }, { "epoch": 6.400933955970647, "loss": 0.2937, "step": 19190 }, { "epoch": 6.400933955970647, "grad_norm": 3.5310378074645996, "step": 19190 }, { "epoch": 6.400933955970647, "learning_rate": 3.03505731696643e-05, "step": 19190 }, { "epoch": 6.400933955970647, "loss": 0.2121621072292328, "step": 19190 }, { "ce_loss": 0.023766357451677322, "epoch": 6.400933955970647, "step": 19190 }, { "distill_loss": 0.09208349883556366, "epoch": 6.400933955970647, "step": 19190 }, { "epoch": 6.400933955970647, "ref_ce_loss": 0.042189568281173706, "step": 19190 }, { "epoch": 6.400933955970647, "loss": 0.6151399612426758, "step": 19190 }, { "ce_loss": 0.0640786662697792, "epoch": 6.400933955970647, "step": 19190 }, { "distill_loss": 0.13085195422172546, "epoch": 6.400933955970647, "step": 19190 }, { "epoch": 6.400933955970647, "ref_ce_loss": 0.04363051429390907, "step": 19190 }, { "epoch": 6.400933955970647, "loss": 0.2088690996170044, "step": 19190 }, { "ce_loss": 0.02814677730202675, "epoch": 6.400933955970647, "step": 19190 }, { "distill_loss": 0.11532458662986755, "epoch": 6.400933955970647, "step": 19190 }, { "epoch": 6.400933955970647, "ref_ce_loss": 0.06498730182647705, "step": 19190 }, { "epoch": 6.400933955970647, "loss": 0.2948390543460846, "step": 19190 }, { "ce_loss": 0.05973248556256294, "epoch": 6.400933955970647, "step": 19190 }, { "distill_loss": 0.14913040399551392, "epoch": 6.400933955970647, "step": 19190 }, { "epoch": 6.400933955970647, "ref_ce_loss": 0.0471370592713356, "step": 19190 }, { "epoch": 6.404269513008672, "loss": 0.3206, "step": 19200 }, { "epoch": 6.404269513008672, "grad_norm": 2.9538049697875977, "step": 19200 }, { "epoch": 6.404269513008672, "learning_rate": 3.0228516773653623e-05, "step": 19200 }, { "epoch": 6.404269513008672, "loss": 0.2782128155231476, "step": 19200 }, { "ce_loss": 0.033618733286857605, "epoch": 6.404269513008672, "step": 19200 }, { "distill_loss": 0.13662630319595337, "epoch": 6.404269513008672, "step": 19200 }, { "epoch": 6.404269513008672, "ref_ce_loss": 0.0756227895617485, "step": 19200 }, { "epoch": 6.404269513008672, "loss": 0.2780166566371918, "step": 19200 }, { "ce_loss": 0.036082640290260315, "epoch": 6.404269513008672, "step": 19200 }, { "distill_loss": 0.13568821549415588, "epoch": 6.404269513008672, "step": 19200 }, { "epoch": 6.404269513008672, "ref_ce_loss": 0.050824616104364395, "step": 19200 }, { "epoch": 6.404269513008672, "loss": 0.3180280327796936, "step": 19200 }, { "ce_loss": 0.022766657173633575, "epoch": 6.404269513008672, "step": 19200 }, { "distill_loss": 0.18709643185138702, "epoch": 6.404269513008672, "step": 19200 }, { "epoch": 6.404269513008672, "ref_ce_loss": 0.047577161341905594, "step": 19200 }, { "epoch": 6.404269513008672, "loss": 0.29317015409469604, "step": 19200 }, { "ce_loss": 0.02827540971338749, "epoch": 6.404269513008672, "step": 19200 }, { "distill_loss": 0.12621057033538818, "epoch": 6.404269513008672, "step": 19200 }, { "epoch": 6.404269513008672, "ref_ce_loss": 0.06611227989196777, "step": 19200 }, { "epoch": 6.4076050700466975, "loss": 0.3585, "step": 19210 }, { "epoch": 6.4076050700466975, "grad_norm": 2.182614803314209, "step": 19210 }, { "epoch": 6.4076050700466975, "learning_rate": 3.0106678793582428e-05, "step": 19210 }, { "epoch": 6.4076050700466975, "loss": 0.3438469171524048, "step": 19210 }, { "ce_loss": 0.03777853399515152, "epoch": 6.4076050700466975, "step": 19210 }, { "distill_loss": 0.13100846111774445, "epoch": 6.4076050700466975, "step": 19210 }, { "epoch": 6.4076050700466975, "ref_ce_loss": 0.06402308493852615, "step": 19210 }, { "epoch": 6.4076050700466975, "loss": 0.15743282437324524, "step": 19210 }, { "ce_loss": 0.01302177831530571, "epoch": 6.4076050700466975, "step": 19210 }, { "distill_loss": 0.07181822508573532, "epoch": 6.4076050700466975, "step": 19210 }, { "epoch": 6.4076050700466975, "ref_ce_loss": 0.03954114392399788, "step": 19210 }, { "epoch": 6.4076050700466975, "loss": 0.3400402367115021, "step": 19210 }, { "ce_loss": 0.037813831120729446, "epoch": 6.4076050700466975, "step": 19210 }, { "distill_loss": 0.18940778076648712, "epoch": 6.4076050700466975, "step": 19210 }, { "epoch": 6.4076050700466975, "ref_ce_loss": 0.08064355701208115, "step": 19210 }, { "epoch": 6.4076050700466975, "loss": 0.3088165819644928, "step": 19210 }, { "ce_loss": 0.0556054562330246, "epoch": 6.4076050700466975, "step": 19210 }, { "distill_loss": 0.1560615450143814, "epoch": 6.4076050700466975, "step": 19210 }, { "epoch": 6.4076050700466975, "ref_ce_loss": 0.06648774445056915, "step": 19210 }, { "epoch": 6.410940627084723, "loss": 0.3273, "step": 19220 }, { "epoch": 6.410940627084723, "grad_norm": 4.7798004150390625, "step": 19220 }, { "epoch": 6.410940627084723, "learning_rate": 2.9985059451635023e-05, "step": 19220 }, { "epoch": 6.410940627084723, "loss": 0.35199153423309326, "step": 19220 }, { "ce_loss": 0.07559584826231003, "epoch": 6.410940627084723, "step": 19220 }, { "distill_loss": 0.12564608454704285, "epoch": 6.410940627084723, "step": 19220 }, { "epoch": 6.410940627084723, "ref_ce_loss": 0.07979488372802734, "step": 19220 }, { "epoch": 6.410940627084723, "loss": 0.5481826066970825, "step": 19220 }, { "ce_loss": 0.07940838485956192, "epoch": 6.410940627084723, "step": 19220 }, { "distill_loss": 0.25284120440483093, "epoch": 6.410940627084723, "step": 19220 }, { "epoch": 6.410940627084723, "ref_ce_loss": 0.09138768911361694, "step": 19220 }, { "epoch": 6.410940627084723, "loss": 0.3082050085067749, "step": 19220 }, { "ce_loss": 0.05846491456031799, "epoch": 6.410940627084723, "step": 19220 }, { "distill_loss": 0.11728785932064056, "epoch": 6.410940627084723, "step": 19220 }, { "epoch": 6.410940627084723, "ref_ce_loss": 0.06966432929039001, "step": 19220 }, { "epoch": 6.410940627084723, "loss": 0.2505358159542084, "step": 19220 }, { "ce_loss": 0.05416925996541977, "epoch": 6.410940627084723, "step": 19220 }, { "distill_loss": 0.12237006425857544, "epoch": 6.410940627084723, "step": 19220 }, { "epoch": 6.410940627084723, "ref_ce_loss": 0.043212343007326126, "step": 19220 }, { "epoch": 6.414276184122748, "loss": 0.3379, "step": 19230 }, { "epoch": 6.414276184122748, "grad_norm": 2.7173337936401367, "step": 19230 }, { "epoch": 6.414276184122748, "learning_rate": 2.986365896959715e-05, "step": 19230 }, { "epoch": 6.414276184122748, "loss": 0.5793491005897522, "step": 19230 }, { "ce_loss": 0.0702369213104248, "epoch": 6.414276184122748, "step": 19230 }, { "distill_loss": 0.20983470976352692, "epoch": 6.414276184122748, "step": 19230 }, { "epoch": 6.414276184122748, "ref_ce_loss": 0.09685465693473816, "step": 19230 }, { "epoch": 6.414276184122748, "loss": 0.26481112837791443, "step": 19230 }, { "ce_loss": 0.043139562010765076, "epoch": 6.414276184122748, "step": 19230 }, { "distill_loss": 0.14797475934028625, "epoch": 6.414276184122748, "step": 19230 }, { "epoch": 6.414276184122748, "ref_ce_loss": 0.055984094738960266, "step": 19230 }, { "epoch": 6.414276184122748, "loss": 0.33620429039001465, "step": 19230 }, { "ce_loss": 0.024415483698248863, "epoch": 6.414276184122748, "step": 19230 }, { "distill_loss": 0.18126405775547028, "epoch": 6.414276184122748, "step": 19230 }, { "epoch": 6.414276184122748, "ref_ce_loss": 0.09445316344499588, "step": 19230 }, { "epoch": 6.414276184122748, "loss": 0.20489488542079926, "step": 19230 }, { "ce_loss": 0.011756706051528454, "epoch": 6.414276184122748, "step": 19230 }, { "distill_loss": 0.08259446918964386, "epoch": 6.414276184122748, "step": 19230 }, { "epoch": 6.414276184122748, "ref_ce_loss": 0.045302245765924454, "step": 19230 }, { "epoch": 6.417611741160774, "loss": 0.349, "step": 19240 }, { "epoch": 6.417611741160774, "grad_norm": 2.606694459915161, "step": 19240 }, { "epoch": 6.417611741160774, "learning_rate": 2.9742477568855427e-05, "step": 19240 }, { "epoch": 6.417611741160774, "loss": 0.33460116386413574, "step": 19240 }, { "ce_loss": 0.019410649314522743, "epoch": 6.417611741160774, "step": 19240 }, { "distill_loss": 0.11360645294189453, "epoch": 6.417611741160774, "step": 19240 }, { "epoch": 6.417611741160774, "ref_ce_loss": 0.059435781091451645, "step": 19240 }, { "epoch": 6.417611741160774, "loss": 0.2454586774110794, "step": 19240 }, { "ce_loss": 0.03153756633400917, "epoch": 6.417611741160774, "step": 19240 }, { "distill_loss": 0.142329141497612, "epoch": 6.417611741160774, "step": 19240 }, { "epoch": 6.417611741160774, "ref_ce_loss": 0.0708671510219574, "step": 19240 }, { "epoch": 6.417611741160774, "loss": 0.23517194390296936, "step": 19240 }, { "ce_loss": 0.013855576515197754, "epoch": 6.417611741160774, "step": 19240 }, { "distill_loss": 0.08239498734474182, "epoch": 6.417611741160774, "step": 19240 }, { "epoch": 6.417611741160774, "ref_ce_loss": 0.03543057292699814, "step": 19240 }, { "epoch": 6.417611741160774, "loss": 0.4140257239341736, "step": 19240 }, { "ce_loss": 0.05405563861131668, "epoch": 6.417611741160774, "step": 19240 }, { "distill_loss": 0.23824673891067505, "epoch": 6.417611741160774, "step": 19240 }, { "epoch": 6.417611741160774, "ref_ce_loss": 0.051883019506931305, "step": 19240 }, { "epoch": 6.420947298198799, "loss": 0.3119, "step": 19250 }, { "epoch": 6.420947298198799, "grad_norm": 3.075127601623535, "step": 19250 }, { "epoch": 6.420947298198799, "learning_rate": 2.9621515470396873e-05, "step": 19250 }, { "epoch": 6.420947298198799, "loss": 0.38381344079971313, "step": 19250 }, { "ce_loss": 0.02055729180574417, "epoch": 6.420947298198799, "step": 19250 }, { "distill_loss": 0.1190321072936058, "epoch": 6.420947298198799, "step": 19250 }, { "epoch": 6.420947298198799, "ref_ce_loss": 0.04908378794789314, "step": 19250 }, { "epoch": 6.420947298198799, "loss": 0.30875587463378906, "step": 19250 }, { "ce_loss": 0.00966921728104353, "epoch": 6.420947298198799, "step": 19250 }, { "distill_loss": 0.11165569722652435, "epoch": 6.420947298198799, "step": 19250 }, { "epoch": 6.420947298198799, "ref_ce_loss": 0.039333853870630264, "step": 19250 }, { "epoch": 6.420947298198799, "loss": 0.13892872631549835, "step": 19250 }, { "ce_loss": 0.01093357801437378, "epoch": 6.420947298198799, "step": 19250 }, { "distill_loss": 0.08393251895904541, "epoch": 6.420947298198799, "step": 19250 }, { "epoch": 6.420947298198799, "ref_ce_loss": 0.043848972767591476, "step": 19250 }, { "epoch": 6.420947298198799, "loss": 0.2550225257873535, "step": 19250 }, { "ce_loss": 0.03890252485871315, "epoch": 6.420947298198799, "step": 19250 }, { "distill_loss": 0.10823406279087067, "epoch": 6.420947298198799, "step": 19250 }, { "epoch": 6.420947298198799, "ref_ce_loss": 0.04668232426047325, "step": 19250 }, { "epoch": 6.424282855236824, "loss": 0.3383, "step": 19260 }, { "epoch": 6.424282855236824, "grad_norm": 2.459117889404297, "step": 19260 }, { "epoch": 6.424282855236824, "learning_rate": 2.950077289480865e-05, "step": 19260 }, { "epoch": 6.424282855236824, "loss": 0.19606022536754608, "step": 19260 }, { "ce_loss": 0.023685600608587265, "epoch": 6.424282855236824, "step": 19260 }, { "distill_loss": 0.10593249648809433, "epoch": 6.424282855236824, "step": 19260 }, { "epoch": 6.424282855236824, "ref_ce_loss": 0.04287222400307655, "step": 19260 }, { "epoch": 6.424282855236824, "loss": 0.23405814170837402, "step": 19260 }, { "ce_loss": 0.05165858939290047, "epoch": 6.424282855236824, "step": 19260 }, { "distill_loss": 0.10588884353637695, "epoch": 6.424282855236824, "step": 19260 }, { "epoch": 6.424282855236824, "ref_ce_loss": 0.05655386298894882, "step": 19260 }, { "epoch": 6.424282855236824, "loss": 0.34876590967178345, "step": 19260 }, { "ce_loss": 0.080736443400383, "epoch": 6.424282855236824, "step": 19260 }, { "distill_loss": 0.13183307647705078, "epoch": 6.424282855236824, "step": 19260 }, { "epoch": 6.424282855236824, "ref_ce_loss": 0.0751199871301651, "step": 19260 }, { "epoch": 6.424282855236824, "loss": 0.30815309286117554, "step": 19260 }, { "ce_loss": 0.05185553804039955, "epoch": 6.424282855236824, "step": 19260 }, { "distill_loss": 0.1449497938156128, "epoch": 6.424282855236824, "step": 19260 }, { "epoch": 6.424282855236824, "ref_ce_loss": 0.08047346770763397, "step": 19260 }, { "epoch": 6.42761841227485, "loss": 0.3324, "step": 19270 }, { "epoch": 6.42761841227485, "grad_norm": 3.1468026638031006, "step": 19270 }, { "epoch": 6.42761841227485, "learning_rate": 2.938025006227761e-05, "step": 19270 }, { "epoch": 6.42761841227485, "loss": 0.3166249394416809, "step": 19270 }, { "ce_loss": 0.043620720505714417, "epoch": 6.42761841227485, "step": 19270 }, { "distill_loss": 0.23102976381778717, "epoch": 6.42761841227485, "step": 19270 }, { "epoch": 6.42761841227485, "ref_ce_loss": 0.041840847581624985, "step": 19270 }, { "epoch": 6.42761841227485, "loss": 0.3161732852458954, "step": 19270 }, { "ce_loss": 0.0137014864012599, "epoch": 6.42761841227485, "step": 19270 }, { "distill_loss": 0.18281744420528412, "epoch": 6.42761841227485, "step": 19270 }, { "epoch": 6.42761841227485, "ref_ce_loss": 0.06067816540598869, "step": 19270 }, { "epoch": 6.42761841227485, "loss": 0.26537391543388367, "step": 19270 }, { "ce_loss": 0.044217947870492935, "epoch": 6.42761841227485, "step": 19270 }, { "distill_loss": 0.13335096836090088, "epoch": 6.42761841227485, "step": 19270 }, { "epoch": 6.42761841227485, "ref_ce_loss": 0.06993230432271957, "step": 19270 }, { "epoch": 6.42761841227485, "loss": 0.16318552196025848, "step": 19270 }, { "ce_loss": 0.010892470367252827, "epoch": 6.42761841227485, "step": 19270 }, { "distill_loss": 0.10559059679508209, "epoch": 6.42761841227485, "step": 19270 }, { "epoch": 6.42761841227485, "ref_ce_loss": 0.02419205941259861, "step": 19270 }, { "epoch": 6.430953969312875, "loss": 0.3088, "step": 19280 }, { "epoch": 6.430953969312875, "grad_norm": 2.171274185180664, "step": 19280 }, { "epoch": 6.430953969312875, "learning_rate": 2.9259947192589843e-05, "step": 19280 }, { "epoch": 6.430953969312875, "loss": 0.2121484875679016, "step": 19280 }, { "ce_loss": 0.042485639452934265, "epoch": 6.430953969312875, "step": 19280 }, { "distill_loss": 0.09462635964155197, "epoch": 6.430953969312875, "step": 19280 }, { "epoch": 6.430953969312875, "ref_ce_loss": 0.052952077239751816, "step": 19280 }, { "epoch": 6.430953969312875, "loss": 0.34515848755836487, "step": 19280 }, { "ce_loss": 0.08936353772878647, "epoch": 6.430953969312875, "step": 19280 }, { "distill_loss": 0.17028677463531494, "epoch": 6.430953969312875, "step": 19280 }, { "epoch": 6.430953969312875, "ref_ce_loss": 0.07250023633241653, "step": 19280 }, { "epoch": 6.430953969312875, "loss": 0.28915169835090637, "step": 19280 }, { "ce_loss": 0.04281798377633095, "epoch": 6.430953969312875, "step": 19280 }, { "distill_loss": 0.1258801370859146, "epoch": 6.430953969312875, "step": 19280 }, { "epoch": 6.430953969312875, "ref_ce_loss": 0.055673930794000626, "step": 19280 }, { "epoch": 6.430953969312875, "loss": 0.37340790033340454, "step": 19280 }, { "ce_loss": 0.07182245701551437, "epoch": 6.430953969312875, "step": 19280 }, { "distill_loss": 0.20456330478191376, "epoch": 6.430953969312875, "step": 19280 }, { "epoch": 6.430953969312875, "ref_ce_loss": 0.05647760257124901, "step": 19280 }, { "epoch": 6.4342895263509, "loss": 0.3328, "step": 19290 }, { "epoch": 6.4342895263509, "grad_norm": 2.09809947013855, "step": 19290 }, { "epoch": 6.4342895263509, "learning_rate": 2.913986450513036e-05, "step": 19290 }, { "epoch": 6.4342895263509, "loss": 0.5191903114318848, "step": 19290 }, { "ce_loss": 0.09405925869941711, "epoch": 6.4342895263509, "step": 19290 }, { "distill_loss": 0.32466500997543335, "epoch": 6.4342895263509, "step": 19290 }, { "epoch": 6.4342895263509, "ref_ce_loss": 0.07379074394702911, "step": 19290 }, { "epoch": 6.4342895263509, "loss": 0.3605400323867798, "step": 19290 }, { "ce_loss": 0.0426713228225708, "epoch": 6.4342895263509, "step": 19290 }, { "distill_loss": 0.08181039243936539, "epoch": 6.4342895263509, "step": 19290 }, { "epoch": 6.4342895263509, "ref_ce_loss": 0.06672769784927368, "step": 19290 }, { "epoch": 6.4342895263509, "loss": 0.2661738991737366, "step": 19290 }, { "ce_loss": 0.0049125500954687595, "epoch": 6.4342895263509, "step": 19290 }, { "distill_loss": 0.20455636084079742, "epoch": 6.4342895263509, "step": 19290 }, { "epoch": 6.4342895263509, "ref_ce_loss": 0.03672386705875397, "step": 19290 }, { "epoch": 6.4342895263509, "loss": 0.234100803732872, "step": 19290 }, { "ce_loss": 0.03455338999629021, "epoch": 6.4342895263509, "step": 19290 }, { "distill_loss": 0.12428376823663712, "epoch": 6.4342895263509, "step": 19290 }, { "epoch": 6.4342895263509, "ref_ce_loss": 0.05475219711661339, "step": 19290 }, { "epoch": 6.437625083388926, "loss": 0.3237, "step": 19300 }, { "epoch": 6.437625083388926, "grad_norm": 2.9320175647735596, "step": 19300 }, { "epoch": 6.437625083388926, "learning_rate": 2.902000221888256e-05, "step": 19300 }, { "epoch": 6.437625083388926, "loss": 0.29273325204849243, "step": 19300 }, { "ce_loss": 0.03256535902619362, "epoch": 6.437625083388926, "step": 19300 }, { "distill_loss": 0.151731476187706, "epoch": 6.437625083388926, "step": 19300 }, { "epoch": 6.437625083388926, "ref_ce_loss": 0.0654408186674118, "step": 19300 }, { "epoch": 6.437625083388926, "loss": 0.22853368520736694, "step": 19300 }, { "ce_loss": 0.05244693532586098, "epoch": 6.437625083388926, "step": 19300 }, { "distill_loss": 0.13937723636627197, "epoch": 6.437625083388926, "step": 19300 }, { "epoch": 6.437625083388926, "ref_ce_loss": 0.03658019378781319, "step": 19300 }, { "epoch": 6.437625083388926, "loss": 0.18904545903205872, "step": 19300 }, { "ce_loss": 0.021132905036211014, "epoch": 6.437625083388926, "step": 19300 }, { "distill_loss": 0.11984669417142868, "epoch": 6.437625083388926, "step": 19300 }, { "epoch": 6.437625083388926, "ref_ce_loss": 0.04028644412755966, "step": 19300 }, { "epoch": 6.437625083388926, "loss": 0.473442018032074, "step": 19300 }, { "ce_loss": 0.04387342929840088, "epoch": 6.437625083388926, "step": 19300 }, { "distill_loss": 0.21055030822753906, "epoch": 6.437625083388926, "step": 19300 }, { "epoch": 6.437625083388926, "ref_ce_loss": 0.0811781957745552, "step": 19300 }, { "epoch": 6.440960640426951, "loss": 0.3371, "step": 19310 }, { "epoch": 6.440960640426951, "grad_norm": 2.3369381427764893, "step": 19310 }, { "epoch": 6.440960640426951, "learning_rate": 2.890036055242801e-05, "step": 19310 }, { "epoch": 6.440960640426951, "loss": 0.25813591480255127, "step": 19310 }, { "ce_loss": 0.06117769703269005, "epoch": 6.440960640426951, "step": 19310 }, { "distill_loss": 0.134428933262825, "epoch": 6.440960640426951, "step": 19310 }, { "epoch": 6.440960640426951, "ref_ce_loss": 0.06232694536447525, "step": 19310 }, { "epoch": 6.440960640426951, "loss": 0.19962909817695618, "step": 19310 }, { "ce_loss": 0.023090876638889313, "epoch": 6.440960640426951, "step": 19310 }, { "distill_loss": 0.10459668934345245, "epoch": 6.440960640426951, "step": 19310 }, { "epoch": 6.440960640426951, "ref_ce_loss": 0.07150375843048096, "step": 19310 }, { "epoch": 6.440960640426951, "loss": 0.33547577261924744, "step": 19310 }, { "ce_loss": 0.05752252787351608, "epoch": 6.440960640426951, "step": 19310 }, { "distill_loss": 0.21133238077163696, "epoch": 6.440960640426951, "step": 19310 }, { "epoch": 6.440960640426951, "ref_ce_loss": 0.05103033781051636, "step": 19310 }, { "epoch": 6.440960640426951, "loss": 0.35321956872940063, "step": 19310 }, { "ce_loss": 0.04094832018017769, "epoch": 6.440960640426951, "step": 19310 }, { "distill_loss": 0.19313184916973114, "epoch": 6.440960640426951, "step": 19310 }, { "epoch": 6.440960640426951, "ref_ce_loss": 0.049679603427648544, "step": 19310 }, { "epoch": 6.444296197464976, "loss": 0.3082, "step": 19320 }, { "epoch": 6.444296197464976, "grad_norm": 3.1166648864746094, "step": 19320 }, { "epoch": 6.444296197464976, "learning_rate": 2.8780939723945884e-05, "step": 19320 }, { "epoch": 6.444296197464976, "loss": 0.4541691541671753, "step": 19320 }, { "ce_loss": 0.02630157396197319, "epoch": 6.444296197464976, "step": 19320 }, { "distill_loss": 0.23176150023937225, "epoch": 6.444296197464976, "step": 19320 }, { "epoch": 6.444296197464976, "ref_ce_loss": 0.08258750289678574, "step": 19320 }, { "epoch": 6.444296197464976, "loss": 0.20466068387031555, "step": 19320 }, { "ce_loss": 0.03209497034549713, "epoch": 6.444296197464976, "step": 19320 }, { "distill_loss": 0.10391910374164581, "epoch": 6.444296197464976, "step": 19320 }, { "epoch": 6.444296197464976, "ref_ce_loss": 0.04612681642174721, "step": 19320 }, { "epoch": 6.444296197464976, "loss": 0.34781479835510254, "step": 19320 }, { "ce_loss": 0.06227179616689682, "epoch": 6.444296197464976, "step": 19320 }, { "distill_loss": 0.16145853698253632, "epoch": 6.444296197464976, "step": 19320 }, { "epoch": 6.444296197464976, "ref_ce_loss": 0.055882956832647324, "step": 19320 }, { "epoch": 6.444296197464976, "loss": 0.2700772285461426, "step": 19320 }, { "ce_loss": 0.02936231903731823, "epoch": 6.444296197464976, "step": 19320 }, { "distill_loss": 0.14836882054805756, "epoch": 6.444296197464976, "step": 19320 }, { "epoch": 6.444296197464976, "ref_ce_loss": 0.05383678898215294, "step": 19320 }, { "epoch": 6.447631754503002, "loss": 0.3323, "step": 19330 }, { "epoch": 6.447631754503002, "grad_norm": 3.125654458999634, "step": 19330 }, { "epoch": 6.447631754503002, "learning_rate": 2.8661739951212698e-05, "step": 19330 }, { "epoch": 6.447631754503002, "loss": 0.4950518012046814, "step": 19330 }, { "ce_loss": 0.0657888725399971, "epoch": 6.447631754503002, "step": 19330 }, { "distill_loss": 0.11284708976745605, "epoch": 6.447631754503002, "step": 19330 }, { "epoch": 6.447631754503002, "ref_ce_loss": 0.0673019289970398, "step": 19330 }, { "epoch": 6.447631754503002, "loss": 0.2192465364933014, "step": 19330 }, { "ce_loss": 0.033647581934928894, "epoch": 6.447631754503002, "step": 19330 }, { "distill_loss": 0.14369481801986694, "epoch": 6.447631754503002, "step": 19330 }, { "epoch": 6.447631754503002, "ref_ce_loss": 0.041760560125112534, "step": 19330 }, { "epoch": 6.447631754503002, "loss": 0.20164823532104492, "step": 19330 }, { "ce_loss": 0.02778281643986702, "epoch": 6.447631754503002, "step": 19330 }, { "distill_loss": 0.12590767443180084, "epoch": 6.447631754503002, "step": 19330 }, { "epoch": 6.447631754503002, "ref_ce_loss": 0.047779470682144165, "step": 19330 }, { "epoch": 6.447631754503002, "loss": 0.2597501575946808, "step": 19330 }, { "ce_loss": 0.01448689866811037, "epoch": 6.447631754503002, "step": 19330 }, { "distill_loss": 0.14149931073188782, "epoch": 6.447631754503002, "step": 19330 }, { "epoch": 6.447631754503002, "ref_ce_loss": 0.061557453125715256, "step": 19330 }, { "epoch": 6.450967311541027, "loss": 0.3349, "step": 19340 }, { "epoch": 6.450967311541027, "grad_norm": 2.500840902328491, "step": 19340 }, { "epoch": 6.450967311541027, "learning_rate": 2.8542761451601837e-05, "step": 19340 }, { "epoch": 6.450967311541027, "loss": 0.2467081993818283, "step": 19340 }, { "ce_loss": 0.01818380504846573, "epoch": 6.450967311541027, "step": 19340 }, { "distill_loss": 0.16112403571605682, "epoch": 6.450967311541027, "step": 19340 }, { "epoch": 6.450967311541027, "ref_ce_loss": 0.06723609566688538, "step": 19340 }, { "epoch": 6.450967311541027, "loss": 0.4431634545326233, "step": 19340 }, { "ce_loss": 0.05379560962319374, "epoch": 6.450967311541027, "step": 19340 }, { "distill_loss": 0.19823703169822693, "epoch": 6.450967311541027, "step": 19340 }, { "epoch": 6.450967311541027, "ref_ce_loss": 0.07378227263689041, "step": 19340 }, { "epoch": 6.450967311541027, "loss": 0.7392683625221252, "step": 19340 }, { "ce_loss": 0.03509441763162613, "epoch": 6.450967311541027, "step": 19340 }, { "distill_loss": 0.17824314534664154, "epoch": 6.450967311541027, "step": 19340 }, { "epoch": 6.450967311541027, "ref_ce_loss": 0.06962034851312637, "step": 19340 }, { "epoch": 6.450967311541027, "loss": 0.6556627750396729, "step": 19340 }, { "ce_loss": 0.11523985862731934, "epoch": 6.450967311541027, "step": 19340 }, { "distill_loss": 0.17302967607975006, "epoch": 6.450967311541027, "step": 19340 }, { "epoch": 6.450967311541027, "ref_ce_loss": 0.06542285531759262, "step": 19340 }, { "epoch": 6.454302868579052, "loss": 0.3345, "step": 19350 }, { "epoch": 6.454302868579052, "grad_norm": 2.523237943649292, "step": 19350 }, { "epoch": 6.454302868579052, "learning_rate": 2.8424004442083075e-05, "step": 19350 }, { "epoch": 6.454302868579052, "loss": 0.42306017875671387, "step": 19350 }, { "ce_loss": 0.05187192186713219, "epoch": 6.454302868579052, "step": 19350 }, { "distill_loss": 0.16845226287841797, "epoch": 6.454302868579052, "step": 19350 }, { "epoch": 6.454302868579052, "ref_ce_loss": 0.06847023963928223, "step": 19350 }, { "epoch": 6.454302868579052, "loss": 0.2361203134059906, "step": 19350 }, { "ce_loss": 0.06244039162993431, "epoch": 6.454302868579052, "step": 19350 }, { "distill_loss": 0.1332026571035385, "epoch": 6.454302868579052, "step": 19350 }, { "epoch": 6.454302868579052, "ref_ce_loss": 0.040215637534856796, "step": 19350 }, { "epoch": 6.454302868579052, "loss": 0.2239590734243393, "step": 19350 }, { "ce_loss": 0.0467652902007103, "epoch": 6.454302868579052, "step": 19350 }, { "distill_loss": 0.09530991315841675, "epoch": 6.454302868579052, "step": 19350 }, { "epoch": 6.454302868579052, "ref_ce_loss": 0.05247754231095314, "step": 19350 }, { "epoch": 6.454302868579052, "loss": 0.24619795382022858, "step": 19350 }, { "ce_loss": 0.0250222310423851, "epoch": 6.454302868579052, "step": 19350 }, { "distill_loss": 0.12926128506660461, "epoch": 6.454302868579052, "step": 19350 }, { "epoch": 6.454302868579052, "ref_ce_loss": 0.0656496062874794, "step": 19350 }, { "epoch": 6.457638425617078, "loss": 0.3546, "step": 19360 }, { "epoch": 6.457638425617078, "grad_norm": 2.459033250808716, "step": 19360 }, { "epoch": 6.457638425617078, "learning_rate": 2.8305469139222398e-05, "step": 19360 }, { "epoch": 6.457638425617078, "loss": 0.34734630584716797, "step": 19360 }, { "ce_loss": 0.03935239836573601, "epoch": 6.457638425617078, "step": 19360 }, { "distill_loss": 0.15108200907707214, "epoch": 6.457638425617078, "step": 19360 }, { "epoch": 6.457638425617078, "ref_ce_loss": 0.0873950719833374, "step": 19360 }, { "epoch": 6.457638425617078, "loss": 0.24328087270259857, "step": 19360 }, { "ce_loss": 0.026929683983325958, "epoch": 6.457638425617078, "step": 19360 }, { "distill_loss": 0.09555982053279877, "epoch": 6.457638425617078, "step": 19360 }, { "epoch": 6.457638425617078, "ref_ce_loss": 0.07317261397838593, "step": 19360 }, { "epoch": 6.457638425617078, "loss": 0.2929458022117615, "step": 19360 }, { "ce_loss": 0.042498935014009476, "epoch": 6.457638425617078, "step": 19360 }, { "distill_loss": 0.16249480843544006, "epoch": 6.457638425617078, "step": 19360 }, { "epoch": 6.457638425617078, "ref_ce_loss": 0.04562801495194435, "step": 19360 }, { "epoch": 6.457638425617078, "loss": 0.2420826107263565, "step": 19360 }, { "ce_loss": 0.027378996834158897, "epoch": 6.457638425617078, "step": 19360 }, { "distill_loss": 0.14949826896190643, "epoch": 6.457638425617078, "step": 19360 }, { "epoch": 6.457638425617078, "ref_ce_loss": 0.06513167917728424, "step": 19360 }, { "epoch": 6.460973982655103, "loss": 0.3376, "step": 19370 }, { "epoch": 6.460973982655103, "grad_norm": 2.392075538635254, "step": 19370 }, { "epoch": 6.460973982655103, "learning_rate": 2.8187155759181425e-05, "step": 19370 }, { "epoch": 6.460973982655103, "loss": 0.12967228889465332, "step": 19370 }, { "ce_loss": 0.013152679428458214, "epoch": 6.460973982655103, "step": 19370 }, { "distill_loss": 0.08467573672533035, "epoch": 6.460973982655103, "step": 19370 }, { "epoch": 6.460973982655103, "ref_ce_loss": 0.03173118457198143, "step": 19370 }, { "epoch": 6.460973982655103, "loss": 0.33118006587028503, "step": 19370 }, { "ce_loss": 0.08657965064048767, "epoch": 6.460973982655103, "step": 19370 }, { "distill_loss": 0.17172375321388245, "epoch": 6.460973982655103, "step": 19370 }, { "epoch": 6.460973982655103, "ref_ce_loss": 0.05565835162997246, "step": 19370 }, { "epoch": 6.460973982655103, "loss": 0.23161426186561584, "step": 19370 }, { "ce_loss": 0.029349392279982567, "epoch": 6.460973982655103, "step": 19370 }, { "distill_loss": 0.11940692365169525, "epoch": 6.460973982655103, "step": 19370 }, { "epoch": 6.460973982655103, "ref_ce_loss": 0.06689442694187164, "step": 19370 }, { "epoch": 6.460973982655103, "loss": 0.19995811581611633, "step": 19370 }, { "ce_loss": 0.004559206310659647, "epoch": 6.460973982655103, "step": 19370 }, { "distill_loss": 0.095821812748909, "epoch": 6.460973982655103, "step": 19370 }, { "epoch": 6.460973982655103, "ref_ce_loss": 0.03864465653896332, "step": 19370 }, { "epoch": 6.4643095396931285, "loss": 0.2942, "step": 19380 }, { "epoch": 6.4643095396931285, "grad_norm": 2.8001151084899902, "step": 19380 }, { "epoch": 6.4643095396931285, "learning_rate": 2.8069064517717115e-05, "step": 19380 }, { "epoch": 6.4643095396931285, "loss": 0.564293384552002, "step": 19380 }, { "ce_loss": 0.0951477438211441, "epoch": 6.4643095396931285, "step": 19380 }, { "distill_loss": 0.16672182083129883, "epoch": 6.4643095396931285, "step": 19380 }, { "epoch": 6.4643095396931285, "ref_ce_loss": 0.10000166296958923, "step": 19380 }, { "epoch": 6.4643095396931285, "loss": 0.21771246194839478, "step": 19380 }, { "ce_loss": 0.019646715372800827, "epoch": 6.4643095396931285, "step": 19380 }, { "distill_loss": 0.14466574788093567, "epoch": 6.4643095396931285, "step": 19380 }, { "epoch": 6.4643095396931285, "ref_ce_loss": 0.05324574559926987, "step": 19380 }, { "epoch": 6.4643095396931285, "loss": 0.1648833155632019, "step": 19380 }, { "ce_loss": 0.0035948504228144884, "epoch": 6.4643095396931285, "step": 19380 }, { "distill_loss": 0.10792934894561768, "epoch": 6.4643095396931285, "step": 19380 }, { "epoch": 6.4643095396931285, "ref_ce_loss": 0.034741222858428955, "step": 19380 }, { "epoch": 6.4643095396931285, "loss": 0.11709931492805481, "step": 19380 }, { "ce_loss": 0.0012851167703047395, "epoch": 6.4643095396931285, "step": 19380 }, { "distill_loss": 0.07367859780788422, "epoch": 6.4643095396931285, "step": 19380 }, { "epoch": 6.4643095396931285, "ref_ce_loss": 0.02455708011984825, "step": 19380 }, { "epoch": 6.467645096731154, "loss": 0.3228, "step": 19390 }, { "epoch": 6.467645096731154, "grad_norm": 4.402524948120117, "step": 19390 }, { "epoch": 6.467645096731154, "learning_rate": 2.795119563018133e-05, "step": 19390 }, { "epoch": 6.467645096731154, "loss": 0.42270222306251526, "step": 19390 }, { "ce_loss": 0.10031957924365997, "epoch": 6.467645096731154, "step": 19390 }, { "distill_loss": 0.1878599226474762, "epoch": 6.467645096731154, "step": 19390 }, { "epoch": 6.467645096731154, "ref_ce_loss": 0.10024359822273254, "step": 19390 }, { "epoch": 6.467645096731154, "loss": 0.17374208569526672, "step": 19390 }, { "ce_loss": 0.027342218905687332, "epoch": 6.467645096731154, "step": 19390 }, { "distill_loss": 0.11525329202413559, "epoch": 6.467645096731154, "step": 19390 }, { "epoch": 6.467645096731154, "ref_ce_loss": 0.024801230058073997, "step": 19390 }, { "epoch": 6.467645096731154, "loss": 0.716946542263031, "step": 19390 }, { "ce_loss": 0.07414887100458145, "epoch": 6.467645096731154, "step": 19390 }, { "distill_loss": 0.18353484570980072, "epoch": 6.467645096731154, "step": 19390 }, { "epoch": 6.467645096731154, "ref_ce_loss": 0.04698779433965683, "step": 19390 }, { "epoch": 6.467645096731154, "loss": 0.21807637810707092, "step": 19390 }, { "ce_loss": 0.05198301002383232, "epoch": 6.467645096731154, "step": 19390 }, { "distill_loss": 0.11755422502756119, "epoch": 6.467645096731154, "step": 19390 }, { "epoch": 6.467645096731154, "ref_ce_loss": 0.03329809382557869, "step": 19390 }, { "epoch": 6.470980653769179, "loss": 0.3604, "step": 19400 }, { "epoch": 6.470980653769179, "grad_norm": 2.1826651096343994, "step": 19400 }, { "epoch": 6.470980653769179, "learning_rate": 2.7833549311520352e-05, "step": 19400 }, { "epoch": 6.470980653769179, "loss": 0.4379744827747345, "step": 19400 }, { "ce_loss": 0.03534636273980141, "epoch": 6.470980653769179, "step": 19400 }, { "distill_loss": 0.1718473583459854, "epoch": 6.470980653769179, "step": 19400 }, { "epoch": 6.470980653769179, "ref_ce_loss": 0.08096379786729813, "step": 19400 }, { "epoch": 6.470980653769179, "loss": 0.36185845732688904, "step": 19400 }, { "ce_loss": 0.05027947574853897, "epoch": 6.470980653769179, "step": 19400 }, { "distill_loss": 0.1847555935382843, "epoch": 6.470980653769179, "step": 19400 }, { "epoch": 6.470980653769179, "ref_ce_loss": 0.07357639074325562, "step": 19400 }, { "epoch": 6.470980653769179, "loss": 0.27249574661254883, "step": 19400 }, { "ce_loss": 0.0771196261048317, "epoch": 6.470980653769179, "step": 19400 }, { "distill_loss": 0.10766123235225677, "epoch": 6.470980653769179, "step": 19400 }, { "epoch": 6.470980653769179, "ref_ce_loss": 0.04266400635242462, "step": 19400 }, { "epoch": 6.470980653769179, "loss": 0.26876991987228394, "step": 19400 }, { "ce_loss": 0.019434722140431404, "epoch": 6.470980653769179, "step": 19400 }, { "distill_loss": 0.19553104043006897, "epoch": 6.470980653769179, "step": 19400 }, { "epoch": 6.470980653769179, "ref_ce_loss": 0.03554742410778999, "step": 19400 }, { "epoch": 6.4743162108072045, "loss": 0.3295, "step": 19410 }, { "epoch": 6.4743162108072045, "grad_norm": 2.4636645317077637, "step": 19410 }, { "epoch": 6.4743162108072045, "learning_rate": 2.7716125776274694e-05, "step": 19410 }, { "epoch": 6.4743162108072045, "loss": 0.393093079328537, "step": 19410 }, { "ce_loss": 0.07405968010425568, "epoch": 6.4743162108072045, "step": 19410 }, { "distill_loss": 0.13686689734458923, "epoch": 6.4743162108072045, "step": 19410 }, { "epoch": 6.4743162108072045, "ref_ce_loss": 0.07957617193460464, "step": 19410 }, { "epoch": 6.4743162108072045, "loss": 0.25447648763656616, "step": 19410 }, { "ce_loss": 0.022265994921326637, "epoch": 6.4743162108072045, "step": 19410 }, { "distill_loss": 0.1449824571609497, "epoch": 6.4743162108072045, "step": 19410 }, { "epoch": 6.4743162108072045, "ref_ce_loss": 0.07170706242322922, "step": 19410 }, { "epoch": 6.4743162108072045, "loss": 0.3648896813392639, "step": 19410 }, { "ce_loss": 0.07455446571111679, "epoch": 6.4743162108072045, "step": 19410 }, { "distill_loss": 0.1877792775630951, "epoch": 6.4743162108072045, "step": 19410 }, { "epoch": 6.4743162108072045, "ref_ce_loss": 0.06565703451633453, "step": 19410 }, { "epoch": 6.4743162108072045, "loss": 0.4845498204231262, "step": 19410 }, { "ce_loss": 0.052607208490371704, "epoch": 6.4743162108072045, "step": 19410 }, { "distill_loss": 0.19217851758003235, "epoch": 6.4743162108072045, "step": 19410 }, { "epoch": 6.4743162108072045, "ref_ce_loss": 0.05669070780277252, "step": 19410 }, { "epoch": 6.47765176784523, "loss": 0.344, "step": 19420 }, { "epoch": 6.47765176784523, "grad_norm": 2.88173508644104, "step": 19420 }, { "epoch": 6.47765176784523, "learning_rate": 2.759892523857858e-05, "step": 19420 }, { "epoch": 6.47765176784523, "loss": 0.22536416351795197, "step": 19420 }, { "ce_loss": 0.039910074323415756, "epoch": 6.47765176784523, "step": 19420 }, { "distill_loss": 0.11129461973905563, "epoch": 6.47765176784523, "step": 19420 }, { "epoch": 6.47765176784523, "ref_ce_loss": 0.05070248246192932, "step": 19420 }, { "epoch": 6.47765176784523, "loss": 0.254660040140152, "step": 19420 }, { "ce_loss": 0.03568845987319946, "epoch": 6.47765176784523, "step": 19420 }, { "distill_loss": 0.15099768340587616, "epoch": 6.47765176784523, "step": 19420 }, { "epoch": 6.47765176784523, "ref_ce_loss": 0.06769350916147232, "step": 19420 }, { "epoch": 6.47765176784523, "loss": 0.6331043243408203, "step": 19420 }, { "ce_loss": 0.04798339679837227, "epoch": 6.47765176784523, "step": 19420 }, { "distill_loss": 0.17899511754512787, "epoch": 6.47765176784523, "step": 19420 }, { "epoch": 6.47765176784523, "ref_ce_loss": 0.06974631547927856, "step": 19420 }, { "epoch": 6.47765176784523, "loss": 0.2030748426914215, "step": 19420 }, { "ce_loss": 0.03313259407877922, "epoch": 6.47765176784523, "step": 19420 }, { "distill_loss": 0.10403706133365631, "epoch": 6.47765176784523, "step": 19420 }, { "epoch": 6.47765176784523, "ref_ce_loss": 0.03035002388060093, "step": 19420 }, { "epoch": 6.480987324883255, "loss": 0.3198, "step": 19430 }, { "epoch": 6.480987324883255, "grad_norm": 4.207744121551514, "step": 19430 }, { "epoch": 6.480987324883255, "learning_rate": 2.7481947912159542e-05, "step": 19430 }, { "epoch": 6.480987324883255, "loss": 0.18555955588817596, "step": 19430 }, { "ce_loss": 0.014027750119566917, "epoch": 6.480987324883255, "step": 19430 }, { "distill_loss": 0.1059509664773941, "epoch": 6.480987324883255, "step": 19430 }, { "epoch": 6.480987324883255, "ref_ce_loss": 0.041501760482788086, "step": 19430 }, { "epoch": 6.480987324883255, "loss": 0.5299850106239319, "step": 19430 }, { "ce_loss": 0.11079461127519608, "epoch": 6.480987324883255, "step": 19430 }, { "distill_loss": 0.24580931663513184, "epoch": 6.480987324883255, "step": 19430 }, { "epoch": 6.480987324883255, "ref_ce_loss": 0.056581251323223114, "step": 19430 }, { "epoch": 6.480987324883255, "loss": 0.16338509321212769, "step": 19430 }, { "ce_loss": 0.031098000705242157, "epoch": 6.480987324883255, "step": 19430 }, { "distill_loss": 0.09898979216814041, "epoch": 6.480987324883255, "step": 19430 }, { "epoch": 6.480987324883255, "ref_ce_loss": 0.03305355831980705, "step": 19430 }, { "epoch": 6.480987324883255, "loss": 0.3786182403564453, "step": 19430 }, { "ce_loss": 0.06183497980237007, "epoch": 6.480987324883255, "step": 19430 }, { "distill_loss": 0.18642447888851166, "epoch": 6.480987324883255, "step": 19430 }, { "epoch": 6.480987324883255, "ref_ce_loss": 0.05898001417517662, "step": 19430 }, { "epoch": 6.484322881921281, "loss": 0.2907, "step": 19440 }, { "epoch": 6.484322881921281, "grad_norm": 5.101585865020752, "step": 19440 }, { "epoch": 6.484322881921281, "learning_rate": 2.7365194010338126e-05, "step": 19440 }, { "epoch": 6.484322881921281, "loss": 0.3683335781097412, "step": 19440 }, { "ce_loss": 0.017139267176389694, "epoch": 6.484322881921281, "step": 19440 }, { "distill_loss": 0.17401042580604553, "epoch": 6.484322881921281, "step": 19440 }, { "epoch": 6.484322881921281, "ref_ce_loss": 0.05724998190999031, "step": 19440 }, { "epoch": 6.484322881921281, "loss": 0.19354148209095, "step": 19440 }, { "ce_loss": 0.0053457519970834255, "epoch": 6.484322881921281, "step": 19440 }, { "distill_loss": 0.1339099109172821, "epoch": 6.484322881921281, "step": 19440 }, { "epoch": 6.484322881921281, "ref_ce_loss": 0.0540962778031826, "step": 19440 }, { "epoch": 6.484322881921281, "loss": 0.26310113072395325, "step": 19440 }, { "ce_loss": 0.047900937497615814, "epoch": 6.484322881921281, "step": 19440 }, { "distill_loss": 0.12943311035633087, "epoch": 6.484322881921281, "step": 19440 }, { "epoch": 6.484322881921281, "ref_ce_loss": 0.05395814776420593, "step": 19440 }, { "epoch": 6.484322881921281, "loss": 0.21164949238300323, "step": 19440 }, { "ce_loss": 0.019808808341622353, "epoch": 6.484322881921281, "step": 19440 }, { "distill_loss": 0.10979584604501724, "epoch": 6.484322881921281, "step": 19440 }, { "epoch": 6.484322881921281, "ref_ce_loss": 0.052284590899944305, "step": 19440 }, { "epoch": 6.487658438959306, "loss": 0.3068, "step": 19450 }, { "epoch": 6.487658438959306, "grad_norm": 2.3882312774658203, "step": 19450 }, { "epoch": 6.487658438959306, "learning_rate": 2.7248663746027305e-05, "step": 19450 }, { "epoch": 6.487658438959306, "loss": 0.26894763112068176, "step": 19450 }, { "ce_loss": 0.02126028575003147, "epoch": 6.487658438959306, "step": 19450 }, { "distill_loss": 0.10786668211221695, "epoch": 6.487658438959306, "step": 19450 }, { "epoch": 6.487658438959306, "ref_ce_loss": 0.058729927986860275, "step": 19450 }, { "epoch": 6.487658438959306, "loss": 0.38041186332702637, "step": 19450 }, { "ce_loss": 0.035523854196071625, "epoch": 6.487658438959306, "step": 19450 }, { "distill_loss": 0.15720032155513763, "epoch": 6.487658438959306, "step": 19450 }, { "epoch": 6.487658438959306, "ref_ce_loss": 0.06419675052165985, "step": 19450 }, { "epoch": 6.487658438959306, "loss": 0.39652425050735474, "step": 19450 }, { "ce_loss": 0.05832938849925995, "epoch": 6.487658438959306, "step": 19450 }, { "distill_loss": 0.15557998418807983, "epoch": 6.487658438959306, "step": 19450 }, { "epoch": 6.487658438959306, "ref_ce_loss": 0.07637973874807358, "step": 19450 }, { "epoch": 6.487658438959306, "loss": 0.20689250528812408, "step": 19450 }, { "ce_loss": 0.012813769280910492, "epoch": 6.487658438959306, "step": 19450 }, { "distill_loss": 0.15325000882148743, "epoch": 6.487658438959306, "step": 19450 }, { "epoch": 6.487658438959306, "ref_ce_loss": 0.04054896533489227, "step": 19450 }, { "epoch": 6.490993995997331, "loss": 0.3084, "step": 19460 }, { "epoch": 6.490993995997331, "grad_norm": 3.626648426055908, "step": 19460 }, { "epoch": 6.490993995997331, "learning_rate": 2.7132357331732356e-05, "step": 19460 }, { "epoch": 6.490993995997331, "loss": 0.15348277986049652, "step": 19460 }, { "ce_loss": 0.01779790408909321, "epoch": 6.490993995997331, "step": 19460 }, { "distill_loss": 0.1153775304555893, "epoch": 6.490993995997331, "step": 19460 }, { "epoch": 6.490993995997331, "ref_ce_loss": 0.020220749080181122, "step": 19460 }, { "epoch": 6.490993995997331, "loss": 0.22395391762256622, "step": 19460 }, { "ce_loss": 0.046976491808891296, "epoch": 6.490993995997331, "step": 19460 }, { "distill_loss": 0.11234377324581146, "epoch": 6.490993995997331, "step": 19460 }, { "epoch": 6.490993995997331, "ref_ce_loss": 0.06450348347425461, "step": 19460 }, { "epoch": 6.490993995997331, "loss": 0.44756627082824707, "step": 19460 }, { "ce_loss": 0.09042006731033325, "epoch": 6.490993995997331, "step": 19460 }, { "distill_loss": 0.2449221909046173, "epoch": 6.490993995997331, "step": 19460 }, { "epoch": 6.490993995997331, "ref_ce_loss": 0.080929696559906, "step": 19460 }, { "epoch": 6.490993995997331, "loss": 0.2803463935852051, "step": 19460 }, { "ce_loss": 0.04512576386332512, "epoch": 6.490993995997331, "step": 19460 }, { "distill_loss": 0.10338424146175385, "epoch": 6.490993995997331, "step": 19460 }, { "epoch": 6.490993995997331, "ref_ce_loss": 0.0502689890563488, "step": 19460 }, { "epoch": 6.494329553035357, "loss": 0.3017, "step": 19470 }, { "epoch": 6.494329553035357, "grad_norm": 1.9026871919631958, "step": 19470 }, { "epoch": 6.494329553035357, "learning_rate": 2.7016274979550357e-05, "step": 19470 }, { "epoch": 6.494329553035357, "loss": 0.45376911759376526, "step": 19470 }, { "ce_loss": 0.07573352009057999, "epoch": 6.494329553035357, "step": 19470 }, { "distill_loss": 0.17655305564403534, "epoch": 6.494329553035357, "step": 19470 }, { "epoch": 6.494329553035357, "ref_ce_loss": 0.05633631721138954, "step": 19470 }, { "epoch": 6.494329553035357, "loss": 0.23756596446037292, "step": 19470 }, { "ce_loss": 0.040350113064050674, "epoch": 6.494329553035357, "step": 19470 }, { "distill_loss": 0.11733220517635345, "epoch": 6.494329553035357, "step": 19470 }, { "epoch": 6.494329553035357, "ref_ce_loss": 0.0678890123963356, "step": 19470 }, { "epoch": 6.494329553035357, "loss": 0.23397405445575714, "step": 19470 }, { "ce_loss": 0.009241566061973572, "epoch": 6.494329553035357, "step": 19470 }, { "distill_loss": 0.13413383066654205, "epoch": 6.494329553035357, "step": 19470 }, { "epoch": 6.494329553035357, "ref_ce_loss": 0.04893755912780762, "step": 19470 }, { "epoch": 6.494329553035357, "loss": 0.4001774191856384, "step": 19470 }, { "ce_loss": 0.01755928434431553, "epoch": 6.494329553035357, "step": 19470 }, { "distill_loss": 0.2129809856414795, "epoch": 6.494329553035357, "step": 19470 }, { "epoch": 6.494329553035357, "ref_ce_loss": 0.05770876631140709, "step": 19470 }, { "epoch": 6.497665110073382, "loss": 0.2931, "step": 19480 }, { "epoch": 6.497665110073382, "grad_norm": 2.623253345489502, "step": 19480 }, { "epoch": 6.497665110073382, "learning_rate": 2.6900416901169586e-05, "step": 19480 }, { "epoch": 6.497665110073382, "loss": 0.3188818693161011, "step": 19480 }, { "ce_loss": 0.08621389418840408, "epoch": 6.497665110073382, "step": 19480 }, { "distill_loss": 0.16553746163845062, "epoch": 6.497665110073382, "step": 19480 }, { "epoch": 6.497665110073382, "ref_ce_loss": 0.04960956051945686, "step": 19480 }, { "epoch": 6.497665110073382, "loss": 0.21584922075271606, "step": 19480 }, { "ce_loss": 0.0591447688639164, "epoch": 6.497665110073382, "step": 19480 }, { "distill_loss": 0.11659364402294159, "epoch": 6.497665110073382, "step": 19480 }, { "epoch": 6.497665110073382, "ref_ce_loss": 0.04008140414953232, "step": 19480 }, { "epoch": 6.497665110073382, "loss": 0.15966036915779114, "step": 19480 }, { "ce_loss": 0.02307932637631893, "epoch": 6.497665110073382, "step": 19480 }, { "distill_loss": 0.10009153187274933, "epoch": 6.497665110073382, "step": 19480 }, { "epoch": 6.497665110073382, "ref_ce_loss": 0.03619556874036789, "step": 19480 }, { "epoch": 6.497665110073382, "loss": 0.3482780456542969, "step": 19480 }, { "ce_loss": 0.08321208506822586, "epoch": 6.497665110073382, "step": 19480 }, { "distill_loss": 0.14295196533203125, "epoch": 6.497665110073382, "step": 19480 }, { "epoch": 6.497665110073382, "ref_ce_loss": 0.09516530483961105, "step": 19480 }, { "epoch": 6.501000667111407, "loss": 0.2853, "step": 19490 }, { "epoch": 6.501000667111407, "grad_norm": 2.0386288166046143, "step": 19490 }, { "epoch": 6.501000667111407, "learning_rate": 2.6784783307869624e-05, "step": 19490 }, { "epoch": 6.501000667111407, "loss": 0.3476499319076538, "step": 19490 }, { "ce_loss": 0.03272515535354614, "epoch": 6.501000667111407, "step": 19490 }, { "distill_loss": 0.179308220744133, "epoch": 6.501000667111407, "step": 19490 }, { "epoch": 6.501000667111407, "ref_ce_loss": 0.09251862019300461, "step": 19490 }, { "epoch": 6.501000667111407, "loss": 0.3680034279823303, "step": 19490 }, { "ce_loss": 0.06351684033870697, "epoch": 6.501000667111407, "step": 19490 }, { "distill_loss": 0.16472645103931427, "epoch": 6.501000667111407, "step": 19490 }, { "epoch": 6.501000667111407, "ref_ce_loss": 0.06539975851774216, "step": 19490 }, { "epoch": 6.501000667111407, "loss": 0.5050325989723206, "step": 19490 }, { "ce_loss": 0.051710743457078934, "epoch": 6.501000667111407, "step": 19490 }, { "distill_loss": 0.28480255603790283, "epoch": 6.501000667111407, "step": 19490 }, { "epoch": 6.501000667111407, "ref_ce_loss": 0.07189995795488358, "step": 19490 }, { "epoch": 6.501000667111407, "loss": 0.2924560010433197, "step": 19490 }, { "ce_loss": 0.0532911941409111, "epoch": 6.501000667111407, "step": 19490 }, { "distill_loss": 0.13427993655204773, "epoch": 6.501000667111407, "step": 19490 }, { "epoch": 6.501000667111407, "ref_ce_loss": 0.06003837287425995, "step": 19490 }, { "epoch": 6.504336224149433, "loss": 0.3273, "step": 19500 }, { "epoch": 6.504336224149433, "grad_norm": 3.2773895263671875, "step": 19500 }, { "epoch": 6.504336224149433, "learning_rate": 2.666937441052049e-05, "step": 19500 }, { "epoch": 6.504336224149433, "loss": 0.31917792558670044, "step": 19500 }, { "ce_loss": 0.06857472658157349, "epoch": 6.504336224149433, "step": 19500 }, { "distill_loss": 0.14248031377792358, "epoch": 6.504336224149433, "step": 19500 }, { "epoch": 6.504336224149433, "ref_ce_loss": 0.061197735369205475, "step": 19500 }, { "epoch": 6.504336224149433, "loss": 0.2877909541130066, "step": 19500 }, { "ce_loss": 0.03194195777177811, "epoch": 6.504336224149433, "step": 19500 }, { "distill_loss": 0.11442811787128448, "epoch": 6.504336224149433, "step": 19500 }, { "epoch": 6.504336224149433, "ref_ce_loss": 0.049518194049596786, "step": 19500 }, { "epoch": 6.504336224149433, "loss": 0.20438994467258453, "step": 19500 }, { "ce_loss": 0.005468576215207577, "epoch": 6.504336224149433, "step": 19500 }, { "distill_loss": 0.15710853040218353, "epoch": 6.504336224149433, "step": 19500 }, { "epoch": 6.504336224149433, "ref_ce_loss": 0.04172136262059212, "step": 19500 }, { "epoch": 6.504336224149433, "loss": 0.4296785593032837, "step": 19500 }, { "ce_loss": 0.11481533944606781, "epoch": 6.504336224149433, "step": 19500 }, { "distill_loss": 0.17444954812526703, "epoch": 6.504336224149433, "step": 19500 }, { "epoch": 6.504336224149433, "ref_ce_loss": 0.09351933747529984, "step": 19500 }, { "epoch": 6.507671781187458, "loss": 0.3024, "step": 19510 }, { "epoch": 6.507671781187458, "grad_norm": 2.5083553791046143, "step": 19510 }, { "epoch": 6.507671781187458, "learning_rate": 2.6554190419582432e-05, "step": 19510 }, { "epoch": 6.507671781187458, "loss": 0.22696717083454132, "step": 19510 }, { "ce_loss": 0.024120118468999863, "epoch": 6.507671781187458, "step": 19510 }, { "distill_loss": 0.118647001683712, "epoch": 6.507671781187458, "step": 19510 }, { "epoch": 6.507671781187458, "ref_ce_loss": 0.05346282944083214, "step": 19510 }, { "epoch": 6.507671781187458, "loss": 0.42250338196754456, "step": 19510 }, { "ce_loss": 0.04991762340068817, "epoch": 6.507671781187458, "step": 19510 }, { "distill_loss": 0.2798449993133545, "epoch": 6.507671781187458, "step": 19510 }, { "epoch": 6.507671781187458, "ref_ce_loss": 0.06090565398335457, "step": 19510 }, { "epoch": 6.507671781187458, "loss": 0.21890859305858612, "step": 19510 }, { "ce_loss": 0.04088882729411125, "epoch": 6.507671781187458, "step": 19510 }, { "distill_loss": 0.13536345958709717, "epoch": 6.507671781187458, "step": 19510 }, { "epoch": 6.507671781187458, "ref_ce_loss": 0.042482659220695496, "step": 19510 }, { "epoch": 6.507671781187458, "loss": 0.48329630494117737, "step": 19510 }, { "ce_loss": 0.08375865966081619, "epoch": 6.507671781187458, "step": 19510 }, { "distill_loss": 0.18079915642738342, "epoch": 6.507671781187458, "step": 19510 }, { "epoch": 6.507671781187458, "ref_ce_loss": 0.05869739502668381, "step": 19510 }, { "epoch": 6.511007338225483, "loss": 0.3078, "step": 19520 }, { "epoch": 6.511007338225483, "grad_norm": 1.8151508569717407, "step": 19520 }, { "epoch": 6.511007338225483, "learning_rate": 2.64392315451057e-05, "step": 19520 }, { "epoch": 6.511007338225483, "loss": 0.20219141244888306, "step": 19520 }, { "ce_loss": 0.02228640951216221, "epoch": 6.511007338225483, "step": 19520 }, { "distill_loss": 0.1161341741681099, "epoch": 6.511007338225483, "step": 19520 }, { "epoch": 6.511007338225483, "ref_ce_loss": 0.035142045468091965, "step": 19520 }, { "epoch": 6.511007338225483, "loss": 0.3143247663974762, "step": 19520 }, { "ce_loss": 0.05089471861720085, "epoch": 6.511007338225483, "step": 19520 }, { "distill_loss": 0.18314094841480255, "epoch": 6.511007338225483, "step": 19520 }, { "epoch": 6.511007338225483, "ref_ce_loss": 0.061314839869737625, "step": 19520 }, { "epoch": 6.511007338225483, "loss": 0.4873974025249481, "step": 19520 }, { "ce_loss": 0.0976187065243721, "epoch": 6.511007338225483, "step": 19520 }, { "distill_loss": 0.2322872132062912, "epoch": 6.511007338225483, "step": 19520 }, { "epoch": 6.511007338225483, "ref_ce_loss": 0.06849783658981323, "step": 19520 }, { "epoch": 6.511007338225483, "loss": 0.23710933327674866, "step": 19520 }, { "ce_loss": 0.04801061376929283, "epoch": 6.511007338225483, "step": 19520 }, { "distill_loss": 0.10981900990009308, "epoch": 6.511007338225483, "step": 19520 }, { "epoch": 6.511007338225483, "ref_ce_loss": 0.06101440265774727, "step": 19520 }, { "epoch": 6.514342895263509, "loss": 0.3379, "step": 19530 }, { "epoch": 6.514342895263509, "grad_norm": 2.442229986190796, "step": 19530 }, { "epoch": 6.514342895263509, "learning_rate": 2.6324497996729826e-05, "step": 19530 }, { "epoch": 6.514342895263509, "loss": 0.36532866954803467, "step": 19530 }, { "ce_loss": 0.04357844591140747, "epoch": 6.514342895263509, "step": 19530 }, { "distill_loss": 0.1568370908498764, "epoch": 6.514342895263509, "step": 19530 }, { "epoch": 6.514342895263509, "ref_ce_loss": 0.0767655000090599, "step": 19530 }, { "epoch": 6.514342895263509, "loss": 0.5425922274589539, "step": 19530 }, { "ce_loss": 0.005232425406575203, "epoch": 6.514342895263509, "step": 19530 }, { "distill_loss": 0.15154841542243958, "epoch": 6.514342895263509, "step": 19530 }, { "epoch": 6.514342895263509, "ref_ce_loss": 0.0805111974477768, "step": 19530 }, { "epoch": 6.514342895263509, "loss": 0.3620684742927551, "step": 19530 }, { "ce_loss": 0.004763866309076548, "epoch": 6.514342895263509, "step": 19530 }, { "distill_loss": 0.2671721279621124, "epoch": 6.514342895263509, "step": 19530 }, { "epoch": 6.514342895263509, "ref_ce_loss": 0.06306330114603043, "step": 19530 }, { "epoch": 6.514342895263509, "loss": 0.22314409911632538, "step": 19530 }, { "ce_loss": 0.0018380864057689905, "epoch": 6.514342895263509, "step": 19530 }, { "distill_loss": 0.08867863565683365, "epoch": 6.514342895263509, "step": 19530 }, { "epoch": 6.514342895263509, "ref_ce_loss": 0.04645582661032677, "step": 19530 }, { "epoch": 6.517678452301534, "loss": 0.327, "step": 19540 }, { "epoch": 6.517678452301534, "grad_norm": 2.6358141899108887, "step": 19540 }, { "epoch": 6.517678452301534, "learning_rate": 2.620998998368358e-05, "step": 19540 }, { "epoch": 6.517678452301534, "loss": 0.2660335600376129, "step": 19540 }, { "ce_loss": 0.006880718749016523, "epoch": 6.517678452301534, "step": 19540 }, { "distill_loss": 0.15171124041080475, "epoch": 6.517678452301534, "step": 19540 }, { "epoch": 6.517678452301534, "ref_ce_loss": 0.05697092413902283, "step": 19540 }, { "epoch": 6.517678452301534, "loss": 0.34473586082458496, "step": 19540 }, { "ce_loss": 0.026561858132481575, "epoch": 6.517678452301534, "step": 19540 }, { "distill_loss": 0.12937229871749878, "epoch": 6.517678452301534, "step": 19540 }, { "epoch": 6.517678452301534, "ref_ce_loss": 0.0451975092291832, "step": 19540 }, { "epoch": 6.517678452301534, "loss": 0.38856256008148193, "step": 19540 }, { "ce_loss": 0.04068716615438461, "epoch": 6.517678452301534, "step": 19540 }, { "distill_loss": 0.10566786676645279, "epoch": 6.517678452301534, "step": 19540 }, { "epoch": 6.517678452301534, "ref_ce_loss": 0.04714061692357063, "step": 19540 }, { "epoch": 6.517678452301534, "loss": 0.16135339438915253, "step": 19540 }, { "ce_loss": 0.02129344828426838, "epoch": 6.517678452301534, "step": 19540 }, { "distill_loss": 0.08420635759830475, "epoch": 6.517678452301534, "step": 19540 }, { "epoch": 6.517678452301534, "ref_ce_loss": 0.03777875006198883, "step": 19540 }, { "epoch": 6.521014009339559, "loss": 0.3013, "step": 19550 }, { "epoch": 6.521014009339559, "grad_norm": 2.993635654449463, "step": 19550 }, { "epoch": 6.521014009339559, "learning_rate": 2.6095707714784515e-05, "step": 19550 }, { "epoch": 6.521014009339559, "loss": 0.28706902265548706, "step": 19550 }, { "ce_loss": 0.04375440999865532, "epoch": 6.521014009339559, "step": 19550 }, { "distill_loss": 0.1598789095878601, "epoch": 6.521014009339559, "step": 19550 }, { "epoch": 6.521014009339559, "ref_ce_loss": 0.05451327934861183, "step": 19550 }, { "epoch": 6.521014009339559, "loss": 0.25459298491477966, "step": 19550 }, { "ce_loss": 0.041393257677555084, "epoch": 6.521014009339559, "step": 19550 }, { "distill_loss": 0.11711672693490982, "epoch": 6.521014009339559, "step": 19550 }, { "epoch": 6.521014009339559, "ref_ce_loss": 0.05613476410508156, "step": 19550 }, { "epoch": 6.521014009339559, "loss": 0.24426577985286713, "step": 19550 }, { "ce_loss": 0.044446419924497604, "epoch": 6.521014009339559, "step": 19550 }, { "distill_loss": 0.10138127952814102, "epoch": 6.521014009339559, "step": 19550 }, { "epoch": 6.521014009339559, "ref_ce_loss": 0.06777234375476837, "step": 19550 }, { "epoch": 6.521014009339559, "loss": 0.3395106792449951, "step": 19550 }, { "ce_loss": 0.05529210716485977, "epoch": 6.521014009339559, "step": 19550 }, { "distill_loss": 0.1776595264673233, "epoch": 6.521014009339559, "step": 19550 }, { "epoch": 6.521014009339559, "ref_ce_loss": 0.07718256115913391, "step": 19550 }, { "epoch": 6.524349566377585, "loss": 0.3031, "step": 19560 }, { "epoch": 6.524349566377585, "grad_norm": 2.298203229904175, "step": 19560 }, { "epoch": 6.524349566377585, "learning_rate": 2.5981651398438262e-05, "step": 19560 }, { "epoch": 6.524349566377585, "loss": 0.3190702795982361, "step": 19560 }, { "ce_loss": 0.08099543303251266, "epoch": 6.524349566377585, "step": 19560 }, { "distill_loss": 0.143859401345253, "epoch": 6.524349566377585, "step": 19560 }, { "epoch": 6.524349566377585, "ref_ce_loss": 0.06572287529706955, "step": 19560 }, { "epoch": 6.524349566377585, "loss": 0.22269880771636963, "step": 19560 }, { "ce_loss": 0.0325460359454155, "epoch": 6.524349566377585, "step": 19560 }, { "distill_loss": 0.1178068220615387, "epoch": 6.524349566377585, "step": 19560 }, { "epoch": 6.524349566377585, "ref_ce_loss": 0.05695272609591484, "step": 19560 }, { "epoch": 6.524349566377585, "loss": 0.35457098484039307, "step": 19560 }, { "ce_loss": 0.02406204119324684, "epoch": 6.524349566377585, "step": 19560 }, { "distill_loss": 0.18583686649799347, "epoch": 6.524349566377585, "step": 19560 }, { "epoch": 6.524349566377585, "ref_ce_loss": 0.07211542874574661, "step": 19560 }, { "epoch": 6.524349566377585, "loss": 0.34551870822906494, "step": 19560 }, { "ce_loss": 0.016009874641895294, "epoch": 6.524349566377585, "step": 19560 }, { "distill_loss": 0.12047820538282394, "epoch": 6.524349566377585, "step": 19560 }, { "epoch": 6.524349566377585, "ref_ce_loss": 0.04428781569004059, "step": 19560 }, { "epoch": 6.52768512341561, "loss": 0.2768, "step": 19570 }, { "epoch": 6.52768512341561, "grad_norm": 2.2297275066375732, "step": 19570 }, { "epoch": 6.52768512341561, "learning_rate": 2.586782124263867e-05, "step": 19570 }, { "epoch": 6.52768512341561, "loss": 0.15318286418914795, "step": 19570 }, { "ce_loss": 0.01596526987850666, "epoch": 6.52768512341561, "step": 19570 }, { "distill_loss": 0.09388856589794159, "epoch": 6.52768512341561, "step": 19570 }, { "epoch": 6.52768512341561, "ref_ce_loss": 0.043243296444416046, "step": 19570 }, { "epoch": 6.52768512341561, "loss": 0.3122502863407135, "step": 19570 }, { "ce_loss": 0.07972883433103561, "epoch": 6.52768512341561, "step": 19570 }, { "distill_loss": 0.11671815812587738, "epoch": 6.52768512341561, "step": 19570 }, { "epoch": 6.52768512341561, "ref_ce_loss": 0.09080839902162552, "step": 19570 }, { "epoch": 6.52768512341561, "loss": 0.3512519299983978, "step": 19570 }, { "ce_loss": 0.03099985606968403, "epoch": 6.52768512341561, "step": 19570 }, { "distill_loss": 0.13011573255062103, "epoch": 6.52768512341561, "step": 19570 }, { "epoch": 6.52768512341561, "ref_ce_loss": 0.06851420551538467, "step": 19570 }, { "epoch": 6.52768512341561, "loss": 0.34722644090652466, "step": 19570 }, { "ce_loss": 0.045901209115982056, "epoch": 6.52768512341561, "step": 19570 }, { "distill_loss": 0.16597992181777954, "epoch": 6.52768512341561, "step": 19570 }, { "epoch": 6.52768512341561, "ref_ce_loss": 0.056316304951906204, "step": 19570 }, { "epoch": 6.5310206804536355, "loss": 0.3432, "step": 19580 }, { "epoch": 6.5310206804536355, "grad_norm": 3.3036727905273438, "step": 19580 }, { "epoch": 6.5310206804536355, "learning_rate": 2.5754217454966937e-05, "step": 19580 }, { "epoch": 6.5310206804536355, "loss": 0.43914103507995605, "step": 19580 }, { "ce_loss": 0.04930100217461586, "epoch": 6.5310206804536355, "step": 19580 }, { "distill_loss": 0.27969545125961304, "epoch": 6.5310206804536355, "step": 19580 }, { "epoch": 6.5310206804536355, "ref_ce_loss": 0.07513166964054108, "step": 19580 }, { "epoch": 6.5310206804536355, "loss": 0.45070549845695496, "step": 19580 }, { "ce_loss": 0.0634470283985138, "epoch": 6.5310206804536355, "step": 19580 }, { "distill_loss": 0.15421168506145477, "epoch": 6.5310206804536355, "step": 19580 }, { "epoch": 6.5310206804536355, "ref_ce_loss": 0.05427777022123337, "step": 19580 }, { "epoch": 6.5310206804536355, "loss": 0.41704732179641724, "step": 19580 }, { "ce_loss": 0.06318405270576477, "epoch": 6.5310206804536355, "step": 19580 }, { "distill_loss": 0.1228623166680336, "epoch": 6.5310206804536355, "step": 19580 }, { "epoch": 6.5310206804536355, "ref_ce_loss": 0.0780521035194397, "step": 19580 }, { "epoch": 6.5310206804536355, "loss": 0.6786539554595947, "step": 19580 }, { "ce_loss": 0.045191653072834015, "epoch": 6.5310206804536355, "step": 19580 }, { "distill_loss": 0.20268556475639343, "epoch": 6.5310206804536355, "step": 19580 }, { "epoch": 6.5310206804536355, "ref_ce_loss": 0.09469419717788696, "step": 19580 }, { "epoch": 6.534356237491661, "loss": 0.3673, "step": 19590 }, { "epoch": 6.534356237491661, "grad_norm": 3.4037835597991943, "step": 19590 }, { "epoch": 6.534356237491661, "learning_rate": 2.564084024259159e-05, "step": 19590 }, { "epoch": 6.534356237491661, "loss": 0.3583972454071045, "step": 19590 }, { "ce_loss": 0.04135497286915779, "epoch": 6.534356237491661, "step": 19590 }, { "distill_loss": 0.20269116759300232, "epoch": 6.534356237491661, "step": 19590 }, { "epoch": 6.534356237491661, "ref_ce_loss": 0.06698576360940933, "step": 19590 }, { "epoch": 6.534356237491661, "loss": 0.6243804097175598, "step": 19590 }, { "ce_loss": 0.05481613427400589, "epoch": 6.534356237491661, "step": 19590 }, { "distill_loss": 0.16993610560894012, "epoch": 6.534356237491661, "step": 19590 }, { "epoch": 6.534356237491661, "ref_ce_loss": 0.07230786234140396, "step": 19590 }, { "epoch": 6.534356237491661, "loss": 0.2733704745769501, "step": 19590 }, { "ce_loss": 0.05378222465515137, "epoch": 6.534356237491661, "step": 19590 }, { "distill_loss": 0.14002224802970886, "epoch": 6.534356237491661, "step": 19590 }, { "epoch": 6.534356237491661, "ref_ce_loss": 0.07940661907196045, "step": 19590 }, { "epoch": 6.534356237491661, "loss": 0.44000738859176636, "step": 19590 }, { "ce_loss": 0.023452216759324074, "epoch": 6.534356237491661, "step": 19590 }, { "distill_loss": 0.0927174910902977, "epoch": 6.534356237491661, "step": 19590 }, { "epoch": 6.534356237491661, "ref_ce_loss": 0.03387615829706192, "step": 19590 }, { "epoch": 6.537691794529686, "loss": 0.3117, "step": 19600 }, { "epoch": 6.537691794529686, "grad_norm": 2.190343141555786, "step": 19600 }, { "epoch": 6.537691794529686, "learning_rate": 2.5527689812267987e-05, "step": 19600 }, { "epoch": 6.537691794529686, "loss": 0.28801581263542175, "step": 19600 }, { "ce_loss": 0.051726970821619034, "epoch": 6.537691794529686, "step": 19600 }, { "distill_loss": 0.1135430559515953, "epoch": 6.537691794529686, "step": 19600 }, { "epoch": 6.537691794529686, "ref_ce_loss": 0.022708710283041, "step": 19600 }, { "epoch": 6.537691794529686, "loss": 0.27752554416656494, "step": 19600 }, { "ce_loss": 0.028300290927290916, "epoch": 6.537691794529686, "step": 19600 }, { "distill_loss": 0.1339690089225769, "epoch": 6.537691794529686, "step": 19600 }, { "epoch": 6.537691794529686, "ref_ce_loss": 0.06856250762939453, "step": 19600 }, { "epoch": 6.537691794529686, "loss": 0.3291257619857788, "step": 19600 }, { "ce_loss": 0.044249553233385086, "epoch": 6.537691794529686, "step": 19600 }, { "distill_loss": 0.12134141474962234, "epoch": 6.537691794529686, "step": 19600 }, { "epoch": 6.537691794529686, "ref_ce_loss": 0.043635156005620956, "step": 19600 }, { "epoch": 6.537691794529686, "loss": 0.5704269409179688, "step": 19600 }, { "ce_loss": 0.11629431694746017, "epoch": 6.537691794529686, "step": 19600 }, { "distill_loss": 0.30639445781707764, "epoch": 6.537691794529686, "step": 19600 }, { "epoch": 6.537691794529686, "ref_ce_loss": 0.07897748053073883, "step": 19600 }, { "epoch": 6.5410273515677115, "loss": 0.3411, "step": 19610 }, { "epoch": 6.5410273515677115, "grad_norm": 4.623196601867676, "step": 19610 }, { "epoch": 6.5410273515677115, "learning_rate": 2.5414766370337814e-05, "step": 19610 }, { "epoch": 6.5410273515677115, "loss": 0.21084001660346985, "step": 19610 }, { "ce_loss": 0.02315536141395569, "epoch": 6.5410273515677115, "step": 19610 }, { "distill_loss": 0.11800826340913773, "epoch": 6.5410273515677115, "step": 19610 }, { "epoch": 6.5410273515677115, "ref_ce_loss": 0.04089812561869621, "step": 19610 }, { "epoch": 6.5410273515677115, "loss": 0.2711634635925293, "step": 19610 }, { "ce_loss": 0.021899105980992317, "epoch": 6.5410273515677115, "step": 19610 }, { "distill_loss": 0.12212783098220825, "epoch": 6.5410273515677115, "step": 19610 }, { "epoch": 6.5410273515677115, "ref_ce_loss": 0.05432126671075821, "step": 19610 }, { "epoch": 6.5410273515677115, "loss": 0.30355560779571533, "step": 19610 }, { "ce_loss": 0.04438330605626106, "epoch": 6.5410273515677115, "step": 19610 }, { "distill_loss": 0.13687101006507874, "epoch": 6.5410273515677115, "step": 19610 }, { "epoch": 6.5410273515677115, "ref_ce_loss": 0.08127232640981674, "step": 19610 }, { "epoch": 6.5410273515677115, "loss": 0.42635995149612427, "step": 19610 }, { "ce_loss": 0.030337288975715637, "epoch": 6.5410273515677115, "step": 19610 }, { "distill_loss": 0.14552177488803864, "epoch": 6.5410273515677115, "step": 19610 }, { "epoch": 6.5410273515677115, "ref_ce_loss": 0.06268715858459473, "step": 19610 }, { "epoch": 6.544362908605737, "loss": 0.3284, "step": 19620 }, { "epoch": 6.544362908605737, "grad_norm": 2.259993076324463, "step": 19620 }, { "epoch": 6.544362908605737, "learning_rate": 2.530207012272898e-05, "step": 19620 }, { "epoch": 6.544362908605737, "loss": 0.31314048171043396, "step": 19620 }, { "ce_loss": 0.07636706531047821, "epoch": 6.544362908605737, "step": 19620 }, { "distill_loss": 0.16563206911087036, "epoch": 6.544362908605737, "step": 19620 }, { "epoch": 6.544362908605737, "ref_ce_loss": 0.05040166527032852, "step": 19620 }, { "epoch": 6.544362908605737, "loss": 0.22983363270759583, "step": 19620 }, { "ce_loss": 0.042005062103271484, "epoch": 6.544362908605737, "step": 19620 }, { "distill_loss": 0.10108568519353867, "epoch": 6.544362908605737, "step": 19620 }, { "epoch": 6.544362908605737, "ref_ce_loss": 0.059286121279001236, "step": 19620 }, { "epoch": 6.544362908605737, "loss": 0.2737887501716614, "step": 19620 }, { "ce_loss": 0.011679647490382195, "epoch": 6.544362908605737, "step": 19620 }, { "distill_loss": 0.19483031332492828, "epoch": 6.544362908605737, "step": 19620 }, { "epoch": 6.544362908605737, "ref_ce_loss": 0.052262429147958755, "step": 19620 }, { "epoch": 6.544362908605737, "loss": 0.5098199844360352, "step": 19620 }, { "ce_loss": 0.13560065627098083, "epoch": 6.544362908605737, "step": 19620 }, { "distill_loss": 0.20365437865257263, "epoch": 6.544362908605737, "step": 19620 }, { "epoch": 6.544362908605737, "ref_ce_loss": 0.1021995022892952, "step": 19620 }, { "epoch": 6.547698465643762, "loss": 0.3097, "step": 19630 }, { "epoch": 6.547698465643762, "grad_norm": 2.537757635116577, "step": 19630 }, { "epoch": 6.547698465643762, "learning_rate": 2.5189601274954873e-05, "step": 19630 }, { "epoch": 6.547698465643762, "loss": 0.7845300436019897, "step": 19630 }, { "ce_loss": 0.09764357656240463, "epoch": 6.547698465643762, "step": 19630 }, { "distill_loss": 0.18290607631206512, "epoch": 6.547698465643762, "step": 19630 }, { "epoch": 6.547698465643762, "ref_ce_loss": 0.05711343511939049, "step": 19630 }, { "epoch": 6.547698465643762, "loss": 0.3228994607925415, "step": 19630 }, { "ce_loss": 0.017980199307203293, "epoch": 6.547698465643762, "step": 19630 }, { "distill_loss": 0.1762242466211319, "epoch": 6.547698465643762, "step": 19630 }, { "epoch": 6.547698465643762, "ref_ce_loss": 0.03918452188372612, "step": 19630 }, { "epoch": 6.547698465643762, "loss": 0.22011463344097137, "step": 19630 }, { "ce_loss": 0.04538943991065025, "epoch": 6.547698465643762, "step": 19630 }, { "distill_loss": 0.11361631751060486, "epoch": 6.547698465643762, "step": 19630 }, { "epoch": 6.547698465643762, "ref_ce_loss": 0.04930086433887482, "step": 19630 }, { "epoch": 6.547698465643762, "loss": 0.45519328117370605, "step": 19630 }, { "ce_loss": 0.06884275376796722, "epoch": 6.547698465643762, "step": 19630 }, { "distill_loss": 0.18536312878131866, "epoch": 6.547698465643762, "step": 19630 }, { "epoch": 6.547698465643762, "ref_ce_loss": 0.06121458113193512, "step": 19630 }, { "epoch": 6.551034022681788, "loss": 0.3376, "step": 19640 }, { "epoch": 6.551034022681788, "grad_norm": 2.452275514602661, "step": 19640 }, { "epoch": 6.551034022681788, "learning_rate": 2.507736003211435e-05, "step": 19640 }, { "epoch": 6.551034022681788, "loss": 0.5876684188842773, "step": 19640 }, { "ce_loss": 0.07951506227254868, "epoch": 6.551034022681788, "step": 19640 }, { "distill_loss": 0.17339465022087097, "epoch": 6.551034022681788, "step": 19640 }, { "epoch": 6.551034022681788, "ref_ce_loss": 0.06315672397613525, "step": 19640 }, { "epoch": 6.551034022681788, "loss": 0.4543311297893524, "step": 19640 }, { "ce_loss": 0.04960149526596069, "epoch": 6.551034022681788, "step": 19640 }, { "distill_loss": 0.1443757563829422, "epoch": 6.551034022681788, "step": 19640 }, { "epoch": 6.551034022681788, "ref_ce_loss": 0.075065977871418, "step": 19640 }, { "epoch": 6.551034022681788, "loss": 0.29707980155944824, "step": 19640 }, { "ce_loss": 0.06787025928497314, "epoch": 6.551034022681788, "step": 19640 }, { "distill_loss": 0.16085000336170197, "epoch": 6.551034022681788, "step": 19640 }, { "epoch": 6.551034022681788, "ref_ce_loss": 0.058474279940128326, "step": 19640 }, { "epoch": 6.551034022681788, "loss": 0.45773807168006897, "step": 19640 }, { "ce_loss": 0.06551582366228104, "epoch": 6.551034022681788, "step": 19640 }, { "distill_loss": 0.12983930110931396, "epoch": 6.551034022681788, "step": 19640 }, { "epoch": 6.551034022681788, "ref_ce_loss": 0.06701023131608963, "step": 19640 }, { "epoch": 6.554369579719813, "loss": 0.35, "step": 19650 }, { "epoch": 6.554369579719813, "grad_norm": 2.6646275520324707, "step": 19650 }, { "epoch": 6.554369579719813, "learning_rate": 2.4965346598891185e-05, "step": 19650 }, { "epoch": 6.554369579719813, "loss": 0.32107043266296387, "step": 19650 }, { "ce_loss": 0.06096203625202179, "epoch": 6.554369579719813, "step": 19650 }, { "distill_loss": 0.15283983945846558, "epoch": 6.554369579719813, "step": 19650 }, { "epoch": 6.554369579719813, "ref_ce_loss": 0.07251904904842377, "step": 19650 }, { "epoch": 6.554369579719813, "loss": 0.3576314449310303, "step": 19650 }, { "ce_loss": 0.07515022158622742, "epoch": 6.554369579719813, "step": 19650 }, { "distill_loss": 0.1723017394542694, "epoch": 6.554369579719813, "step": 19650 }, { "epoch": 6.554369579719813, "ref_ce_loss": 0.07873984426259995, "step": 19650 }, { "epoch": 6.554369579719813, "loss": 0.21501293778419495, "step": 19650 }, { "ce_loss": 0.009262898936867714, "epoch": 6.554369579719813, "step": 19650 }, { "distill_loss": 0.15100812911987305, "epoch": 6.554369579719813, "step": 19650 }, { "epoch": 6.554369579719813, "ref_ce_loss": 0.0326073132455349, "step": 19650 }, { "epoch": 6.554369579719813, "loss": 0.429170161485672, "step": 19650 }, { "ce_loss": 0.0786544531583786, "epoch": 6.554369579719813, "step": 19650 }, { "distill_loss": 0.2469421774148941, "epoch": 6.554369579719813, "step": 19650 }, { "epoch": 6.554369579719813, "ref_ce_loss": 0.08241055905818939, "step": 19650 }, { "epoch": 6.557705136757838, "loss": 0.3269, "step": 19660 }, { "epoch": 6.557705136757838, "grad_norm": 2.3054118156433105, "step": 19660 }, { "epoch": 6.557705136757838, "learning_rate": 2.485356117955367e-05, "step": 19660 }, { "epoch": 6.557705136757838, "loss": 0.3260459899902344, "step": 19660 }, { "ce_loss": 0.09615436941385269, "epoch": 6.557705136757838, "step": 19660 }, { "distill_loss": 0.1380174458026886, "epoch": 6.557705136757838, "step": 19660 }, { "epoch": 6.557705136757838, "ref_ce_loss": 0.05804324895143509, "step": 19660 }, { "epoch": 6.557705136757838, "loss": 0.36628609895706177, "step": 19660 }, { "ce_loss": 0.015312734991312027, "epoch": 6.557705136757838, "step": 19660 }, { "distill_loss": 0.16019655764102936, "epoch": 6.557705136757838, "step": 19660 }, { "epoch": 6.557705136757838, "ref_ce_loss": 0.0526125393807888, "step": 19660 }, { "epoch": 6.557705136757838, "loss": 0.3451019525527954, "step": 19660 }, { "ce_loss": 0.024979015812277794, "epoch": 6.557705136757838, "step": 19660 }, { "distill_loss": 0.18551787734031677, "epoch": 6.557705136757838, "step": 19660 }, { "epoch": 6.557705136757838, "ref_ce_loss": 0.040732912719249725, "step": 19660 }, { "epoch": 6.557705136757838, "loss": 0.3880947232246399, "step": 19660 }, { "ce_loss": 0.03313489258289337, "epoch": 6.557705136757838, "step": 19660 }, { "distill_loss": 0.13498447835445404, "epoch": 6.557705136757838, "step": 19660 }, { "epoch": 6.557705136757838, "ref_ce_loss": 0.04829491302371025, "step": 19660 }, { "epoch": 6.561040693795864, "loss": 0.3219, "step": 19670 }, { "epoch": 6.561040693795864, "grad_norm": 5.515630722045898, "step": 19670 }, { "epoch": 6.561040693795864, "learning_rate": 2.4742003977954333e-05, "step": 19670 }, { "epoch": 6.561040693795864, "loss": 0.47084906697273254, "step": 19670 }, { "ce_loss": 0.09112807363271713, "epoch": 6.561040693795864, "step": 19670 }, { "distill_loss": 0.14611652493476868, "epoch": 6.561040693795864, "step": 19670 }, { "epoch": 6.561040693795864, "ref_ce_loss": 0.058892469853162766, "step": 19670 }, { "epoch": 6.561040693795864, "loss": 0.3811488747596741, "step": 19670 }, { "ce_loss": 0.04032864049077034, "epoch": 6.561040693795864, "step": 19670 }, { "distill_loss": 0.1337955743074417, "epoch": 6.561040693795864, "step": 19670 }, { "epoch": 6.561040693795864, "ref_ce_loss": 0.08700317144393921, "step": 19670 }, { "epoch": 6.561040693795864, "loss": 0.2712763547897339, "step": 19670 }, { "ce_loss": 0.02721991576254368, "epoch": 6.561040693795864, "step": 19670 }, { "distill_loss": 0.10200801491737366, "epoch": 6.561040693795864, "step": 19670 }, { "epoch": 6.561040693795864, "ref_ce_loss": 0.0590171180665493, "step": 19670 }, { "epoch": 6.561040693795864, "loss": 0.24486324191093445, "step": 19670 }, { "ce_loss": 0.0423901304602623, "epoch": 6.561040693795864, "step": 19670 }, { "distill_loss": 0.1491403877735138, "epoch": 6.561040693795864, "step": 19670 }, { "epoch": 6.561040693795864, "ref_ce_loss": 0.05284509062767029, "step": 19670 }, { "epoch": 6.564376250833889, "loss": 0.3541, "step": 19680 }, { "epoch": 6.564376250833889, "grad_norm": 3.7235469818115234, "step": 19680 }, { "epoch": 6.564376250833889, "learning_rate": 2.4630675197529502e-05, "step": 19680 }, { "epoch": 6.564376250833889, "loss": 0.4071599841117859, "step": 19680 }, { "ce_loss": 0.01765054650604725, "epoch": 6.564376250833889, "step": 19680 }, { "distill_loss": 0.14170853793621063, "epoch": 6.564376250833889, "step": 19680 }, { "epoch": 6.564376250833889, "ref_ce_loss": 0.042518824338912964, "step": 19680 }, { "epoch": 6.564376250833889, "loss": 0.43594300746917725, "step": 19680 }, { "ce_loss": 0.07345236092805862, "epoch": 6.564376250833889, "step": 19680 }, { "distill_loss": 0.1204528734087944, "epoch": 6.564376250833889, "step": 19680 }, { "epoch": 6.564376250833889, "ref_ce_loss": 0.05486462637782097, "step": 19680 }, { "epoch": 6.564376250833889, "loss": 0.2822025716304779, "step": 19680 }, { "ce_loss": 0.057404521852731705, "epoch": 6.564376250833889, "step": 19680 }, { "distill_loss": 0.15036694705486298, "epoch": 6.564376250833889, "step": 19680 }, { "epoch": 6.564376250833889, "ref_ce_loss": 0.05206619203090668, "step": 19680 }, { "epoch": 6.564376250833889, "loss": 0.21536864340305328, "step": 19680 }, { "ce_loss": 0.04800006374716759, "epoch": 6.564376250833889, "step": 19680 }, { "distill_loss": 0.10483649373054504, "epoch": 6.564376250833889, "step": 19680 }, { "epoch": 6.564376250833889, "ref_ce_loss": 0.06246866285800934, "step": 19680 }, { "epoch": 6.567711807871914, "loss": 0.3322, "step": 19690 }, { "epoch": 6.567711807871914, "grad_norm": 2.986670732498169, "step": 19690 }, { "epoch": 6.567711807871914, "learning_rate": 2.4519575041298934e-05, "step": 19690 }, { "epoch": 6.567711807871914, "loss": 0.28218936920166016, "step": 19690 }, { "ce_loss": 0.031078292056918144, "epoch": 6.567711807871914, "step": 19690 }, { "distill_loss": 0.18602138757705688, "epoch": 6.567711807871914, "step": 19690 }, { "epoch": 6.567711807871914, "ref_ce_loss": 0.06485332548618317, "step": 19690 }, { "epoch": 6.567711807871914, "loss": 0.7765505909919739, "step": 19690 }, { "ce_loss": 0.065073661506176, "epoch": 6.567711807871914, "step": 19690 }, { "distill_loss": 0.16883963346481323, "epoch": 6.567711807871914, "step": 19690 }, { "epoch": 6.567711807871914, "ref_ce_loss": 0.05374979227781296, "step": 19690 }, { "epoch": 6.567711807871914, "loss": 0.2536375820636749, "step": 19690 }, { "ce_loss": 0.03500214219093323, "epoch": 6.567711807871914, "step": 19690 }, { "distill_loss": 0.14786192774772644, "epoch": 6.567711807871914, "step": 19690 }, { "epoch": 6.567711807871914, "ref_ce_loss": 0.05251917988061905, "step": 19690 }, { "epoch": 6.567711807871914, "loss": 0.146646186709404, "step": 19690 }, { "ce_loss": 0.03096270188689232, "epoch": 6.567711807871914, "step": 19690 }, { "distill_loss": 0.07382689416408539, "epoch": 6.567711807871914, "step": 19690 }, { "epoch": 6.567711807871914, "ref_ce_loss": 0.03107365407049656, "step": 19690 }, { "epoch": 6.57104736490994, "loss": 0.36, "step": 19700 }, { "epoch": 6.57104736490994, "grad_norm": 7.0674052238464355, "step": 19700 }, { "epoch": 6.57104736490994, "learning_rate": 2.4408703711865507e-05, "step": 19700 }, { "epoch": 6.57104736490994, "loss": 0.21289116144180298, "step": 19700 }, { "ce_loss": 0.010433454066514969, "epoch": 6.57104736490994, "step": 19700 }, { "distill_loss": 0.11860831081867218, "epoch": 6.57104736490994, "step": 19700 }, { "epoch": 6.57104736490994, "ref_ce_loss": 0.060785289853811264, "step": 19700 }, { "epoch": 6.57104736490994, "loss": 0.3320046067237854, "step": 19700 }, { "ce_loss": 0.10327617079019547, "epoch": 6.57104736490994, "step": 19700 }, { "distill_loss": 0.11185026913881302, "epoch": 6.57104736490994, "step": 19700 }, { "epoch": 6.57104736490994, "ref_ce_loss": 0.08796876668930054, "step": 19700 }, { "epoch": 6.57104736490994, "loss": 0.23405468463897705, "step": 19700 }, { "ce_loss": 0.022999800741672516, "epoch": 6.57104736490994, "step": 19700 }, { "distill_loss": 0.1573328971862793, "epoch": 6.57104736490994, "step": 19700 }, { "epoch": 6.57104736490994, "ref_ce_loss": 0.05357423424720764, "step": 19700 }, { "epoch": 6.57104736490994, "loss": 0.2924388349056244, "step": 19700 }, { "ce_loss": 0.046956613659858704, "epoch": 6.57104736490994, "step": 19700 }, { "distill_loss": 0.11884532868862152, "epoch": 6.57104736490994, "step": 19700 }, { "epoch": 6.57104736490994, "ref_ce_loss": 0.06552345305681229, "step": 19700 }, { "epoch": 6.574382921947965, "loss": 0.3085, "step": 19710 }, { "epoch": 6.574382921947965, "grad_norm": 3.8683853149414062, "step": 19710 }, { "epoch": 6.574382921947965, "learning_rate": 2.4298061411414775e-05, "step": 19710 }, { "epoch": 6.574382921947965, "loss": 0.2815577983856201, "step": 19710 }, { "ce_loss": 0.03336150571703911, "epoch": 6.574382921947965, "step": 19710 }, { "distill_loss": 0.16840992867946625, "epoch": 6.574382921947965, "step": 19710 }, { "epoch": 6.574382921947965, "ref_ce_loss": 0.049621641635894775, "step": 19710 }, { "epoch": 6.574382921947965, "loss": 0.38527244329452515, "step": 19710 }, { "ce_loss": 0.030045749619603157, "epoch": 6.574382921947965, "step": 19710 }, { "distill_loss": 0.17417395114898682, "epoch": 6.574382921947965, "step": 19710 }, { "epoch": 6.574382921947965, "ref_ce_loss": 0.045052722096443176, "step": 19710 }, { "epoch": 6.574382921947965, "loss": 0.3040308952331543, "step": 19710 }, { "ce_loss": 0.06124549359083176, "epoch": 6.574382921947965, "step": 19710 }, { "distill_loss": 0.13505572080612183, "epoch": 6.574382921947965, "step": 19710 }, { "epoch": 6.574382921947965, "ref_ce_loss": 0.0780586525797844, "step": 19710 }, { "epoch": 6.574382921947965, "loss": 0.22426313161849976, "step": 19710 }, { "ce_loss": 0.008811813779175282, "epoch": 6.574382921947965, "step": 19710 }, { "distill_loss": 0.1401672214269638, "epoch": 6.574382921947965, "step": 19710 }, { "epoch": 6.574382921947965, "ref_ce_loss": 0.05200808495283127, "step": 19710 }, { "epoch": 6.57771847898599, "loss": 0.3185, "step": 19720 }, { "epoch": 6.57771847898599, "grad_norm": 2.2868518829345703, "step": 19720 }, { "epoch": 6.57771847898599, "learning_rate": 2.418764834171466e-05, "step": 19720 }, { "epoch": 6.57771847898599, "loss": 0.36733943223953247, "step": 19720 }, { "ce_loss": 0.06563469767570496, "epoch": 6.57771847898599, "step": 19720 }, { "distill_loss": 0.14585933089256287, "epoch": 6.57771847898599, "step": 19720 }, { "epoch": 6.57771847898599, "ref_ce_loss": 0.08136098086833954, "step": 19720 }, { "epoch": 6.57771847898599, "loss": 0.5045070648193359, "step": 19720 }, { "ce_loss": 0.10214339941740036, "epoch": 6.57771847898599, "step": 19720 }, { "distill_loss": 0.14387455582618713, "epoch": 6.57771847898599, "step": 19720 }, { "epoch": 6.57771847898599, "ref_ce_loss": 0.08991748094558716, "step": 19720 }, { "epoch": 6.57771847898599, "loss": 0.35117292404174805, "step": 19720 }, { "ce_loss": 0.05239369720220566, "epoch": 6.57771847898599, "step": 19720 }, { "distill_loss": 0.1372997909784317, "epoch": 6.57771847898599, "step": 19720 }, { "epoch": 6.57771847898599, "ref_ce_loss": 0.07272414863109589, "step": 19720 }, { "epoch": 6.57771847898599, "loss": 0.21220049262046814, "step": 19720 }, { "ce_loss": 0.035478733479976654, "epoch": 6.57771847898599, "step": 19720 }, { "distill_loss": 0.09870263934135437, "epoch": 6.57771847898599, "step": 19720 }, { "epoch": 6.57771847898599, "ref_ce_loss": 0.04688771069049835, "step": 19720 }, { "epoch": 6.581054036024016, "loss": 0.3218, "step": 19730 }, { "epoch": 6.581054036024016, "grad_norm": 2.9975807666778564, "step": 19730 }, { "epoch": 6.581054036024016, "learning_rate": 2.407746470411508e-05, "step": 19730 }, { "epoch": 6.581054036024016, "loss": 0.3309047818183899, "step": 19730 }, { "ce_loss": 0.026400120928883553, "epoch": 6.581054036024016, "step": 19730 }, { "distill_loss": 0.20293043553829193, "epoch": 6.581054036024016, "step": 19730 }, { "epoch": 6.581054036024016, "ref_ce_loss": 0.07825887948274612, "step": 19730 }, { "epoch": 6.581054036024016, "loss": 0.41066548228263855, "step": 19730 }, { "ce_loss": 0.0435505174100399, "epoch": 6.581054036024016, "step": 19730 }, { "distill_loss": 0.17935574054718018, "epoch": 6.581054036024016, "step": 19730 }, { "epoch": 6.581054036024016, "ref_ce_loss": 0.06096060946583748, "step": 19730 }, { "epoch": 6.581054036024016, "loss": 0.5351892113685608, "step": 19730 }, { "ce_loss": 0.06245271489024162, "epoch": 6.581054036024016, "step": 19730 }, { "distill_loss": 0.2792205810546875, "epoch": 6.581054036024016, "step": 19730 }, { "epoch": 6.581054036024016, "ref_ce_loss": 0.09187360852956772, "step": 19730 }, { "epoch": 6.581054036024016, "loss": 0.21385572850704193, "step": 19730 }, { "ce_loss": 0.04891948029398918, "epoch": 6.581054036024016, "step": 19730 }, { "distill_loss": 0.09563419222831726, "epoch": 6.581054036024016, "step": 19730 }, { "epoch": 6.581054036024016, "ref_ce_loss": 0.05390109866857529, "step": 19730 }, { "epoch": 6.584389593062041, "loss": 0.3156, "step": 19740 }, { "epoch": 6.584389593062041, "grad_norm": 3.010976791381836, "step": 19740 }, { "epoch": 6.584389593062041, "learning_rate": 2.3967510699547453e-05, "step": 19740 }, { "epoch": 6.584389593062041, "loss": 0.3294471204280853, "step": 19740 }, { "ce_loss": 0.05028639733791351, "epoch": 6.584389593062041, "step": 19740 }, { "distill_loss": 0.17179375886917114, "epoch": 6.584389593062041, "step": 19740 }, { "epoch": 6.584389593062041, "ref_ce_loss": 0.04911860451102257, "step": 19740 }, { "epoch": 6.584389593062041, "loss": 0.9547842741012573, "step": 19740 }, { "ce_loss": 0.062249574810266495, "epoch": 6.584389593062041, "step": 19740 }, { "distill_loss": 0.17876005172729492, "epoch": 6.584389593062041, "step": 19740 }, { "epoch": 6.584389593062041, "ref_ce_loss": 0.08448649197816849, "step": 19740 }, { "epoch": 6.584389593062041, "loss": 0.1854000985622406, "step": 19740 }, { "ce_loss": 0.03438950702548027, "epoch": 6.584389593062041, "step": 19740 }, { "distill_loss": 0.10151194036006927, "epoch": 6.584389593062041, "step": 19740 }, { "epoch": 6.584389593062041, "ref_ce_loss": 0.034116681665182114, "step": 19740 }, { "epoch": 6.584389593062041, "loss": 0.24452723562717438, "step": 19740 }, { "ce_loss": 0.02735762856900692, "epoch": 6.584389593062041, "step": 19740 }, { "distill_loss": 0.14382018148899078, "epoch": 6.584389593062041, "step": 19740 }, { "epoch": 6.584389593062041, "ref_ce_loss": 0.054051473736763, "step": 19740 }, { "epoch": 6.587725150100066, "loss": 0.3189, "step": 19750 }, { "epoch": 6.587725150100066, "grad_norm": 3.135653495788574, "step": 19750 }, { "epoch": 6.587725150100066, "learning_rate": 2.3857786528524607e-05, "step": 19750 }, { "epoch": 6.587725150100066, "loss": 0.21763204038143158, "step": 19750 }, { "ce_loss": 0.04860735684633255, "epoch": 6.587725150100066, "step": 19750 }, { "distill_loss": 0.12845882773399353, "epoch": 6.587725150100066, "step": 19750 }, { "epoch": 6.587725150100066, "ref_ce_loss": 0.04027068614959717, "step": 19750 }, { "epoch": 6.587725150100066, "loss": 0.466017484664917, "step": 19750 }, { "ce_loss": 0.09125525504350662, "epoch": 6.587725150100066, "step": 19750 }, { "distill_loss": 0.25751644372940063, "epoch": 6.587725150100066, "step": 19750 }, { "epoch": 6.587725150100066, "ref_ce_loss": 0.05083215609192848, "step": 19750 }, { "epoch": 6.587725150100066, "loss": 0.316772997379303, "step": 19750 }, { "ce_loss": 0.007712164893746376, "epoch": 6.587725150100066, "step": 19750 }, { "distill_loss": 0.14403752982616425, "epoch": 6.587725150100066, "step": 19750 }, { "epoch": 6.587725150100066, "ref_ce_loss": 0.052840352058410645, "step": 19750 }, { "epoch": 6.587725150100066, "loss": 0.352708637714386, "step": 19750 }, { "ce_loss": 0.05708223953843117, "epoch": 6.587725150100066, "step": 19750 }, { "distill_loss": 0.15306146442890167, "epoch": 6.587725150100066, "step": 19750 }, { "epoch": 6.587725150100066, "ref_ce_loss": 0.05850322172045708, "step": 19750 }, { "epoch": 6.591060707138092, "loss": 0.3042, "step": 19760 }, { "epoch": 6.591060707138092, "grad_norm": 2.567244291305542, "step": 19760 }, { "epoch": 6.591060707138092, "learning_rate": 2.374829239114e-05, "step": 19760 }, { "epoch": 6.591060707138092, "loss": 0.25095030665397644, "step": 19760 }, { "ce_loss": 0.0396430529654026, "epoch": 6.591060707138092, "step": 19760 }, { "distill_loss": 0.0905349925160408, "epoch": 6.591060707138092, "step": 19760 }, { "epoch": 6.591060707138092, "ref_ce_loss": 0.053239136934280396, "step": 19760 }, { "epoch": 6.591060707138092, "loss": 0.28194355964660645, "step": 19760 }, { "ce_loss": 0.044967278838157654, "epoch": 6.591060707138092, "step": 19760 }, { "distill_loss": 0.1526344120502472, "epoch": 6.591060707138092, "step": 19760 }, { "epoch": 6.591060707138092, "ref_ce_loss": 0.058355461806058884, "step": 19760 }, { "epoch": 6.591060707138092, "loss": 0.277133584022522, "step": 19760 }, { "ce_loss": 0.02993590198457241, "epoch": 6.591060707138092, "step": 19760 }, { "distill_loss": 0.09442713856697083, "epoch": 6.591060707138092, "step": 19760 }, { "epoch": 6.591060707138092, "ref_ce_loss": 0.0476006343960762, "step": 19760 }, { "epoch": 6.591060707138092, "loss": 0.6724720001220703, "step": 19760 }, { "ce_loss": 0.046295564621686935, "epoch": 6.591060707138092, "step": 19760 }, { "distill_loss": 0.1278403103351593, "epoch": 6.591060707138092, "step": 19760 }, { "epoch": 6.591060707138092, "ref_ce_loss": 0.035039693117141724, "step": 19760 }, { "epoch": 6.594396264176117, "loss": 0.3334, "step": 19770 }, { "epoch": 6.594396264176117, "grad_norm": 2.155388355255127, "step": 19770 }, { "epoch": 6.594396264176117, "learning_rate": 2.363902848706789e-05, "step": 19770 }, { "epoch": 6.594396264176117, "loss": 0.2597852349281311, "step": 19770 }, { "ce_loss": 0.03040352649986744, "epoch": 6.594396264176117, "step": 19770 }, { "distill_loss": 0.1443725973367691, "epoch": 6.594396264176117, "step": 19770 }, { "epoch": 6.594396264176117, "ref_ce_loss": 0.08462069928646088, "step": 19770 }, { "epoch": 6.594396264176117, "loss": 0.17788177728652954, "step": 19770 }, { "ce_loss": 0.00964513048529625, "epoch": 6.594396264176117, "step": 19770 }, { "distill_loss": 0.11348067969083786, "epoch": 6.594396264176117, "step": 19770 }, { "epoch": 6.594396264176117, "ref_ce_loss": 0.05442032590508461, "step": 19770 }, { "epoch": 6.594396264176117, "loss": 0.31917548179626465, "step": 19770 }, { "ce_loss": 0.046949710696935654, "epoch": 6.594396264176117, "step": 19770 }, { "distill_loss": 0.14541833102703094, "epoch": 6.594396264176117, "step": 19770 }, { "epoch": 6.594396264176117, "ref_ce_loss": 0.09034581482410431, "step": 19770 }, { "epoch": 6.594396264176117, "loss": 0.28402474522590637, "step": 19770 }, { "ce_loss": 0.03271313011646271, "epoch": 6.594396264176117, "step": 19770 }, { "distill_loss": 0.14252933859825134, "epoch": 6.594396264176117, "step": 19770 }, { "epoch": 6.594396264176117, "ref_ce_loss": 0.05029868707060814, "step": 19770 }, { "epoch": 6.5977318212141425, "loss": 0.2999, "step": 19780 }, { "epoch": 6.5977318212141425, "grad_norm": 2.598196029663086, "step": 19780 }, { "epoch": 6.5977318212141425, "learning_rate": 2.352999501556251e-05, "step": 19780 }, { "epoch": 6.5977318212141425, "loss": 0.37572720646858215, "step": 19780 }, { "ce_loss": 0.027209611609578133, "epoch": 6.5977318212141425, "step": 19780 }, { "distill_loss": 0.16785335540771484, "epoch": 6.5977318212141425, "step": 19780 }, { "epoch": 6.5977318212141425, "ref_ce_loss": 0.06948821246623993, "step": 19780 }, { "epoch": 6.5977318212141425, "loss": 0.46975427865982056, "step": 19780 }, { "ce_loss": 0.040823787450790405, "epoch": 6.5977318212141425, "step": 19780 }, { "distill_loss": 0.10489165782928467, "epoch": 6.5977318212141425, "step": 19780 }, { "epoch": 6.5977318212141425, "ref_ce_loss": 0.04194294288754463, "step": 19780 }, { "epoch": 6.5977318212141425, "loss": 0.20247483253479004, "step": 19780 }, { "ce_loss": 0.04620786011219025, "epoch": 6.5977318212141425, "step": 19780 }, { "distill_loss": 0.10938869416713715, "epoch": 6.5977318212141425, "step": 19780 }, { "epoch": 6.5977318212141425, "ref_ce_loss": 0.04655927047133446, "step": 19780 }, { "epoch": 6.5977318212141425, "loss": 0.35143670439720154, "step": 19780 }, { "ce_loss": 0.06539808213710785, "epoch": 6.5977318212141425, "step": 19780 }, { "distill_loss": 0.1366189569234848, "epoch": 6.5977318212141425, "step": 19780 }, { "epoch": 6.5977318212141425, "ref_ce_loss": 0.06182894483208656, "step": 19780 }, { "epoch": 6.601067378252168, "loss": 0.3129, "step": 19790 }, { "epoch": 6.601067378252168, "grad_norm": 2.330700159072876, "step": 19790 }, { "epoch": 6.601067378252168, "learning_rate": 2.3421192175457837e-05, "step": 19790 }, { "epoch": 6.601067378252168, "loss": 0.30677473545074463, "step": 19790 }, { "ce_loss": 0.025118449702858925, "epoch": 6.601067378252168, "step": 19790 }, { "distill_loss": 0.17951764166355133, "epoch": 6.601067378252168, "step": 19790 }, { "epoch": 6.601067378252168, "ref_ce_loss": 0.036090608686208725, "step": 19790 }, { "epoch": 6.601067378252168, "loss": 0.4100573658943176, "step": 19790 }, { "ce_loss": 0.057707469910383224, "epoch": 6.601067378252168, "step": 19790 }, { "distill_loss": 0.2209986001253128, "epoch": 6.601067378252168, "step": 19790 }, { "epoch": 6.601067378252168, "ref_ce_loss": 0.09653498977422714, "step": 19790 }, { "epoch": 6.601067378252168, "loss": 0.24320702254772186, "step": 19790 }, { "ce_loss": 0.03562565892934799, "epoch": 6.601067378252168, "step": 19790 }, { "distill_loss": 0.13927190005779266, "epoch": 6.601067378252168, "step": 19790 }, { "epoch": 6.601067378252168, "ref_ce_loss": 0.03783516213297844, "step": 19790 }, { "epoch": 6.601067378252168, "loss": 0.32648521661758423, "step": 19790 }, { "ce_loss": 0.038021087646484375, "epoch": 6.601067378252168, "step": 19790 }, { "distill_loss": 0.14461548626422882, "epoch": 6.601067378252168, "step": 19790 }, { "epoch": 6.601067378252168, "ref_ce_loss": 0.068316750228405, "step": 19790 }, { "epoch": 6.604402935290193, "loss": 0.3589, "step": 19800 }, { "epoch": 6.604402935290193, "grad_norm": 3.626997947692871, "step": 19800 }, { "epoch": 6.604402935290193, "learning_rate": 2.331262016516736e-05, "step": 19800 }, { "epoch": 6.604402935290193, "loss": 0.557469367980957, "step": 19800 }, { "ce_loss": 0.04689061641693115, "epoch": 6.604402935290193, "step": 19800 }, { "distill_loss": 0.10365889221429825, "epoch": 6.604402935290193, "step": 19800 }, { "epoch": 6.604402935290193, "ref_ce_loss": 0.04763340950012207, "step": 19800 }, { "epoch": 6.604402935290193, "loss": 0.24095264077186584, "step": 19800 }, { "ce_loss": 0.009478450752794743, "epoch": 6.604402935290193, "step": 19800 }, { "distill_loss": 0.16503193974494934, "epoch": 6.604402935290193, "step": 19800 }, { "epoch": 6.604402935290193, "ref_ce_loss": 0.039028752595186234, "step": 19800 }, { "epoch": 6.604402935290193, "loss": 0.2705317437648773, "step": 19800 }, { "ce_loss": 0.034018658101558685, "epoch": 6.604402935290193, "step": 19800 }, { "distill_loss": 0.15237151086330414, "epoch": 6.604402935290193, "step": 19800 }, { "epoch": 6.604402935290193, "ref_ce_loss": 0.08372320234775543, "step": 19800 }, { "epoch": 6.604402935290193, "loss": 0.206894651055336, "step": 19800 }, { "ce_loss": 0.023637909442186356, "epoch": 6.604402935290193, "step": 19800 }, { "distill_loss": 0.13018612563610077, "epoch": 6.604402935290193, "step": 19800 }, { "epoch": 6.604402935290193, "ref_ce_loss": 0.03761601820588112, "step": 19800 }, { "epoch": 6.6077384923282185, "loss": 0.32, "step": 19810 }, { "epoch": 6.6077384923282185, "grad_norm": 2.208696126937866, "step": 19810 }, { "epoch": 6.6077384923282185, "learning_rate": 2.320427918268367e-05, "step": 19810 }, { "epoch": 6.6077384923282185, "loss": 0.16685768961906433, "step": 19810 }, { "ce_loss": 0.032697293907403946, "epoch": 6.6077384923282185, "step": 19810 }, { "distill_loss": 0.10233201086521149, "epoch": 6.6077384923282185, "step": 19810 }, { "epoch": 6.6077384923282185, "ref_ce_loss": 0.03176421672105789, "step": 19810 }, { "epoch": 6.6077384923282185, "loss": 0.3800828158855438, "step": 19810 }, { "ce_loss": 0.042058251798152924, "epoch": 6.6077384923282185, "step": 19810 }, { "distill_loss": 0.12607955932617188, "epoch": 6.6077384923282185, "step": 19810 }, { "epoch": 6.6077384923282185, "ref_ce_loss": 0.07670360058546066, "step": 19810 }, { "epoch": 6.6077384923282185, "loss": 0.22122445702552795, "step": 19810 }, { "ce_loss": 0.01819663867354393, "epoch": 6.6077384923282185, "step": 19810 }, { "distill_loss": 0.14853982627391815, "epoch": 6.6077384923282185, "step": 19810 }, { "epoch": 6.6077384923282185, "ref_ce_loss": 0.03529641404747963, "step": 19810 }, { "epoch": 6.6077384923282185, "loss": 0.6402498483657837, "step": 19810 }, { "ce_loss": 0.06503716856241226, "epoch": 6.6077384923282185, "step": 19810 }, { "distill_loss": 0.13584941625595093, "epoch": 6.6077384923282185, "step": 19810 }, { "epoch": 6.6077384923282185, "ref_ce_loss": 0.06338845938444138, "step": 19810 }, { "epoch": 6.611074049366244, "loss": 0.3375, "step": 19820 }, { "epoch": 6.611074049366244, "grad_norm": 2.6924424171447754, "step": 19820 }, { "epoch": 6.611074049366244, "learning_rate": 2.3096169425577826e-05, "step": 19820 }, { "epoch": 6.611074049366244, "loss": 0.21234257519245148, "step": 19820 }, { "ce_loss": 0.030125008895993233, "epoch": 6.611074049366244, "step": 19820 }, { "distill_loss": 0.13141590356826782, "epoch": 6.611074049366244, "step": 19820 }, { "epoch": 6.611074049366244, "ref_ce_loss": 0.05019145458936691, "step": 19820 }, { "epoch": 6.611074049366244, "loss": 0.21372869610786438, "step": 19820 }, { "ce_loss": 0.023508066311478615, "epoch": 6.611074049366244, "step": 19820 }, { "distill_loss": 0.14203333854675293, "epoch": 6.611074049366244, "step": 19820 }, { "epoch": 6.611074049366244, "ref_ce_loss": 0.03470898047089577, "step": 19820 }, { "epoch": 6.611074049366244, "loss": 0.25468939542770386, "step": 19820 }, { "ce_loss": 0.04465780407190323, "epoch": 6.611074049366244, "step": 19820 }, { "distill_loss": 0.13727760314941406, "epoch": 6.611074049366244, "step": 19820 }, { "epoch": 6.611074049366244, "ref_ce_loss": 0.07237095385789871, "step": 19820 }, { "epoch": 6.611074049366244, "loss": 0.24952755868434906, "step": 19820 }, { "ce_loss": 0.03197052329778671, "epoch": 6.611074049366244, "step": 19820 }, { "distill_loss": 0.1438474804162979, "epoch": 6.611074049366244, "step": 19820 }, { "epoch": 6.611074049366244, "ref_ce_loss": 0.049808233976364136, "step": 19820 }, { "epoch": 6.614409606404269, "loss": 0.3421, "step": 19830 }, { "epoch": 6.614409606404269, "grad_norm": 2.183060646057129, "step": 19830 }, { "epoch": 6.614409606404269, "learning_rate": 2.2988291090999555e-05, "step": 19830 }, { "epoch": 6.614409606404269, "loss": 0.18967588245868683, "step": 19830 }, { "ce_loss": 0.019939763471484184, "epoch": 6.614409606404269, "step": 19830 }, { "distill_loss": 0.13336925208568573, "epoch": 6.614409606404269, "step": 19830 }, { "epoch": 6.614409606404269, "ref_ce_loss": 0.03612978756427765, "step": 19830 }, { "epoch": 6.614409606404269, "loss": 0.18258681893348694, "step": 19830 }, { "ce_loss": 0.020963216200470924, "epoch": 6.614409606404269, "step": 19830 }, { "distill_loss": 0.09793965518474579, "epoch": 6.614409606404269, "step": 19830 }, { "epoch": 6.614409606404269, "ref_ce_loss": 0.04985945671796799, "step": 19830 }, { "epoch": 6.614409606404269, "loss": 0.2376529574394226, "step": 19830 }, { "ce_loss": 0.029153717681765556, "epoch": 6.614409606404269, "step": 19830 }, { "distill_loss": 0.14805583655834198, "epoch": 6.614409606404269, "step": 19830 }, { "epoch": 6.614409606404269, "ref_ce_loss": 0.04305458813905716, "step": 19830 }, { "epoch": 6.614409606404269, "loss": 0.38180723786354065, "step": 19830 }, { "ce_loss": 0.11473371833562851, "epoch": 6.614409606404269, "step": 19830 }, { "distill_loss": 0.14703942835330963, "epoch": 6.614409606404269, "step": 19830 }, { "epoch": 6.614409606404269, "ref_ce_loss": 0.0409589558839798, "step": 19830 }, { "epoch": 6.617745163442295, "loss": 0.3106, "step": 19840 }, { "epoch": 6.617745163442295, "grad_norm": 3.1054604053497314, "step": 19840 }, { "epoch": 6.617745163442295, "learning_rate": 2.2880644375676276e-05, "step": 19840 }, { "epoch": 6.617745163442295, "loss": 0.25367286801338196, "step": 19840 }, { "ce_loss": 0.06676454842090607, "epoch": 6.617745163442295, "step": 19840 }, { "distill_loss": 0.1250157654285431, "epoch": 6.617745163442295, "step": 19840 }, { "epoch": 6.617745163442295, "ref_ce_loss": 0.046141307801008224, "step": 19840 }, { "epoch": 6.617745163442295, "loss": 0.27414095401763916, "step": 19840 }, { "ce_loss": 0.005423164926469326, "epoch": 6.617745163442295, "step": 19840 }, { "distill_loss": 0.11123090237379074, "epoch": 6.617745163442295, "step": 19840 }, { "epoch": 6.617745163442295, "ref_ce_loss": 0.025682704523205757, "step": 19840 }, { "epoch": 6.617745163442295, "loss": 0.5452802777290344, "step": 19840 }, { "ce_loss": 0.05291275680065155, "epoch": 6.617745163442295, "step": 19840 }, { "distill_loss": 0.09778635203838348, "epoch": 6.617745163442295, "step": 19840 }, { "epoch": 6.617745163442295, "ref_ce_loss": 0.05122615769505501, "step": 19840 }, { "epoch": 6.617745163442295, "loss": 0.3839438259601593, "step": 19840 }, { "ce_loss": 0.06498966366052628, "epoch": 6.617745163442295, "step": 19840 }, { "distill_loss": 0.22986114025115967, "epoch": 6.617745163442295, "step": 19840 }, { "epoch": 6.617745163442295, "ref_ce_loss": 0.06074221059679985, "step": 19840 }, { "epoch": 6.62108072048032, "loss": 0.3192, "step": 19850 }, { "epoch": 6.62108072048032, "grad_norm": 1.9881457090377808, "step": 19850 }, { "epoch": 6.62108072048032, "learning_rate": 2.2773229475913163e-05, "step": 19850 }, { "epoch": 6.62108072048032, "loss": 0.3907688856124878, "step": 19850 }, { "ce_loss": 0.09534867107868195, "epoch": 6.62108072048032, "step": 19850 }, { "distill_loss": 0.1554652899503708, "epoch": 6.62108072048032, "step": 19850 }, { "epoch": 6.62108072048032, "ref_ce_loss": 0.07192099839448929, "step": 19850 }, { "epoch": 6.62108072048032, "loss": 0.2497331202030182, "step": 19850 }, { "ce_loss": 0.01978166028857231, "epoch": 6.62108072048032, "step": 19850 }, { "distill_loss": 0.12318174540996552, "epoch": 6.62108072048032, "step": 19850 }, { "epoch": 6.62108072048032, "ref_ce_loss": 0.040960054844617844, "step": 19850 }, { "epoch": 6.62108072048032, "loss": 0.31594735383987427, "step": 19850 }, { "ce_loss": 0.029789138585329056, "epoch": 6.62108072048032, "step": 19850 }, { "distill_loss": 0.1177356094121933, "epoch": 6.62108072048032, "step": 19850 }, { "epoch": 6.62108072048032, "ref_ce_loss": 0.05026852339506149, "step": 19850 }, { "epoch": 6.62108072048032, "loss": 0.4592970609664917, "step": 19850 }, { "ce_loss": 0.15999820828437805, "epoch": 6.62108072048032, "step": 19850 }, { "distill_loss": 0.21153071522712708, "epoch": 6.62108072048032, "step": 19850 }, { "epoch": 6.62108072048032, "ref_ce_loss": 0.0713123083114624, "step": 19850 }, { "epoch": 6.624416277518345, "loss": 0.3245, "step": 19860 }, { "epoch": 6.624416277518345, "grad_norm": 2.375483274459839, "step": 19860 }, { "epoch": 6.624416277518345, "learning_rate": 2.266604658759264e-05, "step": 19860 }, { "epoch": 6.624416277518345, "loss": 0.3437642455101013, "step": 19860 }, { "ce_loss": 0.04593540355563164, "epoch": 6.624416277518345, "step": 19860 }, { "distill_loss": 0.11273406445980072, "epoch": 6.624416277518345, "step": 19860 }, { "epoch": 6.624416277518345, "ref_ce_loss": 0.07087185233831406, "step": 19860 }, { "epoch": 6.624416277518345, "loss": 0.2620472311973572, "step": 19860 }, { "ce_loss": 0.032802462577819824, "epoch": 6.624416277518345, "step": 19860 }, { "distill_loss": 0.15501290559768677, "epoch": 6.624416277518345, "step": 19860 }, { "epoch": 6.624416277518345, "ref_ce_loss": 0.042995888739824295, "step": 19860 }, { "epoch": 6.624416277518345, "loss": 0.24187412858009338, "step": 19860 }, { "ce_loss": 0.036589235067367554, "epoch": 6.624416277518345, "step": 19860 }, { "distill_loss": 0.13851779699325562, "epoch": 6.624416277518345, "step": 19860 }, { "epoch": 6.624416277518345, "ref_ce_loss": 0.04687894880771637, "step": 19860 }, { "epoch": 6.624416277518345, "loss": 0.2755400836467743, "step": 19860 }, { "ce_loss": 0.049808893352746964, "epoch": 6.624416277518345, "step": 19860 }, { "distill_loss": 0.17341046035289764, "epoch": 6.624416277518345, "step": 19860 }, { "epoch": 6.624416277518345, "ref_ce_loss": 0.05203511193394661, "step": 19860 }, { "epoch": 6.627751834556371, "loss": 0.3231, "step": 19870 }, { "epoch": 6.627751834556371, "grad_norm": 3.1982598304748535, "step": 19870 }, { "epoch": 6.627751834556371, "learning_rate": 2.2559095906173975e-05, "step": 19870 }, { "epoch": 6.627751834556371, "loss": 0.2873413562774658, "step": 19870 }, { "ce_loss": 0.06314130872488022, "epoch": 6.627751834556371, "step": 19870 }, { "distill_loss": 0.12433145940303802, "epoch": 6.627751834556371, "step": 19870 }, { "epoch": 6.627751834556371, "ref_ce_loss": 0.07966389507055283, "step": 19870 }, { "epoch": 6.627751834556371, "loss": 0.29026103019714355, "step": 19870 }, { "ce_loss": 0.027912983670830727, "epoch": 6.627751834556371, "step": 19870 }, { "distill_loss": 0.20554634928703308, "epoch": 6.627751834556371, "step": 19870 }, { "epoch": 6.627751834556371, "ref_ce_loss": 0.05665810778737068, "step": 19870 }, { "epoch": 6.627751834556371, "loss": 0.26199522614479065, "step": 19870 }, { "ce_loss": 0.04706485942006111, "epoch": 6.627751834556371, "step": 19870 }, { "distill_loss": 0.13014474511146545, "epoch": 6.627751834556371, "step": 19870 }, { "epoch": 6.627751834556371, "ref_ce_loss": 0.044131312519311905, "step": 19870 }, { "epoch": 6.627751834556371, "loss": 0.2666011452674866, "step": 19870 }, { "ce_loss": 0.04390444606542587, "epoch": 6.627751834556371, "step": 19870 }, { "distill_loss": 0.12119217962026596, "epoch": 6.627751834556371, "step": 19870 }, { "epoch": 6.627751834556371, "ref_ce_loss": 0.08273381739854813, "step": 19870 }, { "epoch": 6.631087391594396, "loss": 0.3399, "step": 19880 }, { "epoch": 6.631087391594396, "grad_norm": 2.9726524353027344, "step": 19880 }, { "epoch": 6.631087391594396, "learning_rate": 2.2452377626693036e-05, "step": 19880 }, { "epoch": 6.631087391594396, "loss": 0.44170236587524414, "step": 19880 }, { "ce_loss": 0.08167297393083572, "epoch": 6.631087391594396, "step": 19880 }, { "distill_loss": 0.18584305047988892, "epoch": 6.631087391594396, "step": 19880 }, { "epoch": 6.631087391594396, "ref_ce_loss": 0.07174765318632126, "step": 19880 }, { "epoch": 6.631087391594396, "loss": 0.32071903347969055, "step": 19880 }, { "ce_loss": 0.04414434731006622, "epoch": 6.631087391594396, "step": 19880 }, { "distill_loss": 0.13835738599300385, "epoch": 6.631087391594396, "step": 19880 }, { "epoch": 6.631087391594396, "ref_ce_loss": 0.05073229968547821, "step": 19880 }, { "epoch": 6.631087391594396, "loss": 0.2592620849609375, "step": 19880 }, { "ce_loss": 0.06465975940227509, "epoch": 6.631087391594396, "step": 19880 }, { "distill_loss": 0.14733535051345825, "epoch": 6.631087391594396, "step": 19880 }, { "epoch": 6.631087391594396, "ref_ce_loss": 0.047104138880968094, "step": 19880 }, { "epoch": 6.631087391594396, "loss": 0.38310331106185913, "step": 19880 }, { "ce_loss": 0.10056553035974503, "epoch": 6.631087391594396, "step": 19880 }, { "distill_loss": 0.1666891872882843, "epoch": 6.631087391594396, "step": 19880 }, { "epoch": 6.631087391594396, "ref_ce_loss": 0.07143868505954742, "step": 19880 }, { "epoch": 6.634422948632421, "loss": 0.362, "step": 19890 }, { "epoch": 6.634422948632421, "grad_norm": 2.2830758094787598, "step": 19890 }, { "epoch": 6.634422948632421, "learning_rate": 2.2345891943761868e-05, "step": 19890 }, { "epoch": 6.634422948632421, "loss": 0.228542760014534, "step": 19890 }, { "ce_loss": 0.012633942998945713, "epoch": 6.634422948632421, "step": 19890 }, { "distill_loss": 0.10933070629835129, "epoch": 6.634422948632421, "step": 19890 }, { "epoch": 6.634422948632421, "ref_ce_loss": 0.03913666307926178, "step": 19890 }, { "epoch": 6.634422948632421, "loss": 0.25820690393447876, "step": 19890 }, { "ce_loss": 0.047330550849437714, "epoch": 6.634422948632421, "step": 19890 }, { "distill_loss": 0.11249566078186035, "epoch": 6.634422948632421, "step": 19890 }, { "epoch": 6.634422948632421, "ref_ce_loss": 0.07109885662794113, "step": 19890 }, { "epoch": 6.634422948632421, "loss": 0.2614418566226959, "step": 19890 }, { "ce_loss": 0.04621879383921623, "epoch": 6.634422948632421, "step": 19890 }, { "distill_loss": 0.12126925587654114, "epoch": 6.634422948632421, "step": 19890 }, { "epoch": 6.634422948632421, "ref_ce_loss": 0.06826559454202652, "step": 19890 }, { "epoch": 6.634422948632421, "loss": 0.3106783330440521, "step": 19890 }, { "ce_loss": 0.08366413414478302, "epoch": 6.634422948632421, "step": 19890 }, { "distill_loss": 0.1382652223110199, "epoch": 6.634422948632421, "step": 19890 }, { "epoch": 6.634422948632421, "ref_ce_loss": 0.05525009334087372, "step": 19890 }, { "epoch": 6.637758505670447, "loss": 0.3104, "step": 19900 }, { "epoch": 6.637758505670447, "grad_norm": 2.347456216812134, "step": 19900 }, { "epoch": 6.637758505670447, "learning_rate": 2.223963905156837e-05, "step": 19900 }, { "epoch": 6.637758505670447, "loss": 0.37175440788269043, "step": 19900 }, { "ce_loss": 0.026647180318832397, "epoch": 6.637758505670447, "step": 19900 }, { "distill_loss": 0.2211373895406723, "epoch": 6.637758505670447, "step": 19900 }, { "epoch": 6.637758505670447, "ref_ce_loss": 0.07487079501152039, "step": 19900 }, { "epoch": 6.637758505670447, "loss": 0.31490853428840637, "step": 19900 }, { "ce_loss": 0.025211889296770096, "epoch": 6.637758505670447, "step": 19900 }, { "distill_loss": 0.18323150277137756, "epoch": 6.637758505670447, "step": 19900 }, { "epoch": 6.637758505670447, "ref_ce_loss": 0.07476285099983215, "step": 19900 }, { "epoch": 6.637758505670447, "loss": 0.3347644507884979, "step": 19900 }, { "ce_loss": 0.07297145575284958, "epoch": 6.637758505670447, "step": 19900 }, { "distill_loss": 0.17718927562236786, "epoch": 6.637758505670447, "step": 19900 }, { "epoch": 6.637758505670447, "ref_ce_loss": 0.05877537652850151, "step": 19900 }, { "epoch": 6.637758505670447, "loss": 0.24840539693832397, "step": 19900 }, { "ce_loss": 0.026537470519542694, "epoch": 6.637758505670447, "step": 19900 }, { "distill_loss": 0.14945150911808014, "epoch": 6.637758505670447, "step": 19900 }, { "epoch": 6.637758505670447, "ref_ce_loss": 0.049950964748859406, "step": 19900 }, { "epoch": 6.641094062708472, "loss": 0.3258, "step": 19910 }, { "epoch": 6.641094062708472, "grad_norm": 3.141188383102417, "step": 19910 }, { "epoch": 6.641094062708472, "learning_rate": 2.2133619143875915e-05, "step": 19910 }, { "epoch": 6.641094062708472, "loss": 0.16172029078006744, "step": 19910 }, { "ce_loss": 0.019133074209094048, "epoch": 6.641094062708472, "step": 19910 }, { "distill_loss": 0.09872619807720184, "epoch": 6.641094062708472, "step": 19910 }, { "epoch": 6.641094062708472, "ref_ce_loss": 0.04360087215900421, "step": 19910 }, { "epoch": 6.641094062708472, "loss": 0.42412227392196655, "step": 19910 }, { "ce_loss": 0.06478115171194077, "epoch": 6.641094062708472, "step": 19910 }, { "distill_loss": 0.14595475792884827, "epoch": 6.641094062708472, "step": 19910 }, { "epoch": 6.641094062708472, "ref_ce_loss": 0.10079323500394821, "step": 19910 }, { "epoch": 6.641094062708472, "loss": 0.4226725697517395, "step": 19910 }, { "ce_loss": 0.0367148257791996, "epoch": 6.641094062708472, "step": 19910 }, { "distill_loss": 0.21141110360622406, "epoch": 6.641094062708472, "step": 19910 }, { "epoch": 6.641094062708472, "ref_ce_loss": 0.07302436232566833, "step": 19910 }, { "epoch": 6.641094062708472, "loss": 0.289275586605072, "step": 19910 }, { "ce_loss": 0.02699970081448555, "epoch": 6.641094062708472, "step": 19910 }, { "distill_loss": 0.18730556964874268, "epoch": 6.641094062708472, "step": 19910 }, { "epoch": 6.641094062708472, "ref_ce_loss": 0.052146654576063156, "step": 19910 }, { "epoch": 6.644429619746497, "loss": 0.3361, "step": 19920 }, { "epoch": 6.644429619746497, "grad_norm": 2.280363082885742, "step": 19920 }, { "epoch": 6.644429619746497, "learning_rate": 2.2027832414022946e-05, "step": 19920 }, { "epoch": 6.644429619746497, "loss": 0.29020074009895325, "step": 19920 }, { "ce_loss": 0.04425665736198425, "epoch": 6.644429619746497, "step": 19920 }, { "distill_loss": 0.16273631155490875, "epoch": 6.644429619746497, "step": 19920 }, { "epoch": 6.644429619746497, "ref_ce_loss": 0.08309007436037064, "step": 19920 }, { "epoch": 6.644429619746497, "loss": 0.26776793599128723, "step": 19920 }, { "ce_loss": 0.03876633942127228, "epoch": 6.644429619746497, "step": 19920 }, { "distill_loss": 0.1577092856168747, "epoch": 6.644429619746497, "step": 19920 }, { "epoch": 6.644429619746497, "ref_ce_loss": 0.053327981382608414, "step": 19920 }, { "epoch": 6.644429619746497, "loss": 1.059072732925415, "step": 19920 }, { "ce_loss": 0.05745551362633705, "epoch": 6.644429619746497, "step": 19920 }, { "distill_loss": 0.19399422407150269, "epoch": 6.644429619746497, "step": 19920 }, { "epoch": 6.644429619746497, "ref_ce_loss": 0.05763303115963936, "step": 19920 }, { "epoch": 6.644429619746497, "loss": 0.21548689901828766, "step": 19920 }, { "ce_loss": 0.027986306697130203, "epoch": 6.644429619746497, "step": 19920 }, { "distill_loss": 0.10086776316165924, "epoch": 6.644429619746497, "step": 19920 }, { "epoch": 6.644429619746497, "ref_ce_loss": 0.05304734781384468, "step": 19920 }, { "epoch": 6.647765176784523, "loss": 0.3455, "step": 19930 }, { "epoch": 6.647765176784523, "grad_norm": 2.4683637619018555, "step": 19930 }, { "epoch": 6.647765176784523, "learning_rate": 2.192227905492275e-05, "step": 19930 }, { "epoch": 6.647765176784523, "loss": 0.26920247077941895, "step": 19930 }, { "ce_loss": 0.06250952929258347, "epoch": 6.647765176784523, "step": 19930 }, { "distill_loss": 0.1409088373184204, "epoch": 6.647765176784523, "step": 19930 }, { "epoch": 6.647765176784523, "ref_ce_loss": 0.05059449374675751, "step": 19930 }, { "epoch": 6.647765176784523, "loss": 0.1810358762741089, "step": 19930 }, { "ce_loss": 0.023886408656835556, "epoch": 6.647765176784523, "step": 19930 }, { "distill_loss": 0.11785140633583069, "epoch": 6.647765176784523, "step": 19930 }, { "epoch": 6.647765176784523, "ref_ce_loss": 0.03921708092093468, "step": 19930 }, { "epoch": 6.647765176784523, "loss": 0.23555897176265717, "step": 19930 }, { "ce_loss": 0.029333343729376793, "epoch": 6.647765176784523, "step": 19930 }, { "distill_loss": 0.12829671800136566, "epoch": 6.647765176784523, "step": 19930 }, { "epoch": 6.647765176784523, "ref_ce_loss": 0.05001313239336014, "step": 19930 }, { "epoch": 6.647765176784523, "loss": 0.3103119134902954, "step": 19930 }, { "ce_loss": 0.04330439865589142, "epoch": 6.647765176784523, "step": 19930 }, { "distill_loss": 0.13884636759757996, "epoch": 6.647765176784523, "step": 19930 }, { "epoch": 6.647765176784523, "ref_ce_loss": 0.07495592534542084, "step": 19930 }, { "epoch": 6.651100733822548, "loss": 0.3722, "step": 19940 }, { "epoch": 6.651100733822548, "grad_norm": 2.8103508949279785, "step": 19940 }, { "epoch": 6.651100733822548, "learning_rate": 2.1816959259063034e-05, "step": 19940 }, { "epoch": 6.651100733822548, "loss": 0.7184747457504272, "step": 19940 }, { "ce_loss": 0.07260715216398239, "epoch": 6.651100733822548, "step": 19940 }, { "distill_loss": 0.17788881063461304, "epoch": 6.651100733822548, "step": 19940 }, { "epoch": 6.651100733822548, "ref_ce_loss": 0.05647752434015274, "step": 19940 }, { "epoch": 6.651100733822548, "loss": 0.34915584325790405, "step": 19940 }, { "ce_loss": 0.028711065649986267, "epoch": 6.651100733822548, "step": 19940 }, { "distill_loss": 0.12251932173967361, "epoch": 6.651100733822548, "step": 19940 }, { "epoch": 6.651100733822548, "ref_ce_loss": 0.057379350066185, "step": 19940 }, { "epoch": 6.651100733822548, "loss": 0.41050225496292114, "step": 19940 }, { "ce_loss": 0.008283420465886593, "epoch": 6.651100733822548, "step": 19940 }, { "distill_loss": 0.2010456621646881, "epoch": 6.651100733822548, "step": 19940 }, { "epoch": 6.651100733822548, "ref_ce_loss": 0.062345150858163834, "step": 19940 }, { "epoch": 6.651100733822548, "loss": 0.24619829654693604, "step": 19940 }, { "ce_loss": 0.03133643418550491, "epoch": 6.651100733822548, "step": 19940 }, { "distill_loss": 0.11145929992198944, "epoch": 6.651100733822548, "step": 19940 }, { "epoch": 6.651100733822548, "ref_ce_loss": 0.03711400181055069, "step": 19940 }, { "epoch": 6.654436290860573, "loss": 0.3623, "step": 19950 }, { "epoch": 6.654436290860573, "grad_norm": 5.005858898162842, "step": 19950 }, { "epoch": 6.654436290860573, "learning_rate": 2.1711873218505533e-05, "step": 19950 }, { "epoch": 6.654436290860573, "loss": 0.26657575368881226, "step": 19950 }, { "ce_loss": 0.05676257982850075, "epoch": 6.654436290860573, "step": 19950 }, { "distill_loss": 0.1302073895931244, "epoch": 6.654436290860573, "step": 19950 }, { "epoch": 6.654436290860573, "ref_ce_loss": 0.06166147440671921, "step": 19950 }, { "epoch": 6.654436290860573, "loss": 0.8528383374214172, "step": 19950 }, { "ce_loss": 0.13403521478176117, "epoch": 6.654436290860573, "step": 19950 }, { "distill_loss": 0.1755397915840149, "epoch": 6.654436290860573, "step": 19950 }, { "epoch": 6.654436290860573, "ref_ce_loss": 0.0475611612200737, "step": 19950 }, { "epoch": 6.654436290860573, "loss": 0.23576173186302185, "step": 19950 }, { "ce_loss": 0.032652635127305984, "epoch": 6.654436290860573, "step": 19950 }, { "distill_loss": 0.11732666939496994, "epoch": 6.654436290860573, "step": 19950 }, { "epoch": 6.654436290860573, "ref_ce_loss": 0.053122635930776596, "step": 19950 }, { "epoch": 6.654436290860573, "loss": 0.23466111719608307, "step": 19950 }, { "ce_loss": 0.018693789839744568, "epoch": 6.654436290860573, "step": 19950 }, { "distill_loss": 0.1024949923157692, "epoch": 6.654436290860573, "step": 19950 }, { "epoch": 6.654436290860573, "ref_ce_loss": 0.0520513616502285, "step": 19950 }, { "epoch": 6.657771847898599, "loss": 0.3175, "step": 19960 }, { "epoch": 6.657771847898599, "grad_norm": 2.6065797805786133, "step": 19960 }, { "epoch": 6.657771847898599, "learning_rate": 2.160702112488577e-05, "step": 19960 }, { "epoch": 6.657771847898599, "loss": 0.29395872354507446, "step": 19960 }, { "ce_loss": 0.0614352822303772, "epoch": 6.657771847898599, "step": 19960 }, { "distill_loss": 0.14185380935668945, "epoch": 6.657771847898599, "step": 19960 }, { "epoch": 6.657771847898599, "ref_ce_loss": 0.06420893967151642, "step": 19960 }, { "epoch": 6.657771847898599, "loss": 0.5165930390357971, "step": 19960 }, { "ce_loss": 0.018710751086473465, "epoch": 6.657771847898599, "step": 19960 }, { "distill_loss": 0.27090543508529663, "epoch": 6.657771847898599, "step": 19960 }, { "epoch": 6.657771847898599, "ref_ce_loss": 0.05845800042152405, "step": 19960 }, { "epoch": 6.657771847898599, "loss": 0.2694173753261566, "step": 19960 }, { "ce_loss": 0.034820299595594406, "epoch": 6.657771847898599, "step": 19960 }, { "distill_loss": 0.0995931401848793, "epoch": 6.657771847898599, "step": 19960 }, { "epoch": 6.657771847898599, "ref_ce_loss": 0.023787811398506165, "step": 19960 }, { "epoch": 6.657771847898599, "loss": 0.21081291139125824, "step": 19960 }, { "ce_loss": 0.04316055029630661, "epoch": 6.657771847898599, "step": 19960 }, { "distill_loss": 0.10261419415473938, "epoch": 6.657771847898599, "step": 19960 }, { "epoch": 6.657771847898599, "ref_ce_loss": 0.04531471058726311, "step": 19960 }, { "epoch": 6.661107404936624, "loss": 0.3254, "step": 19970 }, { "epoch": 6.661107404936624, "grad_norm": 3.9140877723693848, "step": 19970 }, { "epoch": 6.661107404936624, "learning_rate": 2.1502403169412564e-05, "step": 19970 }, { "epoch": 6.661107404936624, "loss": 0.4480225741863251, "step": 19970 }, { "ce_loss": 0.09914959967136383, "epoch": 6.661107404936624, "step": 19970 }, { "distill_loss": 0.2714824974536896, "epoch": 6.661107404936624, "step": 19970 }, { "epoch": 6.661107404936624, "ref_ce_loss": 0.05960004776716232, "step": 19970 }, { "epoch": 6.661107404936624, "loss": 0.29610762000083923, "step": 19970 }, { "ce_loss": 0.05531436204910278, "epoch": 6.661107404936624, "step": 19970 }, { "distill_loss": 0.12633873522281647, "epoch": 6.661107404936624, "step": 19970 }, { "epoch": 6.661107404936624, "ref_ce_loss": 0.02944735810160637, "step": 19970 }, { "epoch": 6.661107404936624, "loss": 0.2766069173812866, "step": 19970 }, { "ce_loss": 0.08344614505767822, "epoch": 6.661107404936624, "step": 19970 }, { "distill_loss": 0.12200739234685898, "epoch": 6.661107404936624, "step": 19970 }, { "epoch": 6.661107404936624, "ref_ce_loss": 0.07094717770814896, "step": 19970 }, { "epoch": 6.661107404936624, "loss": 0.29538315534591675, "step": 19970 }, { "ce_loss": 0.028203513473272324, "epoch": 6.661107404936624, "step": 19970 }, { "distill_loss": 0.10594826191663742, "epoch": 6.661107404936624, "step": 19970 }, { "epoch": 6.661107404936624, "ref_ce_loss": 0.03950703516602516, "step": 19970 }, { "epoch": 6.6644429619746495, "loss": 0.3542, "step": 19980 }, { "epoch": 6.6644429619746495, "grad_norm": 2.6698086261749268, "step": 19980 }, { "epoch": 6.6644429619746495, "learning_rate": 2.13980195428678e-05, "step": 19980 }, { "epoch": 6.6644429619746495, "loss": 0.20336762070655823, "step": 19980 }, { "ce_loss": 0.013415360823273659, "epoch": 6.6644429619746495, "step": 19980 }, { "distill_loss": 0.09042014181613922, "epoch": 6.6644429619746495, "step": 19980 }, { "epoch": 6.6644429619746495, "ref_ce_loss": 0.042300231754779816, "step": 19980 }, { "epoch": 6.6644429619746495, "loss": 0.2975046932697296, "step": 19980 }, { "ce_loss": 0.051743652671575546, "epoch": 6.6644429619746495, "step": 19980 }, { "distill_loss": 0.144939586520195, "epoch": 6.6644429619746495, "step": 19980 }, { "epoch": 6.6644429619746495, "ref_ce_loss": 0.05688483640551567, "step": 19980 }, { "epoch": 6.6644429619746495, "loss": 0.2748930752277374, "step": 19980 }, { "ce_loss": 0.048272471874952316, "epoch": 6.6644429619746495, "step": 19980 }, { "distill_loss": 0.13878586888313293, "epoch": 6.6644429619746495, "step": 19980 }, { "epoch": 6.6644429619746495, "ref_ce_loss": 0.07151763886213303, "step": 19980 }, { "epoch": 6.6644429619746495, "loss": 0.1987239122390747, "step": 19980 }, { "ce_loss": 0.020666783675551414, "epoch": 6.6644429619746495, "step": 19980 }, { "distill_loss": 0.09921713918447495, "epoch": 6.6644429619746495, "step": 19980 }, { "epoch": 6.6644429619746495, "ref_ce_loss": 0.05615457147359848, "step": 19980 }, { "epoch": 6.667778519012675, "loss": 0.3404, "step": 19990 }, { "epoch": 6.667778519012675, "grad_norm": 4.083954811096191, "step": 19990 }, { "epoch": 6.667778519012675, "learning_rate": 2.1293870435606047e-05, "step": 19990 }, { "epoch": 6.667778519012675, "loss": 0.3248312473297119, "step": 19990 }, { "ce_loss": 0.07799259573221207, "epoch": 6.667778519012675, "step": 19990 }, { "distill_loss": 0.17638051509857178, "epoch": 6.667778519012675, "step": 19990 }, { "epoch": 6.667778519012675, "ref_ce_loss": 0.06979241222143173, "step": 19990 }, { "epoch": 6.667778519012675, "loss": 0.3776983916759491, "step": 19990 }, { "ce_loss": 0.08105441927909851, "epoch": 6.667778519012675, "step": 19990 }, { "distill_loss": 0.18132013082504272, "epoch": 6.667778519012675, "step": 19990 }, { "epoch": 6.667778519012675, "ref_ce_loss": 0.08397438377141953, "step": 19990 }, { "epoch": 6.667778519012675, "loss": 0.4606001377105713, "step": 19990 }, { "ce_loss": 0.10863593220710754, "epoch": 6.667778519012675, "step": 19990 }, { "distill_loss": 0.16064369678497314, "epoch": 6.667778519012675, "step": 19990 }, { "epoch": 6.667778519012675, "ref_ce_loss": 0.10042203217744827, "step": 19990 }, { "epoch": 6.667778519012675, "loss": 0.3327219486236572, "step": 19990 }, { "ce_loss": 0.06780189275741577, "epoch": 6.667778519012675, "step": 19990 }, { "distill_loss": 0.2290269136428833, "epoch": 6.667778519012675, "step": 19990 }, { "epoch": 6.667778519012675, "ref_ce_loss": 0.03579997643828392, "step": 19990 }, { "epoch": 6.6711140760507, "loss": 0.3328, "step": 20000 }, { "epoch": 6.6711140760507, "grad_norm": 4.256439208984375, "step": 20000 }, { "epoch": 6.6711140760507, "learning_rate": 2.1189956037554197e-05, "step": 20000 }, { "epoch": 6.6711140760507, "loss": 0.22355251014232635, "step": 20000 }, { "ce_loss": 0.022118112072348595, "epoch": 6.6711140760507, "step": 20000 }, { "distill_loss": 0.12368350476026535, "epoch": 6.6711140760507, "step": 20000 }, { "epoch": 6.6711140760507, "ref_ce_loss": 0.07756340503692627, "step": 20000 }, { "epoch": 6.6711140760507, "loss": 0.24665537476539612, "step": 20000 }, { "ce_loss": 0.018713662400841713, "epoch": 6.6711140760507, "step": 20000 }, { "distill_loss": 0.1372748613357544, "epoch": 6.6711140760507, "step": 20000 }, { "epoch": 6.6711140760507, "ref_ce_loss": 0.026346355676651, "step": 20000 }, { "epoch": 6.6711140760507, "loss": 0.23919732868671417, "step": 20000 }, { "ce_loss": 0.03264044597744942, "epoch": 6.6711140760507, "step": 20000 }, { "distill_loss": 0.13985341787338257, "epoch": 6.6711140760507, "step": 20000 }, { "epoch": 6.6711140760507, "ref_ce_loss": 0.0665779784321785, "step": 20000 }, { "epoch": 6.6711140760507, "loss": 0.197180837392807, "step": 20000 }, { "ce_loss": 0.017834221944212914, "epoch": 6.6711140760507, "step": 20000 }, { "distill_loss": 0.11055406928062439, "epoch": 6.6711140760507, "step": 20000 }, { "epoch": 6.6711140760507, "ref_ce_loss": 0.049189746379852295, "step": 20000 }, { "epoch": 6.6744496330887255, "loss": 0.3045, "step": 20010 }, { "epoch": 6.6744496330887255, "grad_norm": 2.5865511894226074, "step": 20010 }, { "epoch": 6.6744496330887255, "learning_rate": 2.1086276538211144e-05, "step": 20010 }, { "epoch": 6.6744496330887255, "loss": 0.42007237672805786, "step": 20010 }, { "ce_loss": 0.08485287427902222, "epoch": 6.6744496330887255, "step": 20010 }, { "distill_loss": 0.14501990377902985, "epoch": 6.6744496330887255, "step": 20010 }, { "epoch": 6.6744496330887255, "ref_ce_loss": 0.07646860182285309, "step": 20010 }, { "epoch": 6.6744496330887255, "loss": 0.3304160237312317, "step": 20010 }, { "ce_loss": 0.018804801627993584, "epoch": 6.6744496330887255, "step": 20010 }, { "distill_loss": 0.11846814304590225, "epoch": 6.6744496330887255, "step": 20010 }, { "epoch": 6.6744496330887255, "ref_ce_loss": 0.05221167951822281, "step": 20010 }, { "epoch": 6.6744496330887255, "loss": 0.23049429059028625, "step": 20010 }, { "ce_loss": 0.024256093427538872, "epoch": 6.6744496330887255, "step": 20010 }, { "distill_loss": 0.15001031756401062, "epoch": 6.6744496330887255, "step": 20010 }, { "epoch": 6.6744496330887255, "ref_ce_loss": 0.05604208633303642, "step": 20010 }, { "epoch": 6.6744496330887255, "loss": 0.29497581720352173, "step": 20010 }, { "ce_loss": 0.012918825261294842, "epoch": 6.6744496330887255, "step": 20010 }, { "distill_loss": 0.11682042479515076, "epoch": 6.6744496330887255, "step": 20010 }, { "epoch": 6.6744496330887255, "ref_ce_loss": 0.05747944861650467, "step": 20010 }, { "epoch": 6.677785190126751, "loss": 0.319, "step": 20020 }, { "epoch": 6.677785190126751, "grad_norm": 2.0363032817840576, "step": 20020 }, { "epoch": 6.677785190126751, "learning_rate": 2.0982832126647352e-05, "step": 20020 }, { "epoch": 6.677785190126751, "loss": 0.44392961263656616, "step": 20020 }, { "ce_loss": 0.0858464390039444, "epoch": 6.677785190126751, "step": 20020 }, { "distill_loss": 0.2778363525867462, "epoch": 6.677785190126751, "step": 20020 }, { "epoch": 6.677785190126751, "ref_ce_loss": 0.08002053946256638, "step": 20020 }, { "epoch": 6.677785190126751, "loss": 0.2499372363090515, "step": 20020 }, { "ce_loss": 0.026511620730161667, "epoch": 6.677785190126751, "step": 20020 }, { "distill_loss": 0.09822176396846771, "epoch": 6.677785190126751, "step": 20020 }, { "epoch": 6.677785190126751, "ref_ce_loss": 0.03569779545068741, "step": 20020 }, { "epoch": 6.677785190126751, "loss": 0.2579372823238373, "step": 20020 }, { "ce_loss": 0.008953968994319439, "epoch": 6.677785190126751, "step": 20020 }, { "distill_loss": 0.1450115293264389, "epoch": 6.677785190126751, "step": 20020 }, { "epoch": 6.677785190126751, "ref_ce_loss": 0.08302398025989532, "step": 20020 }, { "epoch": 6.677785190126751, "loss": 0.26329633593559265, "step": 20020 }, { "ce_loss": 0.08157920092344284, "epoch": 6.677785190126751, "step": 20020 }, { "distill_loss": 0.1411052644252777, "epoch": 6.677785190126751, "step": 20020 }, { "epoch": 6.677785190126751, "ref_ce_loss": 0.031311504542827606, "step": 20020 }, { "epoch": 6.681120747164776, "loss": 0.3117, "step": 20030 }, { "epoch": 6.681120747164776, "grad_norm": 2.237546682357788, "step": 20030 }, { "epoch": 6.681120747164776, "learning_rate": 2.087962299150464e-05, "step": 20030 }, { "epoch": 6.681120747164776, "loss": 0.3706545829772949, "step": 20030 }, { "ce_loss": 0.05204419791698456, "epoch": 6.681120747164776, "step": 20030 }, { "distill_loss": 0.17340388894081116, "epoch": 6.681120747164776, "step": 20030 }, { "epoch": 6.681120747164776, "ref_ce_loss": 0.10555092245340347, "step": 20030 }, { "epoch": 6.681120747164776, "loss": 0.38419580459594727, "step": 20030 }, { "ce_loss": 0.029340149834752083, "epoch": 6.681120747164776, "step": 20030 }, { "distill_loss": 0.28156542778015137, "epoch": 6.681120747164776, "step": 20030 }, { "epoch": 6.681120747164776, "ref_ce_loss": 0.058754369616508484, "step": 20030 }, { "epoch": 6.681120747164776, "loss": 0.46436163783073425, "step": 20030 }, { "ce_loss": 0.031260956078767776, "epoch": 6.681120747164776, "step": 20030 }, { "distill_loss": 0.15603461861610413, "epoch": 6.681120747164776, "step": 20030 }, { "epoch": 6.681120747164776, "ref_ce_loss": 0.06692703813314438, "step": 20030 }, { "epoch": 6.681120747164776, "loss": 0.1949024498462677, "step": 20030 }, { "ce_loss": 0.016338814049959183, "epoch": 6.681120747164776, "step": 20030 }, { "distill_loss": 0.10759077966213226, "epoch": 6.681120747164776, "step": 20030 }, { "epoch": 6.681120747164776, "ref_ce_loss": 0.05400192365050316, "step": 20030 }, { "epoch": 6.684456304202802, "loss": 0.3256, "step": 20040 }, { "epoch": 6.684456304202802, "grad_norm": 4.661191940307617, "step": 20040 }, { "epoch": 6.684456304202802, "learning_rate": 2.0776649320995754e-05, "step": 20040 }, { "epoch": 6.684456304202802, "loss": 0.18244490027427673, "step": 20040 }, { "ce_loss": 0.004614561330527067, "epoch": 6.684456304202802, "step": 20040 }, { "distill_loss": 0.12099388986825943, "epoch": 6.684456304202802, "step": 20040 }, { "epoch": 6.684456304202802, "ref_ce_loss": 0.035793665796518326, "step": 20040 }, { "epoch": 6.684456304202802, "loss": 0.19822382926940918, "step": 20040 }, { "ce_loss": 0.03608737140893936, "epoch": 6.684456304202802, "step": 20040 }, { "distill_loss": 0.11221136897802353, "epoch": 6.684456304202802, "step": 20040 }, { "epoch": 6.684456304202802, "ref_ce_loss": 0.03879408538341522, "step": 20040 }, { "epoch": 6.684456304202802, "loss": 0.32134145498275757, "step": 20040 }, { "ce_loss": 0.050765812397003174, "epoch": 6.684456304202802, "step": 20040 }, { "distill_loss": 0.17691972851753235, "epoch": 6.684456304202802, "step": 20040 }, { "epoch": 6.684456304202802, "ref_ce_loss": 0.04532846063375473, "step": 20040 }, { "epoch": 6.684456304202802, "loss": 0.180606871843338, "step": 20040 }, { "ce_loss": 0.036758411675691605, "epoch": 6.684456304202802, "step": 20040 }, { "distill_loss": 0.07417429238557816, "epoch": 6.684456304202802, "step": 20040 }, { "epoch": 6.684456304202802, "ref_ce_loss": 0.042639512568712234, "step": 20040 }, { "epoch": 6.687791861240827, "loss": 0.2825, "step": 20050 }, { "epoch": 6.687791861240827, "grad_norm": 2.5875606536865234, "step": 20050 }, { "epoch": 6.687791861240827, "learning_rate": 2.0673911302904046e-05, "step": 20050 }, { "epoch": 6.687791861240827, "loss": 0.18170462548732758, "step": 20050 }, { "ce_loss": 0.025139065459370613, "epoch": 6.687791861240827, "step": 20050 }, { "distill_loss": 0.07967349141836166, "epoch": 6.687791861240827, "step": 20050 }, { "epoch": 6.687791861240827, "ref_ce_loss": 0.040121741592884064, "step": 20050 }, { "epoch": 6.687791861240827, "loss": 0.4045209586620331, "step": 20050 }, { "ce_loss": 0.02009030058979988, "epoch": 6.687791861240827, "step": 20050 }, { "distill_loss": 0.13364392518997192, "epoch": 6.687791861240827, "step": 20050 }, { "epoch": 6.687791861240827, "ref_ce_loss": 0.03764891251921654, "step": 20050 }, { "epoch": 6.687791861240827, "loss": 0.5435401201248169, "step": 20050 }, { "ce_loss": 0.05513055622577667, "epoch": 6.687791861240827, "step": 20050 }, { "distill_loss": 0.12308389693498611, "epoch": 6.687791861240827, "step": 20050 }, { "epoch": 6.687791861240827, "ref_ce_loss": 0.0426427386701107, "step": 20050 }, { "epoch": 6.687791861240827, "loss": 0.31125912070274353, "step": 20050 }, { "ce_loss": 0.078745536506176, "epoch": 6.687791861240827, "step": 20050 }, { "distill_loss": 0.1420706957578659, "epoch": 6.687791861240827, "step": 20050 }, { "epoch": 6.687791861240827, "ref_ce_loss": 0.050845950841903687, "step": 20050 }, { "epoch": 6.691127418278852, "loss": 0.3104, "step": 20060 }, { "epoch": 6.691127418278852, "grad_norm": 2.9328770637512207, "step": 20060 }, { "epoch": 6.691127418278852, "learning_rate": 2.05714091245832e-05, "step": 20060 }, { "epoch": 6.691127418278852, "loss": 0.28509998321533203, "step": 20060 }, { "ce_loss": 0.050861671566963196, "epoch": 6.691127418278852, "step": 20060 }, { "distill_loss": 0.13205139338970184, "epoch": 6.691127418278852, "step": 20060 }, { "epoch": 6.691127418278852, "ref_ce_loss": 0.040802810341119766, "step": 20060 }, { "epoch": 6.691127418278852, "loss": 0.33471447229385376, "step": 20060 }, { "ce_loss": 0.025919100269675255, "epoch": 6.691127418278852, "step": 20060 }, { "distill_loss": 0.158289834856987, "epoch": 6.691127418278852, "step": 20060 }, { "epoch": 6.691127418278852, "ref_ce_loss": 0.0653957799077034, "step": 20060 }, { "epoch": 6.691127418278852, "loss": 0.2894524037837982, "step": 20060 }, { "ce_loss": 0.04996928945183754, "epoch": 6.691127418278852, "step": 20060 }, { "distill_loss": 0.11954309046268463, "epoch": 6.691127418278852, "step": 20060 }, { "epoch": 6.691127418278852, "ref_ce_loss": 0.05506157875061035, "step": 20060 }, { "epoch": 6.691127418278852, "loss": 0.22797024250030518, "step": 20060 }, { "ce_loss": 0.03314273804426193, "epoch": 6.691127418278852, "step": 20060 }, { "distill_loss": 0.10626078397035599, "epoch": 6.691127418278852, "step": 20060 }, { "epoch": 6.691127418278852, "ref_ce_loss": 0.0642261654138565, "step": 20060 }, { "epoch": 6.694462975316878, "loss": 0.3243, "step": 20070 }, { "epoch": 6.694462975316878, "grad_norm": 3.6613128185272217, "step": 20070 }, { "epoch": 6.694462975316878, "learning_rate": 2.046914297295664e-05, "step": 20070 }, { "epoch": 6.694462975316878, "loss": 0.20102205872535706, "step": 20070 }, { "ce_loss": 0.03726993128657341, "epoch": 6.694462975316878, "step": 20070 }, { "distill_loss": 0.10248197615146637, "epoch": 6.694462975316878, "step": 20070 }, { "epoch": 6.694462975316878, "ref_ce_loss": 0.047353606671094894, "step": 20070 }, { "epoch": 6.694462975316878, "loss": 0.17317506670951843, "step": 20070 }, { "ce_loss": 0.005286495666950941, "epoch": 6.694462975316878, "step": 20070 }, { "distill_loss": 0.10332041233778, "epoch": 6.694462975316878, "step": 20070 }, { "epoch": 6.694462975316878, "ref_ce_loss": 0.06435637921094894, "step": 20070 }, { "epoch": 6.694462975316878, "loss": 0.5771521329879761, "step": 20070 }, { "ce_loss": 0.04414547234773636, "epoch": 6.694462975316878, "step": 20070 }, { "distill_loss": 0.24496690928936005, "epoch": 6.694462975316878, "step": 20070 }, { "epoch": 6.694462975316878, "ref_ce_loss": 0.0880177766084671, "step": 20070 }, { "epoch": 6.694462975316878, "loss": 0.4186633229255676, "step": 20070 }, { "ce_loss": 0.029012061655521393, "epoch": 6.694462975316878, "step": 20070 }, { "distill_loss": 0.14431175589561462, "epoch": 6.694462975316878, "step": 20070 }, { "epoch": 6.694462975316878, "ref_ce_loss": 0.04155807942152023, "step": 20070 }, { "epoch": 6.697798532354903, "loss": 0.3458, "step": 20080 }, { "epoch": 6.697798532354903, "grad_norm": 3.196727752685547, "step": 20080 }, { "epoch": 6.697798532354903, "learning_rate": 2.0367113034517564e-05, "step": 20080 }, { "epoch": 6.697798532354903, "loss": 0.2703838050365448, "step": 20080 }, { "ce_loss": 0.05823126807808876, "epoch": 6.697798532354903, "step": 20080 }, { "distill_loss": 0.11969301104545593, "epoch": 6.697798532354903, "step": 20080 }, { "epoch": 6.697798532354903, "ref_ce_loss": 0.07414967566728592, "step": 20080 }, { "epoch": 6.697798532354903, "loss": 0.3451869487762451, "step": 20080 }, { "ce_loss": 0.07295867055654526, "epoch": 6.697798532354903, "step": 20080 }, { "distill_loss": 0.1378656029701233, "epoch": 6.697798532354903, "step": 20080 }, { "epoch": 6.697798532354903, "ref_ce_loss": 0.07279222458600998, "step": 20080 }, { "epoch": 6.697798532354903, "loss": 0.2282976359128952, "step": 20080 }, { "ce_loss": 0.043244145810604095, "epoch": 6.697798532354903, "step": 20080 }, { "distill_loss": 0.11198434233665466, "epoch": 6.697798532354903, "step": 20080 }, { "epoch": 6.697798532354903, "ref_ce_loss": 0.0550074465572834, "step": 20080 }, { "epoch": 6.697798532354903, "loss": 0.4151057004928589, "step": 20080 }, { "ce_loss": 0.11792191118001938, "epoch": 6.697798532354903, "step": 20080 }, { "distill_loss": 0.19072198867797852, "epoch": 6.697798532354903, "step": 20080 }, { "epoch": 6.697798532354903, "ref_ce_loss": 0.10631078481674194, "step": 20080 }, { "epoch": 6.701134089392928, "loss": 0.3425, "step": 20090 }, { "epoch": 6.701134089392928, "grad_norm": 2.2497243881225586, "step": 20090 }, { "epoch": 6.701134089392928, "learning_rate": 2.0265319495328326e-05, "step": 20090 }, { "epoch": 6.701134089392928, "loss": 0.5373490452766418, "step": 20090 }, { "ce_loss": 0.02349102683365345, "epoch": 6.701134089392928, "step": 20090 }, { "distill_loss": 0.21344724297523499, "epoch": 6.701134089392928, "step": 20090 }, { "epoch": 6.701134089392928, "ref_ce_loss": 0.08333466947078705, "step": 20090 }, { "epoch": 6.701134089392928, "loss": 0.24514442682266235, "step": 20090 }, { "ce_loss": 0.03692420944571495, "epoch": 6.701134089392928, "step": 20090 }, { "distill_loss": 0.1548544466495514, "epoch": 6.701134089392928, "step": 20090 }, { "epoch": 6.701134089392928, "ref_ce_loss": 0.036220796406269073, "step": 20090 }, { "epoch": 6.701134089392928, "loss": 0.191425621509552, "step": 20090 }, { "ce_loss": 0.04407431185245514, "epoch": 6.701134089392928, "step": 20090 }, { "distill_loss": 0.09822914004325867, "epoch": 6.701134089392928, "step": 20090 }, { "epoch": 6.701134089392928, "ref_ce_loss": 0.03383997455239296, "step": 20090 }, { "epoch": 6.701134089392928, "loss": 0.31227219104766846, "step": 20090 }, { "ce_loss": 0.08893024921417236, "epoch": 6.701134089392928, "step": 20090 }, { "distill_loss": 0.17280413210391998, "epoch": 6.701134089392928, "step": 20090 }, { "epoch": 6.701134089392928, "ref_ce_loss": 0.05040694400668144, "step": 20090 }, { "epoch": 6.704469646430954, "loss": 0.3198, "step": 20100 }, { "epoch": 6.704469646430954, "grad_norm": 2.1753933429718018, "step": 20100 }, { "epoch": 6.704469646430954, "learning_rate": 2.0163762541020124e-05, "step": 20100 }, { "epoch": 6.704469646430954, "loss": 0.35048708319664, "step": 20100 }, { "ce_loss": 0.10514289140701294, "epoch": 6.704469646430954, "step": 20100 }, { "distill_loss": 0.18074220418930054, "epoch": 6.704469646430954, "step": 20100 }, { "epoch": 6.704469646430954, "ref_ce_loss": 0.0645001083612442, "step": 20100 }, { "epoch": 6.704469646430954, "loss": 0.32312288880348206, "step": 20100 }, { "ce_loss": 0.04791853949427605, "epoch": 6.704469646430954, "step": 20100 }, { "distill_loss": 0.18330535292625427, "epoch": 6.704469646430954, "step": 20100 }, { "epoch": 6.704469646430954, "ref_ce_loss": 0.041524119675159454, "step": 20100 }, { "epoch": 6.704469646430954, "loss": 0.35315945744514465, "step": 20100 }, { "ce_loss": 0.06787319481372833, "epoch": 6.704469646430954, "step": 20100 }, { "distill_loss": 0.14994627237319946, "epoch": 6.704469646430954, "step": 20100 }, { "epoch": 6.704469646430954, "ref_ce_loss": 0.05921891704201698, "step": 20100 }, { "epoch": 6.704469646430954, "loss": 0.38501298427581787, "step": 20100 }, { "ce_loss": 0.0495845191180706, "epoch": 6.704469646430954, "step": 20100 }, { "distill_loss": 0.12040823698043823, "epoch": 6.704469646430954, "step": 20100 }, { "epoch": 6.704469646430954, "ref_ce_loss": 0.06525509059429169, "step": 20100 }, { "epoch": 6.707805203468979, "loss": 0.3196, "step": 20110 }, { "epoch": 6.707805203468979, "grad_norm": 2.6369545459747314, "step": 20110 }, { "epoch": 6.707805203468979, "learning_rate": 2.0062442356792864e-05, "step": 20110 }, { "epoch": 6.707805203468979, "loss": 0.2736344039440155, "step": 20110 }, { "ce_loss": 0.03203265741467476, "epoch": 6.707805203468979, "step": 20110 }, { "distill_loss": 0.127993643283844, "epoch": 6.707805203468979, "step": 20110 }, { "epoch": 6.707805203468979, "ref_ce_loss": 0.0858127698302269, "step": 20110 }, { "epoch": 6.707805203468979, "loss": 0.13974420726299286, "step": 20110 }, { "ce_loss": 0.0016820939490571618, "epoch": 6.707805203468979, "step": 20110 }, { "distill_loss": 0.08246586471796036, "epoch": 6.707805203468979, "step": 20110 }, { "epoch": 6.707805203468979, "ref_ce_loss": 0.02231876365840435, "step": 20110 }, { "epoch": 6.707805203468979, "loss": 0.2535470724105835, "step": 20110 }, { "ce_loss": 0.03593399375677109, "epoch": 6.707805203468979, "step": 20110 }, { "distill_loss": 0.1197575107216835, "epoch": 6.707805203468979, "step": 20110 }, { "epoch": 6.707805203468979, "ref_ce_loss": 0.060538504272699356, "step": 20110 }, { "epoch": 6.707805203468979, "loss": 0.2826269268989563, "step": 20110 }, { "ce_loss": 0.039776112884283066, "epoch": 6.707805203468979, "step": 20110 }, { "distill_loss": 0.12120041996240616, "epoch": 6.707805203468979, "step": 20110 }, { "epoch": 6.707805203468979, "ref_ce_loss": 0.061351049691438675, "step": 20110 }, { "epoch": 6.711140760507004, "loss": 0.3117, "step": 20120 }, { "epoch": 6.711140760507004, "grad_norm": 3.368558168411255, "step": 20120 }, { "epoch": 6.711140760507004, "learning_rate": 1.9961359127414578e-05, "step": 20120 }, { "epoch": 6.711140760507004, "loss": 0.2674647271633148, "step": 20120 }, { "ce_loss": 0.02018035016953945, "epoch": 6.711140760507004, "step": 20120 }, { "distill_loss": 0.13457518815994263, "epoch": 6.711140760507004, "step": 20120 }, { "epoch": 6.711140760507004, "ref_ce_loss": 0.08921375125646591, "step": 20120 }, { "epoch": 6.711140760507004, "loss": 0.20268602669239044, "step": 20120 }, { "ce_loss": 0.0070023308508098125, "epoch": 6.711140760507004, "step": 20120 }, { "distill_loss": 0.13283786177635193, "epoch": 6.711140760507004, "step": 20120 }, { "epoch": 6.711140760507004, "ref_ce_loss": 0.06247830390930176, "step": 20120 }, { "epoch": 6.711140760507004, "loss": 0.47161468863487244, "step": 20120 }, { "ce_loss": 0.06164795160293579, "epoch": 6.711140760507004, "step": 20120 }, { "distill_loss": 0.23084260523319244, "epoch": 6.711140760507004, "step": 20120 }, { "epoch": 6.711140760507004, "ref_ce_loss": 0.09241283684968948, "step": 20120 }, { "epoch": 6.711140760507004, "loss": 0.22700239717960358, "step": 20120 }, { "ce_loss": 0.03247182443737984, "epoch": 6.711140760507004, "step": 20120 }, { "distill_loss": 0.12559150159358978, "epoch": 6.711140760507004, "step": 20120 }, { "epoch": 6.711140760507004, "ref_ce_loss": 0.0504399836063385, "step": 20120 }, { "epoch": 6.71447631754503, "loss": 0.3401, "step": 20130 }, { "epoch": 6.71447631754503, "grad_norm": 2.520934820175171, "step": 20130 }, { "epoch": 6.71447631754503, "learning_rate": 1.9860513037221165e-05, "step": 20130 }, { "epoch": 6.71447631754503, "loss": 0.24574658274650574, "step": 20130 }, { "ce_loss": 0.06673982739448547, "epoch": 6.71447631754503, "step": 20130 }, { "distill_loss": 0.0998455137014389, "epoch": 6.71447631754503, "step": 20130 }, { "epoch": 6.71447631754503, "ref_ce_loss": 0.04743276908993721, "step": 20130 }, { "epoch": 6.71447631754503, "loss": 0.3920712471008301, "step": 20130 }, { "ce_loss": 0.10085289925336838, "epoch": 6.71447631754503, "step": 20130 }, { "distill_loss": 0.15075841546058655, "epoch": 6.71447631754503, "step": 20130 }, { "epoch": 6.71447631754503, "ref_ce_loss": 0.06611064076423645, "step": 20130 }, { "epoch": 6.71447631754503, "loss": 0.5583319067955017, "step": 20130 }, { "ce_loss": 0.12576350569725037, "epoch": 6.71447631754503, "step": 20130 }, { "distill_loss": 0.31589075922966003, "epoch": 6.71447631754503, "step": 20130 }, { "epoch": 6.71447631754503, "ref_ce_loss": 0.08344506472349167, "step": 20130 }, { "epoch": 6.71447631754503, "loss": 0.26720836758613586, "step": 20130 }, { "ce_loss": 0.06933542340993881, "epoch": 6.71447631754503, "step": 20130 }, { "distill_loss": 0.1287502497434616, "epoch": 6.71447631754503, "step": 20130 }, { "epoch": 6.71447631754503, "ref_ce_loss": 0.04018561542034149, "step": 20130 }, { "epoch": 6.717811874583055, "loss": 0.3339, "step": 20140 }, { "epoch": 6.717811874583055, "grad_norm": 2.886352300643921, "step": 20140 }, { "epoch": 6.717811874583055, "learning_rate": 1.9759904270116165e-05, "step": 20140 }, { "epoch": 6.717811874583055, "loss": 0.18582403659820557, "step": 20140 }, { "ce_loss": 0.00801194366067648, "epoch": 6.717811874583055, "step": 20140 }, { "distill_loss": 0.12590402364730835, "epoch": 6.717811874583055, "step": 20140 }, { "epoch": 6.717811874583055, "ref_ce_loss": 0.05173363909125328, "step": 20140 }, { "epoch": 6.717811874583055, "loss": 0.5001078248023987, "step": 20140 }, { "ce_loss": 0.07312473654747009, "epoch": 6.717811874583055, "step": 20140 }, { "distill_loss": 0.31643715500831604, "epoch": 6.717811874583055, "step": 20140 }, { "epoch": 6.717811874583055, "ref_ce_loss": 0.07059961557388306, "step": 20140 }, { "epoch": 6.717811874583055, "loss": 0.39632368087768555, "step": 20140 }, { "ce_loss": 0.041875191032886505, "epoch": 6.717811874583055, "step": 20140 }, { "distill_loss": 0.1932414025068283, "epoch": 6.717811874583055, "step": 20140 }, { "epoch": 6.717811874583055, "ref_ce_loss": 0.07255880534648895, "step": 20140 }, { "epoch": 6.717811874583055, "loss": 0.2731451988220215, "step": 20140 }, { "ce_loss": 0.02847844362258911, "epoch": 6.717811874583055, "step": 20140 }, { "distill_loss": 0.1204526498913765, "epoch": 6.717811874583055, "step": 20140 }, { "epoch": 6.717811874583055, "ref_ce_loss": 0.03780105710029602, "step": 20140 }, { "epoch": 6.72114743162108, "loss": 0.3203, "step": 20150 }, { "epoch": 6.72114743162108, "grad_norm": 2.101719856262207, "step": 20150 }, { "epoch": 6.72114743162108, "learning_rate": 1.9659533009570223e-05, "step": 20150 }, { "epoch": 6.72114743162108, "loss": 0.4778059124946594, "step": 20150 }, { "ce_loss": 0.03960665687918663, "epoch": 6.72114743162108, "step": 20150 }, { "distill_loss": 0.2315598428249359, "epoch": 6.72114743162108, "step": 20150 }, { "epoch": 6.72114743162108, "ref_ce_loss": 0.055229682475328445, "step": 20150 }, { "epoch": 6.72114743162108, "loss": 0.23675209283828735, "step": 20150 }, { "ce_loss": 0.04655850678682327, "epoch": 6.72114743162108, "step": 20150 }, { "distill_loss": 0.10201462358236313, "epoch": 6.72114743162108, "step": 20150 }, { "epoch": 6.72114743162108, "ref_ce_loss": 0.05079250782728195, "step": 20150 }, { "epoch": 6.72114743162108, "loss": 0.47496479749679565, "step": 20150 }, { "ce_loss": 0.08386895060539246, "epoch": 6.72114743162108, "step": 20150 }, { "distill_loss": 0.1764313131570816, "epoch": 6.72114743162108, "step": 20150 }, { "epoch": 6.72114743162108, "ref_ce_loss": 0.08487064391374588, "step": 20150 }, { "epoch": 6.72114743162108, "loss": 0.36193379759788513, "step": 20150 }, { "ce_loss": 0.09721614420413971, "epoch": 6.72114743162108, "step": 20150 }, { "distill_loss": 0.1604432463645935, "epoch": 6.72114743162108, "step": 20150 }, { "epoch": 6.72114743162108, "ref_ce_loss": 0.07095256447792053, "step": 20150 }, { "epoch": 6.724482988659106, "loss": 0.3354, "step": 20160 }, { "epoch": 6.724482988659106, "grad_norm": 2.4998772144317627, "step": 20160 }, { "epoch": 6.724482988659106, "learning_rate": 1.9559399438620916e-05, "step": 20160 }, { "epoch": 6.724482988659106, "loss": 0.47663408517837524, "step": 20160 }, { "ce_loss": 0.05730144679546356, "epoch": 6.724482988659106, "step": 20160 }, { "distill_loss": 0.263236403465271, "epoch": 6.724482988659106, "step": 20160 }, { "epoch": 6.724482988659106, "ref_ce_loss": 0.10452139377593994, "step": 20160 }, { "epoch": 6.724482988659106, "loss": 0.29100215435028076, "step": 20160 }, { "ce_loss": 0.01226563099771738, "epoch": 6.724482988659106, "step": 20160 }, { "distill_loss": 0.09933969378471375, "epoch": 6.724482988659106, "step": 20160 }, { "epoch": 6.724482988659106, "ref_ce_loss": 0.06125570833683014, "step": 20160 }, { "epoch": 6.724482988659106, "loss": 0.19560547173023224, "step": 20160 }, { "ce_loss": 0.028779683634638786, "epoch": 6.724482988659106, "step": 20160 }, { "distill_loss": 0.0916346088051796, "epoch": 6.724482988659106, "step": 20160 }, { "epoch": 6.724482988659106, "ref_ce_loss": 0.06244521588087082, "step": 20160 }, { "epoch": 6.724482988659106, "loss": 0.4223164916038513, "step": 20160 }, { "ce_loss": 0.06593258678913116, "epoch": 6.724482988659106, "step": 20160 }, { "distill_loss": 0.20690634846687317, "epoch": 6.724482988659106, "step": 20160 }, { "epoch": 6.724482988659106, "ref_ce_loss": 0.10990146547555923, "step": 20160 }, { "epoch": 6.727818545697131, "loss": 0.3546, "step": 20170 }, { "epoch": 6.727818545697131, "grad_norm": 2.427511692047119, "step": 20170 }, { "epoch": 6.727818545697131, "learning_rate": 1.945950373987248e-05, "step": 20170 }, { "epoch": 6.727818545697131, "loss": 0.24235740303993225, "step": 20170 }, { "ce_loss": 0.039137471467256546, "epoch": 6.727818545697131, "step": 20170 }, { "distill_loss": 0.11441465467214584, "epoch": 6.727818545697131, "step": 20170 }, { "epoch": 6.727818545697131, "ref_ce_loss": 0.06497035175561905, "step": 20170 }, { "epoch": 6.727818545697131, "loss": 0.3280067443847656, "step": 20170 }, { "ce_loss": 0.04011811688542366, "epoch": 6.727818545697131, "step": 20170 }, { "distill_loss": 0.10386212170124054, "epoch": 6.727818545697131, "step": 20170 }, { "epoch": 6.727818545697131, "ref_ce_loss": 0.0749388188123703, "step": 20170 }, { "epoch": 6.727818545697131, "loss": 0.2595832645893097, "step": 20170 }, { "ce_loss": 0.08086634427309036, "epoch": 6.727818545697131, "step": 20170 }, { "distill_loss": 0.1273350715637207, "epoch": 6.727818545697131, "step": 20170 }, { "epoch": 6.727818545697131, "ref_ce_loss": 0.042277704924345016, "step": 20170 }, { "epoch": 6.727818545697131, "loss": 0.2752918004989624, "step": 20170 }, { "ce_loss": 0.024764977395534515, "epoch": 6.727818545697131, "step": 20170 }, { "distill_loss": 0.12322026491165161, "epoch": 6.727818545697131, "step": 20170 }, { "epoch": 6.727818545697131, "ref_ce_loss": 0.058638669550418854, "step": 20170 }, { "epoch": 6.7311541027351565, "loss": 0.2855, "step": 20180 }, { "epoch": 6.7311541027351565, "grad_norm": 2.475968360900879, "step": 20180 }, { "epoch": 6.7311541027351565, "learning_rate": 1.9359846095495158e-05, "step": 20180 }, { "epoch": 6.7311541027351565, "loss": 0.31172600388526917, "step": 20180 }, { "ce_loss": 0.07925920933485031, "epoch": 6.7311541027351565, "step": 20180 }, { "distill_loss": 0.1636563092470169, "epoch": 6.7311541027351565, "step": 20180 }, { "epoch": 6.7311541027351565, "ref_ce_loss": 0.06868557631969452, "step": 20180 }, { "epoch": 6.7311541027351565, "loss": 0.41545963287353516, "step": 20180 }, { "ce_loss": 0.0213873703032732, "epoch": 6.7311541027351565, "step": 20180 }, { "distill_loss": 0.1410737931728363, "epoch": 6.7311541027351565, "step": 20180 }, { "epoch": 6.7311541027351565, "ref_ce_loss": 0.07231228053569794, "step": 20180 }, { "epoch": 6.7311541027351565, "loss": 0.27609914541244507, "step": 20180 }, { "ce_loss": 0.05257976055145264, "epoch": 6.7311541027351565, "step": 20180 }, { "distill_loss": 0.1249610036611557, "epoch": 6.7311541027351565, "step": 20180 }, { "epoch": 6.7311541027351565, "ref_ce_loss": 0.06813840568065643, "step": 20180 }, { "epoch": 6.7311541027351565, "loss": 0.2992923855781555, "step": 20180 }, { "ce_loss": 0.04836418479681015, "epoch": 6.7311541027351565, "step": 20180 }, { "distill_loss": 0.17673200368881226, "epoch": 6.7311541027351565, "step": 20180 }, { "epoch": 6.7311541027351565, "ref_ce_loss": 0.058475811034440994, "step": 20180 }, { "epoch": 6.734489659773182, "loss": 0.3448, "step": 20190 }, { "epoch": 6.734489659773182, "grad_norm": 2.1060166358947754, "step": 20190 }, { "epoch": 6.734489659773182, "learning_rate": 1.926042668722526e-05, "step": 20190 }, { "epoch": 6.734489659773182, "loss": 0.3200954496860504, "step": 20190 }, { "ce_loss": 0.05356490612030029, "epoch": 6.734489659773182, "step": 20190 }, { "distill_loss": 0.19207656383514404, "epoch": 6.734489659773182, "step": 20190 }, { "epoch": 6.734489659773182, "ref_ce_loss": 0.05510849133133888, "step": 20190 }, { "epoch": 6.734489659773182, "loss": 0.3080518841743469, "step": 20190 }, { "ce_loss": 0.02909858524799347, "epoch": 6.734489659773182, "step": 20190 }, { "distill_loss": 0.14246626198291779, "epoch": 6.734489659773182, "step": 20190 }, { "epoch": 6.734489659773182, "ref_ce_loss": 0.055522508919239044, "step": 20190 }, { "epoch": 6.734489659773182, "loss": 0.21634231507778168, "step": 20190 }, { "ce_loss": 0.039301078766584396, "epoch": 6.734489659773182, "step": 20190 }, { "distill_loss": 0.12853223085403442, "epoch": 6.734489659773182, "step": 20190 }, { "epoch": 6.734489659773182, "ref_ce_loss": 0.04776563495397568, "step": 20190 }, { "epoch": 6.734489659773182, "loss": 0.22381868958473206, "step": 20190 }, { "ce_loss": 0.03490525484085083, "epoch": 6.734489659773182, "step": 20190 }, { "distill_loss": 0.10595700889825821, "epoch": 6.734489659773182, "step": 20190 }, { "epoch": 6.734489659773182, "ref_ce_loss": 0.05174877867102623, "step": 20190 }, { "epoch": 6.737825216811207, "loss": 0.2956, "step": 20200 }, { "epoch": 6.737825216811207, "grad_norm": 1.833837866783142, "step": 20200 }, { "epoch": 6.737825216811207, "learning_rate": 1.9161245696364514e-05, "step": 20200 }, { "epoch": 6.737825216811207, "loss": 0.2744770348072052, "step": 20200 }, { "ce_loss": 0.034656401723623276, "epoch": 6.737825216811207, "step": 20200 }, { "distill_loss": 0.13560622930526733, "epoch": 6.737825216811207, "step": 20200 }, { "epoch": 6.737825216811207, "ref_ce_loss": 0.05984001234173775, "step": 20200 }, { "epoch": 6.737825216811207, "loss": 0.3653131425380707, "step": 20200 }, { "ce_loss": 0.08240316063165665, "epoch": 6.737825216811207, "step": 20200 }, { "distill_loss": 0.18237349390983582, "epoch": 6.737825216811207, "step": 20200 }, { "epoch": 6.737825216811207, "ref_ce_loss": 0.07820425182580948, "step": 20200 }, { "epoch": 6.737825216811207, "loss": 0.28199803829193115, "step": 20200 }, { "ce_loss": 0.024960942566394806, "epoch": 6.737825216811207, "step": 20200 }, { "distill_loss": 0.13476698100566864, "epoch": 6.737825216811207, "step": 20200 }, { "epoch": 6.737825216811207, "ref_ce_loss": 0.061238691210746765, "step": 20200 }, { "epoch": 6.737825216811207, "loss": 0.42400720715522766, "step": 20200 }, { "ce_loss": 0.1105445921421051, "epoch": 6.737825216811207, "step": 20200 }, { "distill_loss": 0.21091026067733765, "epoch": 6.737825216811207, "step": 20200 }, { "epoch": 6.737825216811207, "ref_ce_loss": 0.08224047720432281, "step": 20200 }, { "epoch": 6.7411607738492325, "loss": 0.2882, "step": 20210 }, { "epoch": 6.7411607738492325, "grad_norm": 1.7699483633041382, "step": 20210 }, { "epoch": 6.7411607738492325, "learning_rate": 1.906230330377992e-05, "step": 20210 }, { "epoch": 6.7411607738492325, "loss": 0.3359181880950928, "step": 20210 }, { "ce_loss": 0.02041238360106945, "epoch": 6.7411607738492325, "step": 20210 }, { "distill_loss": 0.14281825721263885, "epoch": 6.7411607738492325, "step": 20210 }, { "epoch": 6.7411607738492325, "ref_ce_loss": 0.08343903720378876, "step": 20210 }, { "epoch": 6.7411607738492325, "loss": 0.28383171558380127, "step": 20210 }, { "ce_loss": 0.03835010156035423, "epoch": 6.7411607738492325, "step": 20210 }, { "distill_loss": 0.12048612534999847, "epoch": 6.7411607738492325, "step": 20210 }, { "epoch": 6.7411607738492325, "ref_ce_loss": 0.03666745126247406, "step": 20210 }, { "epoch": 6.7411607738492325, "loss": 0.26517578959465027, "step": 20210 }, { "ce_loss": 0.07440821081399918, "epoch": 6.7411607738492325, "step": 20210 }, { "distill_loss": 0.0954647958278656, "epoch": 6.7411607738492325, "step": 20210 }, { "epoch": 6.7411607738492325, "ref_ce_loss": 0.05569697543978691, "step": 20210 }, { "epoch": 6.7411607738492325, "loss": 0.6115409731864929, "step": 20210 }, { "ce_loss": 0.024617154151201248, "epoch": 6.7411607738492325, "step": 20210 }, { "distill_loss": 0.310270220041275, "epoch": 6.7411607738492325, "step": 20210 }, { "epoch": 6.7411607738492325, "ref_ce_loss": 0.11816354840993881, "step": 20210 }, { "epoch": 6.744496330887258, "loss": 0.3187, "step": 20220 }, { "epoch": 6.744496330887258, "grad_norm": 2.3631792068481445, "step": 20220 }, { "epoch": 6.744496330887258, "learning_rate": 1.8963599689903412e-05, "step": 20220 }, { "epoch": 6.744496330887258, "loss": 0.17074748873710632, "step": 20220 }, { "ce_loss": 0.011627701111137867, "epoch": 6.744496330887258, "step": 20220 }, { "distill_loss": 0.10479970276355743, "epoch": 6.744496330887258, "step": 20220 }, { "epoch": 6.744496330887258, "ref_ce_loss": 0.04375007376074791, "step": 20220 }, { "epoch": 6.744496330887258, "loss": 0.2588944733142853, "step": 20220 }, { "ce_loss": 0.016052883118391037, "epoch": 6.744496330887258, "step": 20220 }, { "distill_loss": 0.21098734438419342, "epoch": 6.744496330887258, "step": 20220 }, { "epoch": 6.744496330887258, "ref_ce_loss": 0.03143612667918205, "step": 20220 }, { "epoch": 6.744496330887258, "loss": 0.25430601835250854, "step": 20220 }, { "ce_loss": 0.03086763434112072, "epoch": 6.744496330887258, "step": 20220 }, { "distill_loss": 0.10999102145433426, "epoch": 6.744496330887258, "step": 20220 }, { "epoch": 6.744496330887258, "ref_ce_loss": 0.04139406234025955, "step": 20220 }, { "epoch": 6.744496330887258, "loss": 0.22506499290466309, "step": 20220 }, { "ce_loss": 0.02551484853029251, "epoch": 6.744496330887258, "step": 20220 }, { "distill_loss": 0.11838529258966446, "epoch": 6.744496330887258, "step": 20220 }, { "epoch": 6.744496330887258, "ref_ce_loss": 0.048836853355169296, "step": 20220 }, { "epoch": 6.747831887925283, "loss": 0.3145, "step": 20230 }, { "epoch": 6.747831887925283, "grad_norm": 6.5614542961120605, "step": 20230 }, { "epoch": 6.747831887925283, "learning_rate": 1.8865135034731416e-05, "step": 20230 }, { "epoch": 6.747831887925283, "loss": 0.24494950473308563, "step": 20230 }, { "ce_loss": 0.03902106359601021, "epoch": 6.747831887925283, "step": 20230 }, { "distill_loss": 0.1625560224056244, "epoch": 6.747831887925283, "step": 20230 }, { "epoch": 6.747831887925283, "ref_ce_loss": 0.04319927096366882, "step": 20230 }, { "epoch": 6.747831887925283, "loss": 0.4493194818496704, "step": 20230 }, { "ce_loss": 0.06314259022474289, "epoch": 6.747831887925283, "step": 20230 }, { "distill_loss": 0.1928003877401352, "epoch": 6.747831887925283, "step": 20230 }, { "epoch": 6.747831887925283, "ref_ce_loss": 0.06467821449041367, "step": 20230 }, { "epoch": 6.747831887925283, "loss": 0.22811204195022583, "step": 20230 }, { "ce_loss": 0.029204674065113068, "epoch": 6.747831887925283, "step": 20230 }, { "distill_loss": 0.11046045273542404, "epoch": 6.747831887925283, "step": 20230 }, { "epoch": 6.747831887925283, "ref_ce_loss": 0.034495506435632706, "step": 20230 }, { "epoch": 6.747831887925283, "loss": 0.24557246267795563, "step": 20230 }, { "ce_loss": 0.06584376096725464, "epoch": 6.747831887925283, "step": 20230 }, { "distill_loss": 0.09908540546894073, "epoch": 6.747831887925283, "step": 20230 }, { "epoch": 6.747831887925283, "ref_ce_loss": 0.08043040335178375, "step": 20230 }, { "epoch": 6.751167444963309, "loss": 0.3478, "step": 20240 }, { "epoch": 6.751167444963309, "grad_norm": 3.3642992973327637, "step": 20240 }, { "epoch": 6.751167444963309, "learning_rate": 1.876690951782464e-05, "step": 20240 }, { "epoch": 6.751167444963309, "loss": 0.47724294662475586, "step": 20240 }, { "ce_loss": 0.03805273398756981, "epoch": 6.751167444963309, "step": 20240 }, { "distill_loss": 0.25204703211784363, "epoch": 6.751167444963309, "step": 20240 }, { "epoch": 6.751167444963309, "ref_ce_loss": 0.10946623235940933, "step": 20240 }, { "epoch": 6.751167444963309, "loss": 0.36742380261421204, "step": 20240 }, { "ce_loss": 0.06362360715866089, "epoch": 6.751167444963309, "step": 20240 }, { "distill_loss": 0.20535053312778473, "epoch": 6.751167444963309, "step": 20240 }, { "epoch": 6.751167444963309, "ref_ce_loss": 0.0709865540266037, "step": 20240 }, { "epoch": 6.751167444963309, "loss": 0.36857929825782776, "step": 20240 }, { "ce_loss": 0.06251908838748932, "epoch": 6.751167444963309, "step": 20240 }, { "distill_loss": 0.12514689564704895, "epoch": 6.751167444963309, "step": 20240 }, { "epoch": 6.751167444963309, "ref_ce_loss": 0.07997957617044449, "step": 20240 }, { "epoch": 6.751167444963309, "loss": 0.3215661644935608, "step": 20240 }, { "ce_loss": 0.011648074723780155, "epoch": 6.751167444963309, "step": 20240 }, { "distill_loss": 0.09662874042987823, "epoch": 6.751167444963309, "step": 20240 }, { "epoch": 6.751167444963309, "ref_ce_loss": 0.04865029454231262, "step": 20240 }, { "epoch": 6.754503002001334, "loss": 0.3174, "step": 20250 }, { "epoch": 6.754503002001334, "grad_norm": 3.293933153152466, "step": 20250 }, { "epoch": 6.754503002001334, "learning_rate": 1.8668923318307704e-05, "step": 20250 }, { "epoch": 6.754503002001334, "loss": 0.3013077676296234, "step": 20250 }, { "ce_loss": 0.0609404556453228, "epoch": 6.754503002001334, "step": 20250 }, { "distill_loss": 0.12029371410608292, "epoch": 6.754503002001334, "step": 20250 }, { "epoch": 6.754503002001334, "ref_ce_loss": 0.05728248506784439, "step": 20250 }, { "epoch": 6.754503002001334, "loss": 0.36096930503845215, "step": 20250 }, { "ce_loss": 0.0662146508693695, "epoch": 6.754503002001334, "step": 20250 }, { "distill_loss": 0.17438842356204987, "epoch": 6.754503002001334, "step": 20250 }, { "epoch": 6.754503002001334, "ref_ce_loss": 0.07831903547048569, "step": 20250 }, { "epoch": 6.754503002001334, "loss": 0.2404811829328537, "step": 20250 }, { "ce_loss": 0.031928323209285736, "epoch": 6.754503002001334, "step": 20250 }, { "distill_loss": 0.10175152122974396, "epoch": 6.754503002001334, "step": 20250 }, { "epoch": 6.754503002001334, "ref_ce_loss": 0.03808411583304405, "step": 20250 }, { "epoch": 6.754503002001334, "loss": 0.2283913642168045, "step": 20250 }, { "ce_loss": 0.060152892023324966, "epoch": 6.754503002001334, "step": 20250 }, { "distill_loss": 0.1293492317199707, "epoch": 6.754503002001334, "step": 20250 }, { "epoch": 6.754503002001334, "ref_ce_loss": 0.038714051246643066, "step": 20250 }, { "epoch": 6.757838559039359, "loss": 0.395, "step": 20260 }, { "epoch": 6.757838559039359, "grad_norm": 2.3881497383117676, "step": 20260 }, { "epoch": 6.757838559039359, "learning_rate": 1.857117661486872e-05, "step": 20260 }, { "epoch": 6.757838559039359, "loss": 0.26333901286125183, "step": 20260 }, { "ce_loss": 0.061003465205430984, "epoch": 6.757838559039359, "step": 20260 }, { "distill_loss": 0.13676662743091583, "epoch": 6.757838559039359, "step": 20260 }, { "epoch": 6.757838559039359, "ref_ce_loss": 0.06485778838396072, "step": 20260 }, { "epoch": 6.757838559039359, "loss": 0.29492634534835815, "step": 20260 }, { "ce_loss": 0.00960566382855177, "epoch": 6.757838559039359, "step": 20260 }, { "distill_loss": 0.15592965483665466, "epoch": 6.757838559039359, "step": 20260 }, { "epoch": 6.757838559039359, "ref_ce_loss": 0.06057649105787277, "step": 20260 }, { "epoch": 6.757838559039359, "loss": 0.25662752985954285, "step": 20260 }, { "ce_loss": 0.03762103244662285, "epoch": 6.757838559039359, "step": 20260 }, { "distill_loss": 0.1429222673177719, "epoch": 6.757838559039359, "step": 20260 }, { "epoch": 6.757838559039359, "ref_ce_loss": 0.07600852102041245, "step": 20260 }, { "epoch": 6.757838559039359, "loss": 0.2905150055885315, "step": 20260 }, { "ce_loss": 0.03607148677110672, "epoch": 6.757838559039359, "step": 20260 }, { "distill_loss": 0.15427818894386292, "epoch": 6.757838559039359, "step": 20260 }, { "epoch": 6.757838559039359, "ref_ce_loss": 0.05871858075261116, "step": 20260 }, { "epoch": 6.761174116077385, "loss": 0.3381, "step": 20270 }, { "epoch": 6.761174116077385, "grad_norm": 2.4188501834869385, "step": 20270 }, { "epoch": 6.761174116077385, "learning_rate": 1.8473669585759154e-05, "step": 20270 }, { "epoch": 6.761174116077385, "loss": 0.1986462026834488, "step": 20270 }, { "ce_loss": 0.03574904426932335, "epoch": 6.761174116077385, "step": 20270 }, { "distill_loss": 0.1082804724574089, "epoch": 6.761174116077385, "step": 20270 }, { "epoch": 6.761174116077385, "ref_ce_loss": 0.04469778388738632, "step": 20270 }, { "epoch": 6.761174116077385, "loss": 0.28330090641975403, "step": 20270 }, { "ce_loss": 0.039637066423892975, "epoch": 6.761174116077385, "step": 20270 }, { "distill_loss": 0.1479431688785553, "epoch": 6.761174116077385, "step": 20270 }, { "epoch": 6.761174116077385, "ref_ce_loss": 0.045204803347587585, "step": 20270 }, { "epoch": 6.761174116077385, "loss": 0.3502093553543091, "step": 20270 }, { "ce_loss": 0.08832836896181107, "epoch": 6.761174116077385, "step": 20270 }, { "distill_loss": 0.1686418652534485, "epoch": 6.761174116077385, "step": 20270 }, { "epoch": 6.761174116077385, "ref_ce_loss": 0.0489707887172699, "step": 20270 }, { "epoch": 6.761174116077385, "loss": 0.3603600263595581, "step": 20270 }, { "ce_loss": 0.058100733906030655, "epoch": 6.761174116077385, "step": 20270 }, { "distill_loss": 0.13964024186134338, "epoch": 6.761174116077385, "step": 20270 }, { "epoch": 6.761174116077385, "ref_ce_loss": 0.07787153869867325, "step": 20270 }, { "epoch": 6.76450967311541, "loss": 0.3233, "step": 20280 }, { "epoch": 6.76450967311541, "grad_norm": 2.8454596996307373, "step": 20280 }, { "epoch": 6.76450967311541, "learning_rate": 1.837640240879335e-05, "step": 20280 }, { "epoch": 6.76450967311541, "loss": 0.199926495552063, "step": 20280 }, { "ce_loss": 0.03727385401725769, "epoch": 6.76450967311541, "step": 20280 }, { "distill_loss": 0.1146383136510849, "epoch": 6.76450967311541, "step": 20280 }, { "epoch": 6.76450967311541, "ref_ce_loss": 0.04791300743818283, "step": 20280 }, { "epoch": 6.76450967311541, "loss": 0.28659528493881226, "step": 20280 }, { "ce_loss": 0.030995670706033707, "epoch": 6.76450967311541, "step": 20280 }, { "distill_loss": 0.1602456271648407, "epoch": 6.76450967311541, "step": 20280 }, { "epoch": 6.76450967311541, "ref_ce_loss": 0.07240622490644455, "step": 20280 }, { "epoch": 6.76450967311541, "loss": 0.4938913583755493, "step": 20280 }, { "ce_loss": 0.0463070385158062, "epoch": 6.76450967311541, "step": 20280 }, { "distill_loss": 0.16774320602416992, "epoch": 6.76450967311541, "step": 20280 }, { "epoch": 6.76450967311541, "ref_ce_loss": 0.061573222279548645, "step": 20280 }, { "epoch": 6.76450967311541, "loss": 0.20674434304237366, "step": 20280 }, { "ce_loss": 0.02703999914228916, "epoch": 6.76450967311541, "step": 20280 }, { "distill_loss": 0.10253699123859406, "epoch": 6.76450967311541, "step": 20280 }, { "epoch": 6.76450967311541, "ref_ce_loss": 0.057654231786727905, "step": 20280 }, { "epoch": 6.767845230153435, "loss": 0.3143, "step": 20290 }, { "epoch": 6.767845230153435, "grad_norm": 2.4084882736206055, "step": 20290 }, { "epoch": 6.767845230153435, "learning_rate": 1.827937526134829e-05, "step": 20290 }, { "epoch": 6.767845230153435, "loss": 0.2376161515712738, "step": 20290 }, { "ce_loss": 0.0391826331615448, "epoch": 6.767845230153435, "step": 20290 }, { "distill_loss": 0.1317000836133957, "epoch": 6.767845230153435, "step": 20290 }, { "epoch": 6.767845230153435, "ref_ce_loss": 0.046779513359069824, "step": 20290 }, { "epoch": 6.767845230153435, "loss": 0.3904785215854645, "step": 20290 }, { "ce_loss": 0.05911394953727722, "epoch": 6.767845230153435, "step": 20290 }, { "distill_loss": 0.14573772251605988, "epoch": 6.767845230153435, "step": 20290 }, { "epoch": 6.767845230153435, "ref_ce_loss": 0.06968571990728378, "step": 20290 }, { "epoch": 6.767845230153435, "loss": 0.3730168044567108, "step": 20290 }, { "ce_loss": 0.04287463426589966, "epoch": 6.767845230153435, "step": 20290 }, { "distill_loss": 0.17994266748428345, "epoch": 6.767845230153435, "step": 20290 }, { "epoch": 6.767845230153435, "ref_ce_loss": 0.0760711207985878, "step": 20290 }, { "epoch": 6.767845230153435, "loss": 0.3494645953178406, "step": 20290 }, { "ce_loss": 0.058322224766016006, "epoch": 6.767845230153435, "step": 20290 }, { "distill_loss": 0.18589213490486145, "epoch": 6.767845230153435, "step": 20290 }, { "epoch": 6.767845230153435, "ref_ce_loss": 0.077096126973629, "step": 20290 }, { "epoch": 6.771180787191461, "loss": 0.3236, "step": 20300 }, { "epoch": 6.771180787191461, "grad_norm": 3.896623134613037, "step": 20300 }, { "epoch": 6.771180787191461, "learning_rate": 1.8182588320363234e-05, "step": 20300 }, { "epoch": 6.771180787191461, "loss": 0.19223515689373016, "step": 20300 }, { "ce_loss": 0.022537581622600555, "epoch": 6.771180787191461, "step": 20300 }, { "distill_loss": 0.11885391175746918, "epoch": 6.771180787191461, "step": 20300 }, { "epoch": 6.771180787191461, "ref_ce_loss": 0.050274379551410675, "step": 20300 }, { "epoch": 6.771180787191461, "loss": 0.3798726797103882, "step": 20300 }, { "ce_loss": 0.053281012922525406, "epoch": 6.771180787191461, "step": 20300 }, { "distill_loss": 0.23629030585289001, "epoch": 6.771180787191461, "step": 20300 }, { "epoch": 6.771180787191461, "ref_ce_loss": 0.06483253091573715, "step": 20300 }, { "epoch": 6.771180787191461, "loss": 0.4262133240699768, "step": 20300 }, { "ce_loss": 0.0720096156001091, "epoch": 6.771180787191461, "step": 20300 }, { "distill_loss": 0.11169244349002838, "epoch": 6.771180787191461, "step": 20300 }, { "epoch": 6.771180787191461, "ref_ce_loss": 0.0754813551902771, "step": 20300 }, { "epoch": 6.771180787191461, "loss": 0.21413297951221466, "step": 20300 }, { "ce_loss": 0.0175796989351511, "epoch": 6.771180787191461, "step": 20300 }, { "distill_loss": 0.09827864170074463, "epoch": 6.771180787191461, "step": 20300 }, { "epoch": 6.771180787191461, "ref_ce_loss": 0.06181326135993004, "step": 20300 }, { "epoch": 6.774516344229486, "loss": 0.33, "step": 20310 }, { "epoch": 6.774516344229486, "grad_norm": 4.409674644470215, "step": 20310 }, { "epoch": 6.774516344229486, "learning_rate": 1.808604176233933e-05, "step": 20310 }, { "epoch": 6.774516344229486, "loss": 0.20254576206207275, "step": 20310 }, { "ce_loss": 0.025957494974136353, "epoch": 6.774516344229486, "step": 20310 }, { "distill_loss": 0.12091059237718582, "epoch": 6.774516344229486, "step": 20310 }, { "epoch": 6.774516344229486, "ref_ce_loss": 0.044557299464941025, "step": 20310 }, { "epoch": 6.774516344229486, "loss": 0.3443145155906677, "step": 20310 }, { "ce_loss": 0.015703234821558, "epoch": 6.774516344229486, "step": 20310 }, { "distill_loss": 0.1548217087984085, "epoch": 6.774516344229486, "step": 20310 }, { "epoch": 6.774516344229486, "ref_ce_loss": 0.06314200907945633, "step": 20310 }, { "epoch": 6.774516344229486, "loss": 0.2929725646972656, "step": 20310 }, { "ce_loss": 0.039958517998456955, "epoch": 6.774516344229486, "step": 20310 }, { "distill_loss": 0.14566273987293243, "epoch": 6.774516344229486, "step": 20310 }, { "epoch": 6.774516344229486, "ref_ce_loss": 0.04940671846270561, "step": 20310 }, { "epoch": 6.774516344229486, "loss": 0.24490177631378174, "step": 20310 }, { "ce_loss": 0.03264036402106285, "epoch": 6.774516344229486, "step": 20310 }, { "distill_loss": 0.1299162060022354, "epoch": 6.774516344229486, "step": 20310 }, { "epoch": 6.774516344229486, "ref_ce_loss": 0.06482992321252823, "step": 20310 }, { "epoch": 6.777851901267511, "loss": 0.3209, "step": 20320 }, { "epoch": 6.777851901267511, "grad_norm": 3.675053358078003, "step": 20320 }, { "epoch": 6.777851901267511, "learning_rate": 1.798973576333943e-05, "step": 20320 }, { "epoch": 6.777851901267511, "loss": 0.15585127472877502, "step": 20320 }, { "ce_loss": 0.0033037394750863314, "epoch": 6.777851901267511, "step": 20320 }, { "distill_loss": 0.11441172659397125, "epoch": 6.777851901267511, "step": 20320 }, { "epoch": 6.777851901267511, "ref_ce_loss": 0.037796296179294586, "step": 20320 }, { "epoch": 6.777851901267511, "loss": 0.24352480471134186, "step": 20320 }, { "ce_loss": 0.01745920069515705, "epoch": 6.777851901267511, "step": 20320 }, { "distill_loss": 0.10395500808954239, "epoch": 6.777851901267511, "step": 20320 }, { "epoch": 6.777851901267511, "ref_ce_loss": 0.0610855370759964, "step": 20320 }, { "epoch": 6.777851901267511, "loss": 0.2259272187948227, "step": 20320 }, { "ce_loss": 0.04140777140855789, "epoch": 6.777851901267511, "step": 20320 }, { "distill_loss": 0.11032719910144806, "epoch": 6.777851901267511, "step": 20320 }, { "epoch": 6.777851901267511, "ref_ce_loss": 0.046511076390743256, "step": 20320 }, { "epoch": 6.777851901267511, "loss": 0.2978399991989136, "step": 20320 }, { "ce_loss": 0.04130599647760391, "epoch": 6.777851901267511, "step": 20320 }, { "distill_loss": 0.18572106957435608, "epoch": 6.777851901267511, "step": 20320 }, { "epoch": 6.777851901267511, "ref_ce_loss": 0.07065480202436447, "step": 20320 }, { "epoch": 6.781187458305537, "loss": 0.3517, "step": 20330 }, { "epoch": 6.781187458305537, "grad_norm": 2.8553764820098877, "step": 20330 }, { "epoch": 6.781187458305537, "learning_rate": 1.789367049898771e-05, "step": 20330 }, { "epoch": 6.781187458305537, "loss": 0.3826456665992737, "step": 20330 }, { "ce_loss": 0.04621146619319916, "epoch": 6.781187458305537, "step": 20330 }, { "distill_loss": 0.1796746551990509, "epoch": 6.781187458305537, "step": 20330 }, { "epoch": 6.781187458305537, "ref_ce_loss": 0.04336797073483467, "step": 20330 }, { "epoch": 6.781187458305537, "loss": 0.3907976746559143, "step": 20330 }, { "ce_loss": 0.0726345106959343, "epoch": 6.781187458305537, "step": 20330 }, { "distill_loss": 0.1604585349559784, "epoch": 6.781187458305537, "step": 20330 }, { "epoch": 6.781187458305537, "ref_ce_loss": 0.09365147352218628, "step": 20330 }, { "epoch": 6.781187458305537, "loss": 0.3649190068244934, "step": 20330 }, { "ce_loss": 0.06374073773622513, "epoch": 6.781187458305537, "step": 20330 }, { "distill_loss": 0.14042016863822937, "epoch": 6.781187458305537, "step": 20330 }, { "epoch": 6.781187458305537, "ref_ce_loss": 0.05827249959111214, "step": 20330 }, { "epoch": 6.781187458305537, "loss": 0.49203094840049744, "step": 20330 }, { "ce_loss": 0.12137826532125473, "epoch": 6.781187458305537, "step": 20330 }, { "distill_loss": 0.22073110938072205, "epoch": 6.781187458305537, "step": 20330 }, { "epoch": 6.781187458305537, "ref_ce_loss": 0.12056776881217957, "step": 20330 }, { "epoch": 6.784523015343562, "loss": 0.3236, "step": 20340 }, { "epoch": 6.784523015343562, "grad_norm": 2.1215176582336426, "step": 20340 }, { "epoch": 6.784523015343562, "learning_rate": 1.7797846144469306e-05, "step": 20340 }, { "epoch": 6.784523015343562, "loss": 0.41949424147605896, "step": 20340 }, { "ce_loss": 0.09814513474702835, "epoch": 6.784523015343562, "step": 20340 }, { "distill_loss": 0.20361030101776123, "epoch": 6.784523015343562, "step": 20340 }, { "epoch": 6.784523015343562, "ref_ce_loss": 0.09809906035661697, "step": 20340 }, { "epoch": 6.784523015343562, "loss": 0.19860303401947021, "step": 20340 }, { "ce_loss": 0.02330976165831089, "epoch": 6.784523015343562, "step": 20340 }, { "distill_loss": 0.12827952206134796, "epoch": 6.784523015343562, "step": 20340 }, { "epoch": 6.784523015343562, "ref_ce_loss": 0.03326123580336571, "step": 20340 }, { "epoch": 6.784523015343562, "loss": 0.4274199903011322, "step": 20340 }, { "ce_loss": 0.030533771961927414, "epoch": 6.784523015343562, "step": 20340 }, { "distill_loss": 0.1351586878299713, "epoch": 6.784523015343562, "step": 20340 }, { "epoch": 6.784523015343562, "ref_ce_loss": 0.04878941550850868, "step": 20340 }, { "epoch": 6.784523015343562, "loss": 0.2686753273010254, "step": 20340 }, { "ce_loss": 0.020406026393175125, "epoch": 6.784523015343562, "step": 20340 }, { "distill_loss": 0.17345456779003143, "epoch": 6.784523015343562, "step": 20340 }, { "epoch": 6.784523015343562, "ref_ce_loss": 0.049186788499355316, "step": 20340 }, { "epoch": 6.787858572381587, "loss": 0.3163, "step": 20350 }, { "epoch": 6.787858572381587, "grad_norm": 3.0742602348327637, "step": 20350 }, { "epoch": 6.787858572381587, "learning_rate": 1.770226287453007e-05, "step": 20350 }, { "epoch": 6.787858572381587, "loss": 0.25368139147758484, "step": 20350 }, { "ce_loss": 0.02152234874665737, "epoch": 6.787858572381587, "step": 20350 }, { "distill_loss": 0.16655249893665314, "epoch": 6.787858572381587, "step": 20350 }, { "epoch": 6.787858572381587, "ref_ce_loss": 0.0653744712471962, "step": 20350 }, { "epoch": 6.787858572381587, "loss": 1.1269481182098389, "step": 20350 }, { "ce_loss": 0.04487686976790428, "epoch": 6.787858572381587, "step": 20350 }, { "distill_loss": 0.13662518560886383, "epoch": 6.787858572381587, "step": 20350 }, { "epoch": 6.787858572381587, "ref_ce_loss": 0.05432640761137009, "step": 20350 }, { "epoch": 6.787858572381587, "loss": 0.4653398394584656, "step": 20350 }, { "ce_loss": 0.04627220332622528, "epoch": 6.787858572381587, "step": 20350 }, { "distill_loss": 0.17873713374137878, "epoch": 6.787858572381587, "step": 20350 }, { "epoch": 6.787858572381587, "ref_ce_loss": 0.044496770948171616, "step": 20350 }, { "epoch": 6.787858572381587, "loss": 0.4285845160484314, "step": 20350 }, { "ce_loss": 0.033552300184965134, "epoch": 6.787858572381587, "step": 20350 }, { "distill_loss": 0.26186275482177734, "epoch": 6.787858572381587, "step": 20350 }, { "epoch": 6.787858572381587, "ref_ce_loss": 0.052286114543676376, "step": 20350 }, { "epoch": 6.791194129419613, "loss": 0.3572, "step": 20360 }, { "epoch": 6.791194129419613, "grad_norm": 2.3414089679718018, "step": 20360 }, { "epoch": 6.791194129419613, "learning_rate": 1.760692086347612e-05, "step": 20360 }, { "epoch": 6.791194129419613, "loss": 0.2956318259239197, "step": 20360 }, { "ce_loss": 0.013998754322528839, "epoch": 6.791194129419613, "step": 20360 }, { "distill_loss": 0.17851464450359344, "epoch": 6.791194129419613, "step": 20360 }, { "epoch": 6.791194129419613, "ref_ce_loss": 0.07696002721786499, "step": 20360 }, { "epoch": 6.791194129419613, "loss": 0.23942914605140686, "step": 20360 }, { "ce_loss": 0.026676956564188004, "epoch": 6.791194129419613, "step": 20360 }, { "distill_loss": 0.11965633928775787, "epoch": 6.791194129419613, "step": 20360 }, { "epoch": 6.791194129419613, "ref_ce_loss": 0.035076532512903214, "step": 20360 }, { "epoch": 6.791194129419613, "loss": 0.22385582327842712, "step": 20360 }, { "ce_loss": 0.03636830672621727, "epoch": 6.791194129419613, "step": 20360 }, { "distill_loss": 0.1141805499792099, "epoch": 6.791194129419613, "step": 20360 }, { "epoch": 6.791194129419613, "ref_ce_loss": 0.046886786818504333, "step": 20360 }, { "epoch": 6.791194129419613, "loss": 0.36579465866088867, "step": 20360 }, { "ce_loss": 0.08306985348463058, "epoch": 6.791194129419613, "step": 20360 }, { "distill_loss": 0.20233827829360962, "epoch": 6.791194129419613, "step": 20360 }, { "epoch": 6.791194129419613, "ref_ce_loss": 0.08025453239679337, "step": 20360 }, { "epoch": 6.794529686457638, "loss": 0.3441, "step": 20370 }, { "epoch": 6.794529686457638, "grad_norm": 2.2329018115997314, "step": 20370 }, { "epoch": 6.794529686457638, "learning_rate": 1.751182028517373e-05, "step": 20370 }, { "epoch": 6.794529686457638, "loss": 0.2719927430152893, "step": 20370 }, { "ce_loss": 0.061337120831012726, "epoch": 6.794529686457638, "step": 20370 }, { "distill_loss": 0.13906854391098022, "epoch": 6.794529686457638, "step": 20370 }, { "epoch": 6.794529686457638, "ref_ce_loss": 0.07132521271705627, "step": 20370 }, { "epoch": 6.794529686457638, "loss": 0.31627902388572693, "step": 20370 }, { "ce_loss": 0.02854839526116848, "epoch": 6.794529686457638, "step": 20370 }, { "distill_loss": 0.18262380361557007, "epoch": 6.794529686457638, "step": 20370 }, { "epoch": 6.794529686457638, "ref_ce_loss": 0.07285098731517792, "step": 20370 }, { "epoch": 6.794529686457638, "loss": 0.4150804579257965, "step": 20370 }, { "ce_loss": 0.06831822544336319, "epoch": 6.794529686457638, "step": 20370 }, { "distill_loss": 0.2284497320652008, "epoch": 6.794529686457638, "step": 20370 }, { "epoch": 6.794529686457638, "ref_ce_loss": 0.09520136564970016, "step": 20370 }, { "epoch": 6.794529686457638, "loss": 0.43720874190330505, "step": 20370 }, { "ce_loss": 0.061582840979099274, "epoch": 6.794529686457638, "step": 20370 }, { "distill_loss": 0.13514171540737152, "epoch": 6.794529686457638, "step": 20370 }, { "epoch": 6.794529686457638, "ref_ce_loss": 0.027682026848196983, "step": 20370 }, { "epoch": 6.7978652434956635, "loss": 0.3417, "step": 20380 }, { "epoch": 6.7978652434956635, "grad_norm": 3.450296640396118, "step": 20380 }, { "epoch": 6.7978652434956635, "learning_rate": 1.7416961313048767e-05, "step": 20380 }, { "epoch": 6.7978652434956635, "loss": 0.3290284276008606, "step": 20380 }, { "ce_loss": 0.03511648252606392, "epoch": 6.7978652434956635, "step": 20380 }, { "distill_loss": 0.12331343442201614, "epoch": 6.7978652434956635, "step": 20380 }, { "epoch": 6.7978652434956635, "ref_ce_loss": 0.062362030148506165, "step": 20380 }, { "epoch": 6.7978652434956635, "loss": 0.3422921895980835, "step": 20380 }, { "ce_loss": 0.05525599792599678, "epoch": 6.7978652434956635, "step": 20380 }, { "distill_loss": 0.10891838371753693, "epoch": 6.7978652434956635, "step": 20380 }, { "epoch": 6.7978652434956635, "ref_ce_loss": 0.04750807583332062, "step": 20380 }, { "epoch": 6.7978652434956635, "loss": 0.33280348777770996, "step": 20380 }, { "ce_loss": 0.05837497115135193, "epoch": 6.7978652434956635, "step": 20380 }, { "distill_loss": 0.16548392176628113, "epoch": 6.7978652434956635, "step": 20380 }, { "epoch": 6.7978652434956635, "ref_ce_loss": 0.06457653641700745, "step": 20380 }, { "epoch": 6.7978652434956635, "loss": 1.3425191640853882, "step": 20380 }, { "ce_loss": 0.01805054396390915, "epoch": 6.7978652434956635, "step": 20380 }, { "distill_loss": 0.13048408925533295, "epoch": 6.7978652434956635, "step": 20380 }, { "epoch": 6.7978652434956635, "ref_ce_loss": 0.0793338418006897, "step": 20380 }, { "epoch": 6.801200800533689, "loss": 0.3615, "step": 20390 }, { "epoch": 6.801200800533689, "grad_norm": 2.20592999458313, "step": 20390 }, { "epoch": 6.801200800533689, "learning_rate": 1.7322344120086662e-05, "step": 20390 }, { "epoch": 6.801200800533689, "loss": 0.33714401721954346, "step": 20390 }, { "ce_loss": 0.04753788188099861, "epoch": 6.801200800533689, "step": 20390 }, { "distill_loss": 0.12098333984613419, "epoch": 6.801200800533689, "step": 20390 }, { "epoch": 6.801200800533689, "ref_ce_loss": 0.06287845969200134, "step": 20390 }, { "epoch": 6.801200800533689, "loss": 0.30119767785072327, "step": 20390 }, { "ce_loss": 0.01172428298741579, "epoch": 6.801200800533689, "step": 20390 }, { "distill_loss": 0.11199269443750381, "epoch": 6.801200800533689, "step": 20390 }, { "epoch": 6.801200800533689, "ref_ce_loss": 0.042432911694049835, "step": 20390 }, { "epoch": 6.801200800533689, "loss": 0.4813525676727295, "step": 20390 }, { "ce_loss": 0.029576046392321587, "epoch": 6.801200800533689, "step": 20390 }, { "distill_loss": 0.1122434064745903, "epoch": 6.801200800533689, "step": 20390 }, { "epoch": 6.801200800533689, "ref_ce_loss": 0.05790295824408531, "step": 20390 }, { "epoch": 6.801200800533689, "loss": 0.2784154415130615, "step": 20390 }, { "ce_loss": 0.07036291062831879, "epoch": 6.801200800533689, "step": 20390 }, { "distill_loss": 0.15373992919921875, "epoch": 6.801200800533689, "step": 20390 }, { "epoch": 6.801200800533689, "ref_ce_loss": 0.044378459453582764, "step": 20390 }, { "epoch": 6.804536357571714, "loss": 0.3143, "step": 20400 }, { "epoch": 6.804536357571714, "grad_norm": 2.1802616119384766, "step": 20400 }, { "epoch": 6.804536357571714, "learning_rate": 1.722796887883183e-05, "step": 20400 }, { "epoch": 6.804536357571714, "loss": 0.2862167954444885, "step": 20400 }, { "ce_loss": 0.0310842152684927, "epoch": 6.804536357571714, "step": 20400 }, { "distill_loss": 0.15729649364948273, "epoch": 6.804536357571714, "step": 20400 }, { "epoch": 6.804536357571714, "ref_ce_loss": 0.053252577781677246, "step": 20400 }, { "epoch": 6.804536357571714, "loss": 0.29880207777023315, "step": 20400 }, { "ce_loss": 0.06482298672199249, "epoch": 6.804536357571714, "step": 20400 }, { "distill_loss": 0.1452362835407257, "epoch": 6.804536357571714, "step": 20400 }, { "epoch": 6.804536357571714, "ref_ce_loss": 0.08833353221416473, "step": 20400 }, { "epoch": 6.804536357571714, "loss": 0.7638775110244751, "step": 20400 }, { "ce_loss": 0.012667951174080372, "epoch": 6.804536357571714, "step": 20400 }, { "distill_loss": 0.17049390077590942, "epoch": 6.804536357571714, "step": 20400 }, { "epoch": 6.804536357571714, "ref_ce_loss": 0.09940537065267563, "step": 20400 }, { "epoch": 6.804536357571714, "loss": 0.37731772661209106, "step": 20400 }, { "ce_loss": 0.029722534120082855, "epoch": 6.804536357571714, "step": 20400 }, { "distill_loss": 0.24861417710781097, "epoch": 6.804536357571714, "step": 20400 }, { "epoch": 6.804536357571714, "ref_ce_loss": 0.07538484036922455, "step": 20400 }, { "epoch": 6.8078719146097395, "loss": 0.346, "step": 20410 }, { "epoch": 6.8078719146097395, "grad_norm": 2.557342767715454, "step": 20410 }, { "epoch": 6.8078719146097395, "learning_rate": 1.713383576138746e-05, "step": 20410 }, { "epoch": 6.8078719146097395, "loss": 0.2678622603416443, "step": 20410 }, { "ce_loss": 0.05426609516143799, "epoch": 6.8078719146097395, "step": 20410 }, { "distill_loss": 0.1291498839855194, "epoch": 6.8078719146097395, "step": 20410 }, { "epoch": 6.8078719146097395, "ref_ce_loss": 0.07145240902900696, "step": 20410 }, { "epoch": 6.8078719146097395, "loss": 0.30670714378356934, "step": 20410 }, { "ce_loss": 0.03392226621508598, "epoch": 6.8078719146097395, "step": 20410 }, { "distill_loss": 0.18343859910964966, "epoch": 6.8078719146097395, "step": 20410 }, { "epoch": 6.8078719146097395, "ref_ce_loss": 0.056115660816431046, "step": 20410 }, { "epoch": 6.8078719146097395, "loss": 0.6002055406570435, "step": 20410 }, { "ce_loss": 0.013181916438043118, "epoch": 6.8078719146097395, "step": 20410 }, { "distill_loss": 0.25213515758514404, "epoch": 6.8078719146097395, "step": 20410 }, { "epoch": 6.8078719146097395, "ref_ce_loss": 0.0515945628285408, "step": 20410 }, { "epoch": 6.8078719146097395, "loss": 0.29317811131477356, "step": 20410 }, { "ce_loss": 0.06708183139562607, "epoch": 6.8078719146097395, "step": 20410 }, { "distill_loss": 0.15257295966148376, "epoch": 6.8078719146097395, "step": 20410 }, { "epoch": 6.8078719146097395, "ref_ce_loss": 0.06174732744693756, "step": 20410 }, { "epoch": 6.811207471647765, "loss": 0.3556, "step": 20420 }, { "epoch": 6.811207471647765, "grad_norm": 3.1058154106140137, "step": 20420 }, { "epoch": 6.811207471647765, "learning_rate": 1.703994493941523e-05, "step": 20420 }, { "epoch": 6.811207471647765, "loss": 0.3794916868209839, "step": 20420 }, { "ce_loss": 0.038069769740104675, "epoch": 6.811207471647765, "step": 20420 }, { "distill_loss": 0.1606229692697525, "epoch": 6.811207471647765, "step": 20420 }, { "epoch": 6.811207471647765, "ref_ce_loss": 0.059208884835243225, "step": 20420 }, { "epoch": 6.811207471647765, "loss": 0.31073516607284546, "step": 20420 }, { "ce_loss": 0.012998398393392563, "epoch": 6.811207471647765, "step": 20420 }, { "distill_loss": 0.14370596408843994, "epoch": 6.811207471647765, "step": 20420 }, { "epoch": 6.811207471647765, "ref_ce_loss": 0.059843819588422775, "step": 20420 }, { "epoch": 6.811207471647765, "loss": 0.2577630281448364, "step": 20420 }, { "ce_loss": 0.04776522144675255, "epoch": 6.811207471647765, "step": 20420 }, { "distill_loss": 0.099476657807827, "epoch": 6.811207471647765, "step": 20420 }, { "epoch": 6.811207471647765, "ref_ce_loss": 0.060139674693346024, "step": 20420 }, { "epoch": 6.811207471647765, "loss": 0.3549755811691284, "step": 20420 }, { "ce_loss": 0.04084332287311554, "epoch": 6.811207471647765, "step": 20420 }, { "distill_loss": 0.11227469891309738, "epoch": 6.811207471647765, "step": 20420 }, { "epoch": 6.811207471647765, "ref_ce_loss": 0.06732767820358276, "step": 20420 }, { "epoch": 6.81454302868579, "loss": 0.3226, "step": 20430 }, { "epoch": 6.81454302868579, "grad_norm": 3.2220349311828613, "step": 20430 }, { "epoch": 6.81454302868579, "learning_rate": 1.6946296584134988e-05, "step": 20430 }, { "epoch": 6.81454302868579, "loss": 0.3655773103237152, "step": 20430 }, { "ce_loss": 0.01781701296567917, "epoch": 6.81454302868579, "step": 20430 }, { "distill_loss": 0.27449363470077515, "epoch": 6.81454302868579, "step": 20430 }, { "epoch": 6.81454302868579, "ref_ce_loss": 0.05400577932596207, "step": 20430 }, { "epoch": 6.81454302868579, "loss": 0.283231258392334, "step": 20430 }, { "ce_loss": 0.024366069585084915, "epoch": 6.81454302868579, "step": 20430 }, { "distill_loss": 0.17268988490104675, "epoch": 6.81454302868579, "step": 20430 }, { "epoch": 6.81454302868579, "ref_ce_loss": 0.043967101722955704, "step": 20430 }, { "epoch": 6.81454302868579, "loss": 0.31054821610450745, "step": 20430 }, { "ce_loss": 0.08263547718524933, "epoch": 6.81454302868579, "step": 20430 }, { "distill_loss": 0.14180082082748413, "epoch": 6.81454302868579, "step": 20430 }, { "epoch": 6.81454302868579, "ref_ce_loss": 0.07086239755153656, "step": 20430 }, { "epoch": 6.81454302868579, "loss": 0.21523864567279816, "step": 20430 }, { "ce_loss": 0.017624469473958015, "epoch": 6.81454302868579, "step": 20430 }, { "distill_loss": 0.14015556871891022, "epoch": 6.81454302868579, "step": 20430 }, { "epoch": 6.81454302868579, "ref_ce_loss": 0.04783370718359947, "step": 20430 }, { "epoch": 6.8178785857238156, "loss": 0.3806, "step": 20440 }, { "epoch": 6.8178785857238156, "grad_norm": 1.9579981565475464, "step": 20440 }, { "epoch": 6.8178785857238156, "learning_rate": 1.685289086632433e-05, "step": 20440 }, { "epoch": 6.8178785857238156, "loss": 0.3771095275878906, "step": 20440 }, { "ce_loss": 0.07861079275608063, "epoch": 6.8178785857238156, "step": 20440 }, { "distill_loss": 0.21288646757602692, "epoch": 6.8178785857238156, "step": 20440 }, { "epoch": 6.8178785857238156, "ref_ce_loss": 0.05587618425488472, "step": 20440 }, { "epoch": 6.8178785857238156, "loss": 0.2200825810432434, "step": 20440 }, { "ce_loss": 0.04254712909460068, "epoch": 6.8178785857238156, "step": 20440 }, { "distill_loss": 0.11262678354978561, "epoch": 6.8178785857238156, "step": 20440 }, { "epoch": 6.8178785857238156, "ref_ce_loss": 0.04462994635105133, "step": 20440 }, { "epoch": 6.8178785857238156, "loss": 0.29384127259254456, "step": 20440 }, { "ce_loss": 0.03630302846431732, "epoch": 6.8178785857238156, "step": 20440 }, { "distill_loss": 0.11645830422639847, "epoch": 6.8178785857238156, "step": 20440 }, { "epoch": 6.8178785857238156, "ref_ce_loss": 0.037373896688222885, "step": 20440 }, { "epoch": 6.8178785857238156, "loss": 0.39884600043296814, "step": 20440 }, { "ce_loss": 0.032204754650592804, "epoch": 6.8178785857238156, "step": 20440 }, { "distill_loss": 0.19181369245052338, "epoch": 6.8178785857238156, "step": 20440 }, { "epoch": 6.8178785857238156, "ref_ce_loss": 0.07446800172328949, "step": 20440 }, { "epoch": 6.821214142761841, "loss": 0.3907, "step": 20450 }, { "epoch": 6.821214142761841, "grad_norm": 3.3933627605438232, "step": 20450 }, { "epoch": 6.821214142761841, "learning_rate": 1.6759727956318536e-05, "step": 20450 }, { "epoch": 6.821214142761841, "loss": 0.16477397084236145, "step": 20450 }, { "ce_loss": 0.02636115998029709, "epoch": 6.821214142761841, "step": 20450 }, { "distill_loss": 0.0941682681441307, "epoch": 6.821214142761841, "step": 20450 }, { "epoch": 6.821214142761841, "ref_ce_loss": 0.03172001242637634, "step": 20450 }, { "epoch": 6.821214142761841, "loss": 0.49552544951438904, "step": 20450 }, { "ce_loss": 0.06779889762401581, "epoch": 6.821214142761841, "step": 20450 }, { "distill_loss": 0.1919066607952118, "epoch": 6.821214142761841, "step": 20450 }, { "epoch": 6.821214142761841, "ref_ce_loss": 0.05548325926065445, "step": 20450 }, { "epoch": 6.821214142761841, "loss": 0.293425977230072, "step": 20450 }, { "ce_loss": 0.039039529860019684, "epoch": 6.821214142761841, "step": 20450 }, { "distill_loss": 0.18855513632297516, "epoch": 6.821214142761841, "step": 20450 }, { "epoch": 6.821214142761841, "ref_ce_loss": 0.04807998239994049, "step": 20450 }, { "epoch": 6.821214142761841, "loss": 0.359455406665802, "step": 20450 }, { "ce_loss": 0.05050094425678253, "epoch": 6.821214142761841, "step": 20450 }, { "distill_loss": 0.15178292989730835, "epoch": 6.821214142761841, "step": 20450 }, { "epoch": 6.821214142761841, "ref_ce_loss": 0.039823852479457855, "step": 20450 }, { "epoch": 6.824549699799866, "loss": 0.3371, "step": 20460 }, { "epoch": 6.824549699799866, "grad_norm": 3.3244168758392334, "step": 20460 }, { "epoch": 6.824549699799866, "learning_rate": 1.666680802400992e-05, "step": 20460 }, { "epoch": 6.824549699799866, "loss": 0.3092981278896332, "step": 20460 }, { "ce_loss": 0.045066677033901215, "epoch": 6.824549699799866, "step": 20460 }, { "distill_loss": 0.21357296407222748, "epoch": 6.824549699799866, "step": 20460 }, { "epoch": 6.824549699799866, "ref_ce_loss": 0.03532247617840767, "step": 20460 }, { "epoch": 6.824549699799866, "loss": 0.33261650800704956, "step": 20460 }, { "ce_loss": 0.04242473095655441, "epoch": 6.824549699799866, "step": 20460 }, { "distill_loss": 0.16028018295764923, "epoch": 6.824549699799866, "step": 20460 }, { "epoch": 6.824549699799866, "ref_ce_loss": 0.06327745318412781, "step": 20460 }, { "epoch": 6.824549699799866, "loss": 0.38244354724884033, "step": 20460 }, { "ce_loss": 0.06900951266288757, "epoch": 6.824549699799866, "step": 20460 }, { "distill_loss": 0.16052497923374176, "epoch": 6.824549699799866, "step": 20460 }, { "epoch": 6.824549699799866, "ref_ce_loss": 0.06689080595970154, "step": 20460 }, { "epoch": 6.824549699799866, "loss": 0.277946412563324, "step": 20460 }, { "ce_loss": 0.024767400696873665, "epoch": 6.824549699799866, "step": 20460 }, { "distill_loss": 0.1381060630083084, "epoch": 6.824549699799866, "step": 20460 }, { "epoch": 6.824549699799866, "ref_ce_loss": 0.05307367444038391, "step": 20460 }, { "epoch": 6.827885256837892, "loss": 0.3433, "step": 20470 }, { "epoch": 6.827885256837892, "grad_norm": 4.6717658042907715, "step": 20470 }, { "epoch": 6.827885256837892, "learning_rate": 1.657413123884782e-05, "step": 20470 }, { "epoch": 6.827885256837892, "loss": 0.3429701626300812, "step": 20470 }, { "ce_loss": 0.05055519938468933, "epoch": 6.827885256837892, "step": 20470 }, { "distill_loss": 0.20971356332302094, "epoch": 6.827885256837892, "step": 20470 }, { "epoch": 6.827885256837892, "ref_ce_loss": 0.07031997293233871, "step": 20470 }, { "epoch": 6.827885256837892, "loss": 1.1225718259811401, "step": 20470 }, { "ce_loss": 0.0169549398124218, "epoch": 6.827885256837892, "step": 20470 }, { "distill_loss": 0.213147833943367, "epoch": 6.827885256837892, "step": 20470 }, { "epoch": 6.827885256837892, "ref_ce_loss": 0.09352371096611023, "step": 20470 }, { "epoch": 6.827885256837892, "loss": 1.0933277606964111, "step": 20470 }, { "ce_loss": 0.059152502566576004, "epoch": 6.827885256837892, "step": 20470 }, { "distill_loss": 0.19744914770126343, "epoch": 6.827885256837892, "step": 20470 }, { "epoch": 6.827885256837892, "ref_ce_loss": 0.06428895890712738, "step": 20470 }, { "epoch": 6.827885256837892, "loss": 0.29990047216415405, "step": 20470 }, { "ce_loss": 0.030425040051341057, "epoch": 6.827885256837892, "step": 20470 }, { "distill_loss": 0.1377599537372589, "epoch": 6.827885256837892, "step": 20470 }, { "epoch": 6.827885256837892, "ref_ce_loss": 0.057538341730833054, "step": 20470 }, { "epoch": 6.831220813875917, "loss": 0.3842, "step": 20480 }, { "epoch": 6.831220813875917, "grad_norm": 2.320603132247925, "step": 20480 }, { "epoch": 6.831220813875917, "learning_rate": 1.6481697769838166e-05, "step": 20480 }, { "epoch": 6.831220813875917, "loss": 0.13774797320365906, "step": 20480 }, { "ce_loss": 0.0031953605357557535, "epoch": 6.831220813875917, "step": 20480 }, { "distill_loss": 0.09736062586307526, "epoch": 6.831220813875917, "step": 20480 }, { "epoch": 6.831220813875917, "ref_ce_loss": 0.019617626443505287, "step": 20480 }, { "epoch": 6.831220813875917, "loss": 0.14723575115203857, "step": 20480 }, { "ce_loss": 0.025522742420434952, "epoch": 6.831220813875917, "step": 20480 }, { "distill_loss": 0.08433471620082855, "epoch": 6.831220813875917, "step": 20480 }, { "epoch": 6.831220813875917, "ref_ce_loss": 0.03724829852581024, "step": 20480 }, { "epoch": 6.831220813875917, "loss": 0.38259050250053406, "step": 20480 }, { "ce_loss": 0.0821247398853302, "epoch": 6.831220813875917, "step": 20480 }, { "distill_loss": 0.19539779424667358, "epoch": 6.831220813875917, "step": 20480 }, { "epoch": 6.831220813875917, "ref_ce_loss": 0.04475909844040871, "step": 20480 }, { "epoch": 6.831220813875917, "loss": 0.3221023976802826, "step": 20480 }, { "ce_loss": 0.05226357653737068, "epoch": 6.831220813875917, "step": 20480 }, { "distill_loss": 0.12409257888793945, "epoch": 6.831220813875917, "step": 20480 }, { "epoch": 6.831220813875917, "ref_ce_loss": 0.10269239544868469, "step": 20480 }, { "epoch": 6.834556370913942, "loss": 0.3303, "step": 20490 }, { "epoch": 6.834556370913942, "grad_norm": 2.2937734127044678, "step": 20490 }, { "epoch": 6.834556370913942, "learning_rate": 1.6389507785543067e-05, "step": 20490 }, { "epoch": 6.834556370913942, "loss": 0.3213595747947693, "step": 20490 }, { "ce_loss": 0.0633217841386795, "epoch": 6.834556370913942, "step": 20490 }, { "distill_loss": 0.1517091989517212, "epoch": 6.834556370913942, "step": 20490 }, { "epoch": 6.834556370913942, "ref_ce_loss": 0.060797836631536484, "step": 20490 }, { "epoch": 6.834556370913942, "loss": 0.45498204231262207, "step": 20490 }, { "ce_loss": 0.09181714057922363, "epoch": 6.834556370913942, "step": 20490 }, { "distill_loss": 0.2289491891860962, "epoch": 6.834556370913942, "step": 20490 }, { "epoch": 6.834556370913942, "ref_ce_loss": 0.10269571840763092, "step": 20490 }, { "epoch": 6.834556370913942, "loss": 0.4061187207698822, "step": 20490 }, { "ce_loss": 0.0254044346511364, "epoch": 6.834556370913942, "step": 20490 }, { "distill_loss": 0.14261065423488617, "epoch": 6.834556370913942, "step": 20490 }, { "epoch": 6.834556370913942, "ref_ce_loss": 0.06448842585086823, "step": 20490 }, { "epoch": 6.834556370913942, "loss": 0.27845606207847595, "step": 20490 }, { "ce_loss": 0.08355095237493515, "epoch": 6.834556370913942, "step": 20490 }, { "distill_loss": 0.11640045046806335, "epoch": 6.834556370913942, "step": 20490 }, { "epoch": 6.834556370913942, "ref_ce_loss": 0.0555226132273674, "step": 20490 }, { "epoch": 6.837891927951968, "loss": 0.3147, "step": 20500 }, { "epoch": 6.837891927951968, "grad_norm": 2.9679994583129883, "step": 20500 }, { "epoch": 6.837891927951968, "learning_rate": 1.6297561454080727e-05, "step": 20500 }, { "epoch": 6.837891927951968, "loss": 0.24702830612659454, "step": 20500 }, { "ce_loss": 0.05579338222742081, "epoch": 6.837891927951968, "step": 20500 }, { "distill_loss": 0.12925627827644348, "epoch": 6.837891927951968, "step": 20500 }, { "epoch": 6.837891927951968, "ref_ce_loss": 0.06171823665499687, "step": 20500 }, { "epoch": 6.837891927951968, "loss": 0.30632027983665466, "step": 20500 }, { "ce_loss": 0.034926217049360275, "epoch": 6.837891927951968, "step": 20500 }, { "distill_loss": 0.16829253733158112, "epoch": 6.837891927951968, "step": 20500 }, { "epoch": 6.837891927951968, "ref_ce_loss": 0.0464189313352108, "step": 20500 }, { "epoch": 6.837891927951968, "loss": 0.17535685002803802, "step": 20500 }, { "ce_loss": 0.0038707999046891928, "epoch": 6.837891927951968, "step": 20500 }, { "distill_loss": 0.08704368770122528, "epoch": 6.837891927951968, "step": 20500 }, { "epoch": 6.837891927951968, "ref_ce_loss": 0.04450593888759613, "step": 20500 }, { "epoch": 6.837891927951968, "loss": 0.8304358720779419, "step": 20500 }, { "ce_loss": 0.09895811975002289, "epoch": 6.837891927951968, "step": 20500 }, { "distill_loss": 0.14359821379184723, "epoch": 6.837891927951968, "step": 20500 }, { "epoch": 6.837891927951968, "ref_ce_loss": 0.05015580728650093, "step": 20500 }, { "epoch": 6.841227484989993, "loss": 0.317, "step": 20510 }, { "epoch": 6.841227484989993, "grad_norm": 1.9534635543823242, "step": 20510 }, { "epoch": 6.841227484989993, "learning_rate": 1.6205858943125005e-05, "step": 20510 }, { "epoch": 6.841227484989993, "loss": 0.39088451862335205, "step": 20510 }, { "ce_loss": 0.03835386037826538, "epoch": 6.841227484989993, "step": 20510 }, { "distill_loss": 0.20981791615486145, "epoch": 6.841227484989993, "step": 20510 }, { "epoch": 6.841227484989993, "ref_ce_loss": 0.06102800369262695, "step": 20510 }, { "epoch": 6.841227484989993, "loss": 0.5545916557312012, "step": 20510 }, { "ce_loss": 0.07103858143091202, "epoch": 6.841227484989993, "step": 20510 }, { "distill_loss": 0.17186136543750763, "epoch": 6.841227484989993, "step": 20510 }, { "epoch": 6.841227484989993, "ref_ce_loss": 0.06757304817438126, "step": 20510 }, { "epoch": 6.841227484989993, "loss": 0.1919550895690918, "step": 20510 }, { "ce_loss": 0.0106651084497571, "epoch": 6.841227484989993, "step": 20510 }, { "distill_loss": 0.13799473643302917, "epoch": 6.841227484989993, "step": 20510 }, { "epoch": 6.841227484989993, "ref_ce_loss": 0.02933264710009098, "step": 20510 }, { "epoch": 6.841227484989993, "loss": 0.4210149049758911, "step": 20510 }, { "ce_loss": 0.015158873051404953, "epoch": 6.841227484989993, "step": 20510 }, { "distill_loss": 0.18024934828281403, "epoch": 6.841227484989993, "step": 20510 }, { "epoch": 6.841227484989993, "ref_ce_loss": 0.06706501543521881, "step": 20510 }, { "epoch": 6.844563042028018, "loss": 0.3569, "step": 20520 }, { "epoch": 6.844563042028018, "grad_norm": 4.50359582901001, "step": 20520 }, { "epoch": 6.844563042028018, "learning_rate": 1.6114400419905067e-05, "step": 20520 }, { "epoch": 6.844563042028018, "loss": 0.35514190793037415, "step": 20520 }, { "ce_loss": 0.08249351382255554, "epoch": 6.844563042028018, "step": 20520 }, { "distill_loss": 0.13066443800926208, "epoch": 6.844563042028018, "step": 20520 }, { "epoch": 6.844563042028018, "ref_ce_loss": 0.041989509016275406, "step": 20520 }, { "epoch": 6.844563042028018, "loss": 0.26543962955474854, "step": 20520 }, { "ce_loss": 0.0641234815120697, "epoch": 6.844563042028018, "step": 20520 }, { "distill_loss": 0.14749526977539062, "epoch": 6.844563042028018, "step": 20520 }, { "epoch": 6.844563042028018, "ref_ce_loss": 0.03656027466058731, "step": 20520 }, { "epoch": 6.844563042028018, "loss": 0.29692205786705017, "step": 20520 }, { "ce_loss": 0.01707381382584572, "epoch": 6.844563042028018, "step": 20520 }, { "distill_loss": 0.1400364339351654, "epoch": 6.844563042028018, "step": 20520 }, { "epoch": 6.844563042028018, "ref_ce_loss": 0.06183823198080063, "step": 20520 }, { "epoch": 6.844563042028018, "loss": 0.24452118575572968, "step": 20520 }, { "ce_loss": 0.027332574129104614, "epoch": 6.844563042028018, "step": 20520 }, { "distill_loss": 0.14121589064598083, "epoch": 6.844563042028018, "step": 20520 }, { "epoch": 6.844563042028018, "ref_ce_loss": 0.05127198249101639, "step": 20520 }, { "epoch": 6.847898599066044, "loss": 0.3207, "step": 20530 }, { "epoch": 6.847898599066044, "grad_norm": 3.143054485321045, "step": 20530 }, { "epoch": 6.847898599066044, "learning_rate": 1.6023186051205243e-05, "step": 20530 }, { "epoch": 6.847898599066044, "loss": 0.29538893699645996, "step": 20530 }, { "ce_loss": 0.026194782927632332, "epoch": 6.847898599066044, "step": 20530 }, { "distill_loss": 0.15457136929035187, "epoch": 6.847898599066044, "step": 20530 }, { "epoch": 6.847898599066044, "ref_ce_loss": 0.046603668481111526, "step": 20530 }, { "epoch": 6.847898599066044, "loss": 0.29159462451934814, "step": 20530 }, { "ce_loss": 0.01935706101357937, "epoch": 6.847898599066044, "step": 20530 }, { "distill_loss": 0.1832038313150406, "epoch": 6.847898599066044, "step": 20530 }, { "epoch": 6.847898599066044, "ref_ce_loss": 0.04731707647442818, "step": 20530 }, { "epoch": 6.847898599066044, "loss": 0.4152846038341522, "step": 20530 }, { "ce_loss": 0.02133793942630291, "epoch": 6.847898599066044, "step": 20530 }, { "distill_loss": 0.1754729300737381, "epoch": 6.847898599066044, "step": 20530 }, { "epoch": 6.847898599066044, "ref_ce_loss": 0.0643564984202385, "step": 20530 }, { "epoch": 6.847898599066044, "loss": 0.35213616490364075, "step": 20530 }, { "ce_loss": 0.028784506022930145, "epoch": 6.847898599066044, "step": 20530 }, { "distill_loss": 0.17917081713676453, "epoch": 6.847898599066044, "step": 20530 }, { "epoch": 6.847898599066044, "ref_ce_loss": 0.06949307024478912, "step": 20530 }, { "epoch": 6.851234156104069, "loss": 0.383, "step": 20540 }, { "epoch": 6.851234156104069, "grad_norm": 5.041261196136475, "step": 20540 }, { "epoch": 6.851234156104069, "learning_rate": 1.59322160033645e-05, "step": 20540 }, { "epoch": 6.851234156104069, "loss": 0.3897348940372467, "step": 20540 }, { "ce_loss": 0.045651961117982864, "epoch": 6.851234156104069, "step": 20540 }, { "distill_loss": 0.2878296971321106, "epoch": 6.851234156104069, "step": 20540 }, { "epoch": 6.851234156104069, "ref_ce_loss": 0.05607311427593231, "step": 20540 }, { "epoch": 6.851234156104069, "loss": 0.36378687620162964, "step": 20540 }, { "ce_loss": 0.056211087852716446, "epoch": 6.851234156104069, "step": 20540 }, { "distill_loss": 0.2356104701757431, "epoch": 6.851234156104069, "step": 20540 }, { "epoch": 6.851234156104069, "ref_ce_loss": 0.05061378329992294, "step": 20540 }, { "epoch": 6.851234156104069, "loss": 0.3529844284057617, "step": 20540 }, { "ce_loss": 0.02915358357131481, "epoch": 6.851234156104069, "step": 20540 }, { "distill_loss": 0.15124404430389404, "epoch": 6.851234156104069, "step": 20540 }, { "epoch": 6.851234156104069, "ref_ce_loss": 0.08586305379867554, "step": 20540 }, { "epoch": 6.851234156104069, "loss": 0.3024942874908447, "step": 20540 }, { "ce_loss": 0.05238991603255272, "epoch": 6.851234156104069, "step": 20540 }, { "distill_loss": 0.14373472332954407, "epoch": 6.851234156104069, "step": 20540 }, { "epoch": 6.851234156104069, "ref_ce_loss": 0.057221703231334686, "step": 20540 }, { "epoch": 6.854569713142094, "loss": 0.3917, "step": 20550 }, { "epoch": 6.854569713142094, "grad_norm": 7.488528728485107, "step": 20550 }, { "epoch": 6.854569713142094, "learning_rate": 1.5841490442276332e-05, "step": 20550 }, { "epoch": 6.854569713142094, "loss": 0.41819265484809875, "step": 20550 }, { "ce_loss": 0.018714766949415207, "epoch": 6.854569713142094, "step": 20550 }, { "distill_loss": 0.3183062672615051, "epoch": 6.854569713142094, "step": 20550 }, { "epoch": 6.854569713142094, "ref_ce_loss": 0.03689192607998848, "step": 20550 }, { "epoch": 6.854569713142094, "loss": 0.36515775322914124, "step": 20550 }, { "ce_loss": 0.08507240563631058, "epoch": 6.854569713142094, "step": 20550 }, { "distill_loss": 0.14640793204307556, "epoch": 6.854569713142094, "step": 20550 }, { "epoch": 6.854569713142094, "ref_ce_loss": 0.05818329378962517, "step": 20550 }, { "epoch": 6.854569713142094, "loss": 0.7347936034202576, "step": 20550 }, { "ce_loss": 0.09090854972600937, "epoch": 6.854569713142094, "step": 20550 }, { "distill_loss": 0.41110169887542725, "epoch": 6.854569713142094, "step": 20550 }, { "epoch": 6.854569713142094, "ref_ce_loss": 0.0624711699783802, "step": 20550 }, { "epoch": 6.854569713142094, "loss": 0.33083295822143555, "step": 20550 }, { "ce_loss": 0.034476906061172485, "epoch": 6.854569713142094, "step": 20550 }, { "distill_loss": 0.14133962988853455, "epoch": 6.854569713142094, "step": 20550 }, { "epoch": 6.854569713142094, "ref_ce_loss": 0.05410853400826454, "step": 20550 }, { "epoch": 6.85790527018012, "loss": 0.4608, "step": 20560 }, { "epoch": 6.85790527018012, "grad_norm": 8.407671928405762, "step": 20560 }, { "epoch": 6.85790527018012, "learning_rate": 1.575100953338838e-05, "step": 20560 }, { "epoch": 6.85790527018012, "loss": 0.39217668771743774, "step": 20560 }, { "ce_loss": 0.049609459936618805, "epoch": 6.85790527018012, "step": 20560 }, { "distill_loss": 0.2388307750225067, "epoch": 6.85790527018012, "step": 20560 }, { "epoch": 6.85790527018012, "ref_ce_loss": 0.055172622203826904, "step": 20560 }, { "epoch": 6.85790527018012, "loss": 0.503061056137085, "step": 20560 }, { "ce_loss": 0.029303135350346565, "epoch": 6.85790527018012, "step": 20560 }, { "distill_loss": 0.3383800983428955, "epoch": 6.85790527018012, "step": 20560 }, { "epoch": 6.85790527018012, "ref_ce_loss": 0.043822310864925385, "step": 20560 }, { "epoch": 6.85790527018012, "loss": 0.37875670194625854, "step": 20560 }, { "ce_loss": 0.018401410430669785, "epoch": 6.85790527018012, "step": 20560 }, { "distill_loss": 0.28223758935928345, "epoch": 6.85790527018012, "step": 20560 }, { "epoch": 6.85790527018012, "ref_ce_loss": 0.06798820197582245, "step": 20560 }, { "epoch": 6.85790527018012, "loss": 0.45275554060935974, "step": 20560 }, { "ce_loss": 0.05281388759613037, "epoch": 6.85790527018012, "step": 20560 }, { "distill_loss": 0.3191457688808441, "epoch": 6.85790527018012, "step": 20560 }, { "epoch": 6.85790527018012, "ref_ce_loss": 0.05630598962306976, "step": 20560 }, { "epoch": 6.861240827218145, "loss": 0.5633, "step": 20570 }, { "epoch": 6.861240827218145, "grad_norm": 7.339962482452393, "step": 20570 }, { "epoch": 6.861240827218145, "learning_rate": 1.566077344170214e-05, "step": 20570 }, { "epoch": 6.861240827218145, "loss": 0.2997542917728424, "step": 20570 }, { "ce_loss": 0.01282829511910677, "epoch": 6.861240827218145, "step": 20570 }, { "distill_loss": 0.21651379764080048, "epoch": 6.861240827218145, "step": 20570 }, { "epoch": 6.861240827218145, "ref_ce_loss": 0.0550638884305954, "step": 20570 }, { "epoch": 6.861240827218145, "loss": 0.7980534434318542, "step": 20570 }, { "ce_loss": 0.06555254012346268, "epoch": 6.861240827218145, "step": 20570 }, { "distill_loss": 0.702267050743103, "epoch": 6.861240827218145, "step": 20570 }, { "epoch": 6.861240827218145, "ref_ce_loss": 0.030158137902617455, "step": 20570 }, { "epoch": 6.861240827218145, "loss": 1.2472227811813354, "step": 20570 }, { "ce_loss": 0.11841049045324326, "epoch": 6.861240827218145, "step": 20570 }, { "distill_loss": 0.9726265072822571, "epoch": 6.861240827218145, "step": 20570 }, { "epoch": 6.861240827218145, "ref_ce_loss": 0.08595526963472366, "step": 20570 }, { "epoch": 6.861240827218145, "loss": 0.608909010887146, "step": 20570 }, { "ce_loss": 0.08853866904973984, "epoch": 6.861240827218145, "step": 20570 }, { "distill_loss": 0.4263076186180115, "epoch": 6.861240827218145, "step": 20570 }, { "epoch": 6.861240827218145, "ref_ce_loss": 0.05566973239183426, "step": 20570 }, { "epoch": 6.8645763842561704, "loss": 0.633, "step": 20580 }, { "epoch": 6.8645763842561704, "grad_norm": 9.359967231750488, "step": 20580 }, { "epoch": 6.8645763842561704, "learning_rate": 1.557078233177268e-05, "step": 20580 }, { "epoch": 6.8645763842561704, "loss": 0.37335649132728577, "step": 20580 }, { "ce_loss": 0.04577264189720154, "epoch": 6.8645763842561704, "step": 20580 }, { "distill_loss": 0.2800779938697815, "epoch": 6.8645763842561704, "step": 20580 }, { "epoch": 6.8645763842561704, "ref_ce_loss": 0.04734800010919571, "step": 20580 }, { "epoch": 6.8645763842561704, "loss": 0.6135170459747314, "step": 20580 }, { "ce_loss": 0.0938236191868782, "epoch": 6.8645763842561704, "step": 20580 }, { "distill_loss": 0.4432787001132965, "epoch": 6.8645763842561704, "step": 20580 }, { "epoch": 6.8645763842561704, "ref_ce_loss": 0.06203773617744446, "step": 20580 }, { "epoch": 6.8645763842561704, "loss": 0.5161895751953125, "step": 20580 }, { "ce_loss": 0.0030224386136978865, "epoch": 6.8645763842561704, "step": 20580 }, { "distill_loss": 0.33014774322509766, "epoch": 6.8645763842561704, "step": 20580 }, { "epoch": 6.8645763842561704, "ref_ce_loss": 0.04769544303417206, "step": 20580 }, { "epoch": 6.8645763842561704, "loss": 0.5059229135513306, "step": 20580 }, { "ce_loss": 0.009971593506634235, "epoch": 6.8645763842561704, "step": 20580 }, { "distill_loss": 0.40563908219337463, "epoch": 6.8645763842561704, "step": 20580 }, { "epoch": 6.8645763842561704, "ref_ce_loss": 0.051717959344387054, "step": 20580 }, { "epoch": 6.867911941294196, "loss": 0.4427, "step": 20590 }, { "epoch": 6.867911941294196, "grad_norm": 6.026227951049805, "step": 20590 }, { "epoch": 6.867911941294196, "learning_rate": 1.5481036367708212e-05, "step": 20590 }, { "epoch": 6.867911941294196, "loss": 0.72898268699646, "step": 20590 }, { "ce_loss": 0.056139182299375534, "epoch": 6.867911941294196, "step": 20590 }, { "distill_loss": 0.20178236067295074, "epoch": 6.867911941294196, "step": 20590 }, { "epoch": 6.867911941294196, "ref_ce_loss": 0.06525614857673645, "step": 20590 }, { "epoch": 6.867911941294196, "loss": 0.3133620023727417, "step": 20590 }, { "ce_loss": 0.03392108902335167, "epoch": 6.867911941294196, "step": 20590 }, { "distill_loss": 0.15026448667049408, "epoch": 6.867911941294196, "step": 20590 }, { "epoch": 6.867911941294196, "ref_ce_loss": 0.06486678868532181, "step": 20590 }, { "epoch": 6.867911941294196, "loss": 0.31791892647743225, "step": 20590 }, { "ce_loss": 0.046934567391872406, "epoch": 6.867911941294196, "step": 20590 }, { "distill_loss": 0.19896210730075836, "epoch": 6.867911941294196, "step": 20590 }, { "epoch": 6.867911941294196, "ref_ce_loss": 0.05313270166516304, "step": 20590 }, { "epoch": 6.867911941294196, "loss": 0.40088412165641785, "step": 20590 }, { "ce_loss": 0.04394695535302162, "epoch": 6.867911941294196, "step": 20590 }, { "distill_loss": 0.24145759642124176, "epoch": 6.867911941294196, "step": 20590 }, { "epoch": 6.867911941294196, "ref_ce_loss": 0.04558607190847397, "step": 20590 }, { "epoch": 6.871247498332221, "loss": 0.3944, "step": 20600 }, { "epoch": 6.871247498332221, "grad_norm": 2.8247768878936768, "step": 20600 }, { "epoch": 6.871247498332221, "learning_rate": 1.539153571317e-05, "step": 20600 }, { "epoch": 6.871247498332221, "loss": 0.5353904962539673, "step": 20600 }, { "ce_loss": 0.08445299416780472, "epoch": 6.871247498332221, "step": 20600 }, { "distill_loss": 0.3093477785587311, "epoch": 6.871247498332221, "step": 20600 }, { "epoch": 6.871247498332221, "ref_ce_loss": 0.08602926880121231, "step": 20600 }, { "epoch": 6.871247498332221, "loss": 0.29699471592903137, "step": 20600 }, { "ce_loss": 0.029342997819185257, "epoch": 6.871247498332221, "step": 20600 }, { "distill_loss": 0.22840334475040436, "epoch": 6.871247498332221, "step": 20600 }, { "epoch": 6.871247498332221, "ref_ce_loss": 0.0391845703125, "step": 20600 }, { "epoch": 6.871247498332221, "loss": 0.20265303552150726, "step": 20600 }, { "ce_loss": 0.005748094525188208, "epoch": 6.871247498332221, "step": 20600 }, { "distill_loss": 0.14643459022045135, "epoch": 6.871247498332221, "step": 20600 }, { "epoch": 6.871247498332221, "ref_ce_loss": 0.05027634650468826, "step": 20600 }, { "epoch": 6.871247498332221, "loss": 0.46097424626350403, "step": 20600 }, { "ce_loss": 0.04106662794947624, "epoch": 6.871247498332221, "step": 20600 }, { "distill_loss": 0.2435900866985321, "epoch": 6.871247498332221, "step": 20600 }, { "epoch": 6.871247498332221, "ref_ce_loss": 0.03964756056666374, "step": 20600 }, { "epoch": 6.8745830553702465, "loss": 0.5758, "step": 20610 }, { "epoch": 6.8745830553702465, "grad_norm": 15.702533721923828, "step": 20610 }, { "epoch": 6.8745830553702465, "learning_rate": 1.5302280531371957e-05, "step": 20610 }, { "epoch": 6.8745830553702465, "loss": 0.4692019820213318, "step": 20610 }, { "ce_loss": 0.039602942764759064, "epoch": 6.8745830553702465, "step": 20610 }, { "distill_loss": 0.373125821352005, "epoch": 6.8745830553702465, "step": 20610 }, { "epoch": 6.8745830553702465, "ref_ce_loss": 0.046325571835041046, "step": 20610 }, { "epoch": 6.8745830553702465, "loss": 0.7239594459533691, "step": 20610 }, { "ce_loss": 0.018161263316869736, "epoch": 6.8745830553702465, "step": 20610 }, { "distill_loss": 0.6537948846817017, "epoch": 6.8745830553702465, "step": 20610 }, { "epoch": 6.8745830553702465, "ref_ce_loss": 0.03868037089705467, "step": 20610 }, { "epoch": 6.8745830553702465, "loss": 0.6063088178634644, "step": 20610 }, { "ce_loss": 0.018849315121769905, "epoch": 6.8745830553702465, "step": 20610 }, { "distill_loss": 0.4054892659187317, "epoch": 6.8745830553702465, "step": 20610 }, { "epoch": 6.8745830553702465, "ref_ce_loss": 0.05163077265024185, "step": 20610 }, { "epoch": 6.8745830553702465, "loss": 1.016980767250061, "step": 20610 }, { "ce_loss": 0.017590520903468132, "epoch": 6.8745830553702465, "step": 20610 }, { "distill_loss": 0.8820805549621582, "epoch": 6.8745830553702465, "step": 20610 }, { "epoch": 6.8745830553702465, "ref_ce_loss": 0.05267731845378876, "step": 20610 }, { "epoch": 6.877918612408272, "loss": 0.7963, "step": 20620 }, { "epoch": 6.877918612408272, "grad_norm": 11.103915214538574, "step": 20620 }, { "epoch": 6.877918612408272, "learning_rate": 1.52132709850803e-05, "step": 20620 }, { "epoch": 6.877918612408272, "loss": 1.7030434608459473, "step": 20620 }, { "ce_loss": 0.1394777148962021, "epoch": 6.877918612408272, "step": 20620 }, { "distill_loss": 1.216463565826416, "epoch": 6.877918612408272, "step": 20620 }, { "epoch": 6.877918612408272, "ref_ce_loss": 0.08501607924699783, "step": 20620 }, { "epoch": 6.877918612408272, "loss": 0.8534746170043945, "step": 20620 }, { "ce_loss": 0.029901063069701195, "epoch": 6.877918612408272, "step": 20620 }, { "distill_loss": 0.7174789309501648, "epoch": 6.877918612408272, "step": 20620 }, { "epoch": 6.877918612408272, "ref_ce_loss": 0.05184415727853775, "step": 20620 }, { "epoch": 6.877918612408272, "loss": 0.5241031646728516, "step": 20620 }, { "ce_loss": 0.029480108991265297, "epoch": 6.877918612408272, "step": 20620 }, { "distill_loss": 0.342683345079422, "epoch": 6.877918612408272, "step": 20620 }, { "epoch": 6.877918612408272, "ref_ce_loss": 0.0461571030318737, "step": 20620 }, { "epoch": 6.877918612408272, "loss": 0.7048920392990112, "step": 20620 }, { "ce_loss": 0.04662850871682167, "epoch": 6.877918612408272, "step": 20620 }, { "distill_loss": 0.5464339852333069, "epoch": 6.877918612408272, "step": 20620 }, { "epoch": 6.877918612408272, "ref_ce_loss": 0.07147201150655746, "step": 20620 }, { "epoch": 6.881254169446297, "loss": 0.8145, "step": 20630 }, { "epoch": 6.881254169446297, "grad_norm": 11.292054176330566, "step": 20630 }, { "epoch": 6.881254169446297, "learning_rate": 1.512450723661337e-05, "step": 20630 }, { "epoch": 6.881254169446297, "loss": 0.42995086312294006, "step": 20630 }, { "ce_loss": 0.011972256004810333, "epoch": 6.881254169446297, "step": 20630 }, { "distill_loss": 0.36720559000968933, "epoch": 6.881254169446297, "step": 20630 }, { "epoch": 6.881254169446297, "ref_ce_loss": 0.03696437552571297, "step": 20630 }, { "epoch": 6.881254169446297, "loss": 0.5721352696418762, "step": 20630 }, { "ce_loss": 0.07270863652229309, "epoch": 6.881254169446297, "step": 20630 }, { "distill_loss": 0.39840593934059143, "epoch": 6.881254169446297, "step": 20630 }, { "epoch": 6.881254169446297, "ref_ce_loss": 0.07148131728172302, "step": 20630 }, { "epoch": 6.881254169446297, "loss": 0.4161236882209778, "step": 20630 }, { "ce_loss": 0.025703487917780876, "epoch": 6.881254169446297, "step": 20630 }, { "distill_loss": 0.3056744337081909, "epoch": 6.881254169446297, "step": 20630 }, { "epoch": 6.881254169446297, "ref_ce_loss": 0.04110679775476456, "step": 20630 }, { "epoch": 6.881254169446297, "loss": 0.43479883670806885, "step": 20630 }, { "ce_loss": 0.05821816995739937, "epoch": 6.881254169446297, "step": 20630 }, { "distill_loss": 0.29935696721076965, "epoch": 6.881254169446297, "step": 20630 }, { "epoch": 6.881254169446297, "ref_ce_loss": 0.047724220901727676, "step": 20630 }, { "epoch": 6.8845897264843225, "loss": 0.552, "step": 20640 }, { "epoch": 6.8845897264843225, "grad_norm": 17.464458465576172, "step": 20640 }, { "epoch": 6.8845897264843225, "learning_rate": 1.5035989447841167e-05, "step": 20640 }, { "epoch": 6.8845897264843225, "loss": 0.4306272566318512, "step": 20640 }, { "ce_loss": 0.049284566193819046, "epoch": 6.8845897264843225, "step": 20640 }, { "distill_loss": 0.28677845001220703, "epoch": 6.8845897264843225, "step": 20640 }, { "epoch": 6.8845897264843225, "ref_ce_loss": 0.04678497090935707, "step": 20640 }, { "epoch": 6.8845897264843225, "loss": 0.32574254274368286, "step": 20640 }, { "ce_loss": 0.020122941583395004, "epoch": 6.8845897264843225, "step": 20640 }, { "distill_loss": 0.26398566365242004, "epoch": 6.8845897264843225, "step": 20640 }, { "epoch": 6.8845897264843225, "ref_ce_loss": 0.027873439714312553, "step": 20640 }, { "epoch": 6.8845897264843225, "loss": 0.41913729906082153, "step": 20640 }, { "ce_loss": 0.035174764692783356, "epoch": 6.8845897264843225, "step": 20640 }, { "distill_loss": 0.3242431879043579, "epoch": 6.8845897264843225, "step": 20640 }, { "epoch": 6.8845897264843225, "ref_ce_loss": 0.0594937726855278, "step": 20640 }, { "epoch": 6.8845897264843225, "loss": 1.0001294612884521, "step": 20640 }, { "ce_loss": 0.06655082106590271, "epoch": 6.8845897264843225, "step": 20640 }, { "distill_loss": 0.8158390522003174, "epoch": 6.8845897264843225, "step": 20640 }, { "epoch": 6.8845897264843225, "ref_ce_loss": 0.07071325927972794, "step": 20640 }, { "epoch": 6.887925283522348, "loss": 0.7093, "step": 20650 }, { "epoch": 6.887925283522348, "grad_norm": 16.102046966552734, "step": 20650 }, { "epoch": 6.887925283522348, "learning_rate": 1.494771778018527e-05, "step": 20650 }, { "epoch": 6.887925283522348, "loss": 0.5199611186981201, "step": 20650 }, { "ce_loss": 0.015108607709407806, "epoch": 6.887925283522348, "step": 20650 }, { "distill_loss": 0.42118048667907715, "epoch": 6.887925283522348, "step": 20650 }, { "epoch": 6.887925283522348, "ref_ce_loss": 0.08327829837799072, "step": 20650 }, { "epoch": 6.887925283522348, "loss": 1.312179446220398, "step": 20650 }, { "ce_loss": 0.009233505465090275, "epoch": 6.887925283522348, "step": 20650 }, { "distill_loss": 1.1099575757980347, "epoch": 6.887925283522348, "step": 20650 }, { "epoch": 6.887925283522348, "ref_ce_loss": 0.0498458668589592, "step": 20650 }, { "epoch": 6.887925283522348, "loss": 1.1651020050048828, "step": 20650 }, { "ce_loss": 0.00040370121132582426, "epoch": 6.887925283522348, "step": 20650 }, { "distill_loss": 1.0097606182098389, "epoch": 6.887925283522348, "step": 20650 }, { "epoch": 6.887925283522348, "ref_ce_loss": 0.03661251440644264, "step": 20650 }, { "epoch": 6.887925283522348, "loss": 0.9241766929626465, "step": 20650 }, { "ce_loss": 0.018162589520215988, "epoch": 6.887925283522348, "step": 20650 }, { "distill_loss": 0.8247155547142029, "epoch": 6.887925283522348, "step": 20650 }, { "epoch": 6.887925283522348, "ref_ce_loss": 0.055458761751651764, "step": 20650 }, { "epoch": 6.891260840560373, "loss": 0.8902, "step": 20660 }, { "epoch": 6.891260840560373, "grad_norm": 17.5844669342041, "step": 20660 }, { "epoch": 6.891260840560373, "learning_rate": 1.4859692394618345e-05, "step": 20660 }, { "epoch": 6.891260840560373, "loss": 0.3984909951686859, "step": 20660 }, { "ce_loss": 0.023830199614167213, "epoch": 6.891260840560373, "step": 20660 }, { "distill_loss": 0.2762604355812073, "epoch": 6.891260840560373, "step": 20660 }, { "epoch": 6.891260840560373, "ref_ce_loss": 0.04289623722434044, "step": 20660 }, { "epoch": 6.891260840560373, "loss": 0.5686413645744324, "step": 20660 }, { "ce_loss": 0.06819921731948853, "epoch": 6.891260840560373, "step": 20660 }, { "distill_loss": 0.4240104854106903, "epoch": 6.891260840560373, "step": 20660 }, { "epoch": 6.891260840560373, "ref_ce_loss": 0.07634566724300385, "step": 20660 }, { "epoch": 6.891260840560373, "loss": 0.9278061389923096, "step": 20660 }, { "ce_loss": 0.016938427463173866, "epoch": 6.891260840560373, "step": 20660 }, { "distill_loss": 0.8398166298866272, "epoch": 6.891260840560373, "step": 20660 }, { "epoch": 6.891260840560373, "ref_ce_loss": 0.070817731320858, "step": 20660 }, { "epoch": 6.891260840560373, "loss": 0.5596387386322021, "step": 20660 }, { "ce_loss": 0.05967777222394943, "epoch": 6.891260840560373, "step": 20660 }, { "distill_loss": 0.38053545355796814, "epoch": 6.891260840560373, "step": 20660 }, { "epoch": 6.891260840560373, "ref_ce_loss": 0.043331343680620193, "step": 20660 }, { "epoch": 6.894596397598399, "loss": 0.5821, "step": 20670 }, { "epoch": 6.894596397598399, "grad_norm": 15.2507963180542, "step": 20670 }, { "epoch": 6.894596397598399, "learning_rate": 1.4771913451664002e-05, "step": 20670 }, { "epoch": 6.894596397598399, "loss": 0.5702903866767883, "step": 20670 }, { "ce_loss": 0.028813892975449562, "epoch": 6.894596397598399, "step": 20670 }, { "distill_loss": 0.4404188394546509, "epoch": 6.894596397598399, "step": 20670 }, { "epoch": 6.894596397598399, "ref_ce_loss": 0.0773758515715599, "step": 20670 }, { "epoch": 6.894596397598399, "loss": 0.4521728456020355, "step": 20670 }, { "ce_loss": 0.08183231204748154, "epoch": 6.894596397598399, "step": 20670 }, { "distill_loss": 0.272499144077301, "epoch": 6.894596397598399, "step": 20670 }, { "epoch": 6.894596397598399, "ref_ce_loss": 0.05865732207894325, "step": 20670 }, { "epoch": 6.894596397598399, "loss": 0.5765916705131531, "step": 20670 }, { "ce_loss": 0.02798348292708397, "epoch": 6.894596397598399, "step": 20670 }, { "distill_loss": 0.47397539019584656, "epoch": 6.894596397598399, "step": 20670 }, { "epoch": 6.894596397598399, "ref_ce_loss": 0.0540350042283535, "step": 20670 }, { "epoch": 6.894596397598399, "loss": 0.31577372550964355, "step": 20670 }, { "ce_loss": 0.05203654244542122, "epoch": 6.894596397598399, "step": 20670 }, { "distill_loss": 0.1875409334897995, "epoch": 6.894596397598399, "step": 20670 }, { "epoch": 6.894596397598399, "ref_ce_loss": 0.054577991366386414, "step": 20670 }, { "epoch": 6.897931954636424, "loss": 0.5992, "step": 20680 }, { "epoch": 6.897931954636424, "grad_norm": 8.946640968322754, "step": 20680 }, { "epoch": 6.897931954636424, "learning_rate": 1.4684381111396399e-05, "step": 20680 }, { "epoch": 6.897931954636424, "loss": 0.40662744641304016, "step": 20680 }, { "ce_loss": 0.020231124013662338, "epoch": 6.897931954636424, "step": 20680 }, { "distill_loss": 0.334001749753952, "epoch": 6.897931954636424, "step": 20680 }, { "epoch": 6.897931954636424, "ref_ce_loss": 0.035733092576265335, "step": 20680 }, { "epoch": 6.897931954636424, "loss": 0.4750587046146393, "step": 20680 }, { "ce_loss": 0.03923625871539116, "epoch": 6.897931954636424, "step": 20680 }, { "distill_loss": 0.38633885979652405, "epoch": 6.897931954636424, "step": 20680 }, { "epoch": 6.897931954636424, "ref_ce_loss": 0.04937080293893814, "step": 20680 }, { "epoch": 6.897931954636424, "loss": 0.47791561484336853, "step": 20680 }, { "ce_loss": 0.052632320672273636, "epoch": 6.897931954636424, "step": 20680 }, { "distill_loss": 0.35709503293037415, "epoch": 6.897931954636424, "step": 20680 }, { "epoch": 6.897931954636424, "ref_ce_loss": 0.06743654608726501, "step": 20680 }, { "epoch": 6.897931954636424, "loss": 0.8078065514564514, "step": 20680 }, { "ce_loss": 0.02854505181312561, "epoch": 6.897931954636424, "step": 20680 }, { "distill_loss": 0.5855287313461304, "epoch": 6.897931954636424, "step": 20680 }, { "epoch": 6.897931954636424, "ref_ce_loss": 0.042745549231767654, "step": 20680 }, { "epoch": 6.901267511674449, "loss": 0.6226, "step": 20690 }, { "epoch": 6.901267511674449, "grad_norm": 17.90632438659668, "step": 20690 }, { "epoch": 6.901267511674449, "learning_rate": 1.4597095533440013e-05, "step": 20690 }, { "epoch": 6.901267511674449, "loss": 0.3882908225059509, "step": 20690 }, { "ce_loss": 0.06582961976528168, "epoch": 6.901267511674449, "step": 20690 }, { "distill_loss": 0.23433640599250793, "epoch": 6.901267511674449, "step": 20690 }, { "epoch": 6.901267511674449, "ref_ce_loss": 0.06076068431138992, "step": 20690 }, { "epoch": 6.901267511674449, "loss": 0.8035255670547485, "step": 20690 }, { "ce_loss": 0.02489415742456913, "epoch": 6.901267511674449, "step": 20690 }, { "distill_loss": 0.3288425803184509, "epoch": 6.901267511674449, "step": 20690 }, { "epoch": 6.901267511674449, "ref_ce_loss": 0.07385779172182083, "step": 20690 }, { "epoch": 6.901267511674449, "loss": 0.3329160809516907, "step": 20690 }, { "ce_loss": 0.03135690838098526, "epoch": 6.901267511674449, "step": 20690 }, { "distill_loss": 0.2420273721218109, "epoch": 6.901267511674449, "step": 20690 }, { "epoch": 6.901267511674449, "ref_ce_loss": 0.03833601996302605, "step": 20690 }, { "epoch": 6.901267511674449, "loss": 0.4151855409145355, "step": 20690 }, { "ce_loss": 0.02253338135778904, "epoch": 6.901267511674449, "step": 20690 }, { "distill_loss": 0.32237106561660767, "epoch": 6.901267511674449, "step": 20690 }, { "epoch": 6.901267511674449, "ref_ce_loss": 0.04350052401423454, "step": 20690 }, { "epoch": 6.904603068712475, "loss": 0.531, "step": 20700 }, { "epoch": 6.904603068712475, "grad_norm": 7.521162986755371, "step": 20700 }, { "epoch": 6.904603068712475, "learning_rate": 1.4510056876969267e-05, "step": 20700 }, { "epoch": 6.904603068712475, "loss": 0.8306207060813904, "step": 20700 }, { "ce_loss": 0.07492867112159729, "epoch": 6.904603068712475, "step": 20700 }, { "distill_loss": 0.6766117215156555, "epoch": 6.904603068712475, "step": 20700 }, { "epoch": 6.904603068712475, "ref_ce_loss": 0.07895965129137039, "step": 20700 }, { "epoch": 6.904603068712475, "loss": 0.5224670767784119, "step": 20700 }, { "ce_loss": 0.08923482149839401, "epoch": 6.904603068712475, "step": 20700 }, { "distill_loss": 0.33293861150741577, "epoch": 6.904603068712475, "step": 20700 }, { "epoch": 6.904603068712475, "ref_ce_loss": 0.09829211235046387, "step": 20700 }, { "epoch": 6.904603068712475, "loss": 0.4990532696247101, "step": 20700 }, { "ce_loss": 0.023325325921177864, "epoch": 6.904603068712475, "step": 20700 }, { "distill_loss": 0.37632182240486145, "epoch": 6.904603068712475, "step": 20700 }, { "epoch": 6.904603068712475, "ref_ce_loss": 0.06445571035146713, "step": 20700 }, { "epoch": 6.904603068712475, "loss": 0.4500182271003723, "step": 20700 }, { "ce_loss": 0.05171177163720131, "epoch": 6.904603068712475, "step": 20700 }, { "distill_loss": 0.24152307212352753, "epoch": 6.904603068712475, "step": 20700 }, { "epoch": 6.904603068712475, "ref_ce_loss": 0.06856939196586609, "step": 20700 }, { "epoch": 6.9079386257505, "loss": 0.5049, "step": 20710 }, { "epoch": 6.9079386257505, "grad_norm": 8.229083061218262, "step": 20710 }, { "epoch": 6.9079386257505, "learning_rate": 1.442326530070838e-05, "step": 20710 }, { "epoch": 6.9079386257505, "loss": 0.30293604731559753, "step": 20710 }, { "ce_loss": 0.024854473769664764, "epoch": 6.9079386257505, "step": 20710 }, { "distill_loss": 0.16231058537960052, "epoch": 6.9079386257505, "step": 20710 }, { "epoch": 6.9079386257505, "ref_ce_loss": 0.0645240843296051, "step": 20710 }, { "epoch": 6.9079386257505, "loss": 0.8261449933052063, "step": 20710 }, { "ce_loss": 0.08644302934408188, "epoch": 6.9079386257505, "step": 20710 }, { "distill_loss": 0.5393825769424438, "epoch": 6.9079386257505, "step": 20710 }, { "epoch": 6.9079386257505, "ref_ce_loss": 0.10075093805789948, "step": 20710 }, { "epoch": 6.9079386257505, "loss": 0.4293292164802551, "step": 20710 }, { "ce_loss": 0.033315710723400116, "epoch": 6.9079386257505, "step": 20710 }, { "distill_loss": 0.2953912317752838, "epoch": 6.9079386257505, "step": 20710 }, { "epoch": 6.9079386257505, "ref_ce_loss": 0.04508380591869354, "step": 20710 }, { "epoch": 6.9079386257505, "loss": 0.3437563478946686, "step": 20710 }, { "ce_loss": 0.06788795441389084, "epoch": 6.9079386257505, "step": 20710 }, { "distill_loss": 0.17214861512184143, "epoch": 6.9079386257505, "step": 20710 }, { "epoch": 6.9079386257505, "ref_ce_loss": 0.08690239489078522, "step": 20710 }, { "epoch": 6.911274182788525, "loss": 0.4995, "step": 20720 }, { "epoch": 6.911274182788525, "grad_norm": 8.721784591674805, "step": 20720 }, { "epoch": 6.911274182788525, "learning_rate": 1.4336720962930898e-05, "step": 20720 }, { "epoch": 6.911274182788525, "loss": 0.6756659746170044, "step": 20720 }, { "ce_loss": 0.008523699827492237, "epoch": 6.911274182788525, "step": 20720 }, { "distill_loss": 0.25348472595214844, "epoch": 6.911274182788525, "step": 20720 }, { "epoch": 6.911274182788525, "ref_ce_loss": 0.05715152993798256, "step": 20720 }, { "epoch": 6.911274182788525, "loss": 0.7252782583236694, "step": 20720 }, { "ce_loss": 0.03596337139606476, "epoch": 6.911274182788525, "step": 20720 }, { "distill_loss": 0.41875484585762024, "epoch": 6.911274182788525, "step": 20720 }, { "epoch": 6.911274182788525, "ref_ce_loss": 0.08006498217582703, "step": 20720 }, { "epoch": 6.911274182788525, "loss": 0.4181911051273346, "step": 20720 }, { "ce_loss": 0.009214845485985279, "epoch": 6.911274182788525, "step": 20720 }, { "distill_loss": 0.3338942229747772, "epoch": 6.911274182788525, "step": 20720 }, { "epoch": 6.911274182788525, "ref_ce_loss": 0.07484129816293716, "step": 20720 }, { "epoch": 6.911274182788525, "loss": 0.3620077669620514, "step": 20720 }, { "ce_loss": 0.03972398489713669, "epoch": 6.911274182788525, "step": 20720 }, { "distill_loss": 0.2782702147960663, "epoch": 6.911274182788525, "step": 20720 }, { "epoch": 6.911274182788525, "ref_ce_loss": 0.043749336153268814, "step": 20720 }, { "epoch": 6.914609739826551, "loss": 0.439, "step": 20730 }, { "epoch": 6.914609739826551, "grad_norm": 5.441989898681641, "step": 20730 }, { "epoch": 6.914609739826551, "learning_rate": 1.4250424021459555e-05, "step": 20730 }, { "epoch": 6.914609739826551, "loss": 0.30001750588417053, "step": 20730 }, { "ce_loss": 0.03059590421617031, "epoch": 6.914609739826551, "step": 20730 }, { "distill_loss": 0.21454161405563354, "epoch": 6.914609739826551, "step": 20730 }, { "epoch": 6.914609739826551, "ref_ce_loss": 0.05449133366346359, "step": 20730 }, { "epoch": 6.914609739826551, "loss": 0.5982506275177002, "step": 20730 }, { "ce_loss": 0.03713531047105789, "epoch": 6.914609739826551, "step": 20730 }, { "distill_loss": 0.4382067620754242, "epoch": 6.914609739826551, "step": 20730 }, { "epoch": 6.914609739826551, "ref_ce_loss": 0.09281221032142639, "step": 20730 }, { "epoch": 6.914609739826551, "loss": 0.4225163459777832, "step": 20730 }, { "ce_loss": 0.08537886291742325, "epoch": 6.914609739826551, "step": 20730 }, { "distill_loss": 0.18929584324359894, "epoch": 6.914609739826551, "step": 20730 }, { "epoch": 6.914609739826551, "ref_ce_loss": 0.05052082613110542, "step": 20730 }, { "epoch": 6.914609739826551, "loss": 0.33002710342407227, "step": 20730 }, { "ce_loss": 0.037739742547273636, "epoch": 6.914609739826551, "step": 20730 }, { "distill_loss": 0.21521805226802826, "epoch": 6.914609739826551, "step": 20730 }, { "epoch": 6.914609739826551, "ref_ce_loss": 0.07679290324449539, "step": 20730 }, { "epoch": 6.917945296864576, "loss": 0.4035, "step": 20740 }, { "epoch": 6.917945296864576, "grad_norm": 4.653440952301025, "step": 20740 }, { "epoch": 6.917945296864576, "learning_rate": 1.4164374633666003e-05, "step": 20740 }, { "epoch": 6.917945296864576, "loss": 0.5228537917137146, "step": 20740 }, { "ce_loss": 0.03656856343150139, "epoch": 6.917945296864576, "step": 20740 }, { "distill_loss": 0.4247232675552368, "epoch": 6.917945296864576, "step": 20740 }, { "epoch": 6.917945296864576, "ref_ce_loss": 0.06130426377058029, "step": 20740 }, { "epoch": 6.917945296864576, "loss": 0.3749106526374817, "step": 20740 }, { "ce_loss": 0.02962685376405716, "epoch": 6.917945296864576, "step": 20740 }, { "distill_loss": 0.27801570296287537, "epoch": 6.917945296864576, "step": 20740 }, { "epoch": 6.917945296864576, "ref_ce_loss": 0.0668526366353035, "step": 20740 }, { "epoch": 6.917945296864576, "loss": 0.17048296332359314, "step": 20740 }, { "ce_loss": 0.01074532326310873, "epoch": 6.917945296864576, "step": 20740 }, { "distill_loss": 0.12260544300079346, "epoch": 6.917945296864576, "step": 20740 }, { "epoch": 6.917945296864576, "ref_ce_loss": 0.03705614432692528, "step": 20740 }, { "epoch": 6.917945296864576, "loss": 0.19853581488132477, "step": 20740 }, { "ce_loss": 0.005890983156859875, "epoch": 6.917945296864576, "step": 20740 }, { "distill_loss": 0.1258002668619156, "epoch": 6.917945296864576, "step": 20740 }, { "epoch": 6.917945296864576, "ref_ce_loss": 0.04815744236111641, "step": 20740 }, { "epoch": 6.921280853902601, "loss": 0.4412, "step": 20750 }, { "epoch": 6.921280853902601, "grad_norm": 5.386718273162842, "step": 20750 }, { "epoch": 6.921280853902601, "learning_rate": 1.4078572956470335e-05, "step": 20750 }, { "epoch": 6.921280853902601, "loss": 0.5025739669799805, "step": 20750 }, { "ce_loss": 0.07032178342342377, "epoch": 6.921280853902601, "step": 20750 }, { "distill_loss": 0.2361125499010086, "epoch": 6.921280853902601, "step": 20750 }, { "epoch": 6.921280853902601, "ref_ce_loss": 0.04584193974733353, "step": 20750 }, { "epoch": 6.921280853902601, "loss": 0.515607476234436, "step": 20750 }, { "ce_loss": 0.0668293833732605, "epoch": 6.921280853902601, "step": 20750 }, { "distill_loss": 0.2832540273666382, "epoch": 6.921280853902601, "step": 20750 }, { "epoch": 6.921280853902601, "ref_ce_loss": 0.07667690515518188, "step": 20750 }, { "epoch": 6.921280853902601, "loss": 0.27646130323410034, "step": 20750 }, { "ce_loss": 0.009194553829729557, "epoch": 6.921280853902601, "step": 20750 }, { "distill_loss": 0.16538158059120178, "epoch": 6.921280853902601, "step": 20750 }, { "epoch": 6.921280853902601, "ref_ce_loss": 0.08040463179349899, "step": 20750 }, { "epoch": 6.921280853902601, "loss": 0.20386376976966858, "step": 20750 }, { "ce_loss": 0.013103666715323925, "epoch": 6.921280853902601, "step": 20750 }, { "distill_loss": 0.14098665118217468, "epoch": 6.921280853902601, "step": 20750 }, { "epoch": 6.921280853902601, "ref_ce_loss": 0.04962094500660896, "step": 20750 }, { "epoch": 6.924616410940627, "loss": 0.4026, "step": 20760 }, { "epoch": 6.924616410940627, "grad_norm": 6.593588352203369, "step": 20760 }, { "epoch": 6.924616410940627, "learning_rate": 1.3993019146340973e-05, "step": 20760 }, { "epoch": 6.924616410940627, "loss": 0.40435919165611267, "step": 20760 }, { "ce_loss": 0.06731601804494858, "epoch": 6.924616410940627, "step": 20760 }, { "distill_loss": 0.20835278928279877, "epoch": 6.924616410940627, "step": 20760 }, { "epoch": 6.924616410940627, "ref_ce_loss": 0.06101388484239578, "step": 20760 }, { "epoch": 6.924616410940627, "loss": 0.6540699005126953, "step": 20760 }, { "ce_loss": 0.05264006927609444, "epoch": 6.924616410940627, "step": 20760 }, { "distill_loss": 0.29627346992492676, "epoch": 6.924616410940627, "step": 20760 }, { "epoch": 6.924616410940627, "ref_ce_loss": 0.0648992583155632, "step": 20760 }, { "epoch": 6.924616410940627, "loss": 0.49436137080192566, "step": 20760 }, { "ce_loss": 0.0642567053437233, "epoch": 6.924616410940627, "step": 20760 }, { "distill_loss": 0.18897266685962677, "epoch": 6.924616410940627, "step": 20760 }, { "epoch": 6.924616410940627, "ref_ce_loss": 0.04494559019804001, "step": 20760 }, { "epoch": 6.924616410940627, "loss": 0.2949737012386322, "step": 20760 }, { "ce_loss": 0.025369055569171906, "epoch": 6.924616410940627, "step": 20760 }, { "distill_loss": 0.23227348923683167, "epoch": 6.924616410940627, "step": 20760 }, { "epoch": 6.924616410940627, "ref_ce_loss": 0.03722801432013512, "step": 20760 }, { "epoch": 6.927951967978652, "loss": 0.4255, "step": 20770 }, { "epoch": 6.927951967978652, "grad_norm": 3.854159116744995, "step": 20770 }, { "epoch": 6.927951967978652, "learning_rate": 1.3907713359294298e-05, "step": 20770 }, { "epoch": 6.927951967978652, "loss": 0.4830353260040283, "step": 20770 }, { "ce_loss": 0.046318408101797104, "epoch": 6.927951967978652, "step": 20770 }, { "distill_loss": 0.37137317657470703, "epoch": 6.927951967978652, "step": 20770 }, { "epoch": 6.927951967978652, "ref_ce_loss": 0.031674664467573166, "step": 20770 }, { "epoch": 6.927951967978652, "loss": 0.44186148047447205, "step": 20770 }, { "ce_loss": 0.08817555755376816, "epoch": 6.927951967978652, "step": 20770 }, { "distill_loss": 0.2676374316215515, "epoch": 6.927951967978652, "step": 20770 }, { "epoch": 6.927951967978652, "ref_ce_loss": 0.06345969438552856, "step": 20770 }, { "epoch": 6.927951967978652, "loss": 0.44447100162506104, "step": 20770 }, { "ce_loss": 0.02692546881735325, "epoch": 6.927951967978652, "step": 20770 }, { "distill_loss": 0.32898080348968506, "epoch": 6.927951967978652, "step": 20770 }, { "epoch": 6.927951967978652, "ref_ce_loss": 0.05204636603593826, "step": 20770 }, { "epoch": 6.927951967978652, "loss": 0.35606223344802856, "step": 20770 }, { "ce_loss": 0.020126059651374817, "epoch": 6.927951967978652, "step": 20770 }, { "distill_loss": 0.3049810826778412, "epoch": 6.927951967978652, "step": 20770 }, { "epoch": 6.927951967978652, "ref_ce_loss": 0.030778730288147926, "step": 20770 }, { "epoch": 6.931287525016677, "loss": 0.457, "step": 20780 }, { "epoch": 6.931287525016677, "grad_norm": 4.456582546234131, "step": 20780 }, { "epoch": 6.931287525016677, "learning_rate": 1.3822655750894424e-05, "step": 20780 }, { "epoch": 6.931287525016677, "loss": 0.44263705611228943, "step": 20780 }, { "ce_loss": 0.08875694870948792, "epoch": 6.931287525016677, "step": 20780 }, { "distill_loss": 0.23786699771881104, "epoch": 6.931287525016677, "step": 20780 }, { "epoch": 6.931287525016677, "ref_ce_loss": 0.04129364714026451, "step": 20780 }, { "epoch": 6.931287525016677, "loss": 0.6732338666915894, "step": 20780 }, { "ce_loss": 0.05769633874297142, "epoch": 6.931287525016677, "step": 20780 }, { "distill_loss": 0.277824729681015, "epoch": 6.931287525016677, "step": 20780 }, { "epoch": 6.931287525016677, "ref_ce_loss": 0.07031594961881638, "step": 20780 }, { "epoch": 6.931287525016677, "loss": 0.27141398191452026, "step": 20780 }, { "ce_loss": 0.013270096853375435, "epoch": 6.931287525016677, "step": 20780 }, { "distill_loss": 0.15588274598121643, "epoch": 6.931287525016677, "step": 20780 }, { "epoch": 6.931287525016677, "ref_ce_loss": 0.05156317353248596, "step": 20780 }, { "epoch": 6.931287525016677, "loss": 0.45670467615127563, "step": 20780 }, { "ce_loss": 0.07102420181035995, "epoch": 6.931287525016677, "step": 20780 }, { "distill_loss": 0.18294796347618103, "epoch": 6.931287525016677, "step": 20780 }, { "epoch": 6.931287525016677, "ref_ce_loss": 0.07500330358743668, "step": 20780 }, { "epoch": 6.934623082054703, "loss": 0.4402, "step": 20790 }, { "epoch": 6.934623082054703, "grad_norm": 7.2800397872924805, "step": 20790 }, { "epoch": 6.934623082054703, "learning_rate": 1.3737846476252889e-05, "step": 20790 }, { "epoch": 6.934623082054703, "loss": 0.28252097964286804, "step": 20790 }, { "ce_loss": 0.03667629882693291, "epoch": 6.934623082054703, "step": 20790 }, { "distill_loss": 0.16667498648166656, "epoch": 6.934623082054703, "step": 20790 }, { "epoch": 6.934623082054703, "ref_ce_loss": 0.05718651041388512, "step": 20790 }, { "epoch": 6.934623082054703, "loss": 0.2800046503543854, "step": 20790 }, { "ce_loss": 0.029736505821347237, "epoch": 6.934623082054703, "step": 20790 }, { "distill_loss": 0.11351748555898666, "epoch": 6.934623082054703, "step": 20790 }, { "epoch": 6.934623082054703, "ref_ce_loss": 0.039131175726652145, "step": 20790 }, { "epoch": 6.934623082054703, "loss": 0.5445693731307983, "step": 20790 }, { "ce_loss": 0.04326792433857918, "epoch": 6.934623082054703, "step": 20790 }, { "distill_loss": 0.37276849150657654, "epoch": 6.934623082054703, "step": 20790 }, { "epoch": 6.934623082054703, "ref_ce_loss": 0.043659619987010956, "step": 20790 }, { "epoch": 6.934623082054703, "loss": 0.3070202171802521, "step": 20790 }, { "ce_loss": 0.022522946819663048, "epoch": 6.934623082054703, "step": 20790 }, { "distill_loss": 0.18984946608543396, "epoch": 6.934623082054703, "step": 20790 }, { "epoch": 6.934623082054703, "ref_ce_loss": 0.07139455527067184, "step": 20790 }, { "epoch": 6.937958639092728, "loss": 0.4202, "step": 20800 }, { "epoch": 6.937958639092728, "grad_norm": 4.608834743499756, "step": 20800 }, { "epoch": 6.937958639092728, "learning_rate": 1.3653285690028349e-05, "step": 20800 }, { "epoch": 6.937958639092728, "loss": 0.6431030631065369, "step": 20800 }, { "ce_loss": 0.09889832884073257, "epoch": 6.937958639092728, "step": 20800 }, { "distill_loss": 0.41358011960983276, "epoch": 6.937958639092728, "step": 20800 }, { "epoch": 6.937958639092728, "ref_ce_loss": 0.09461617469787598, "step": 20800 }, { "epoch": 6.937958639092728, "loss": 0.3524114787578583, "step": 20800 }, { "ce_loss": 0.036635737866163254, "epoch": 6.937958639092728, "step": 20800 }, { "distill_loss": 0.20360025763511658, "epoch": 6.937958639092728, "step": 20800 }, { "epoch": 6.937958639092728, "ref_ce_loss": 0.0815434455871582, "step": 20800 }, { "epoch": 6.937958639092728, "loss": 0.2596745193004608, "step": 20800 }, { "ce_loss": 0.04510733485221863, "epoch": 6.937958639092728, "step": 20800 }, { "distill_loss": 0.17303520441055298, "epoch": 6.937958639092728, "step": 20800 }, { "epoch": 6.937958639092728, "ref_ce_loss": 0.04140667989850044, "step": 20800 }, { "epoch": 6.937958639092728, "loss": 0.399715393781662, "step": 20800 }, { "ce_loss": 0.035806529223918915, "epoch": 6.937958639092728, "step": 20800 }, { "distill_loss": 0.32043132185935974, "epoch": 6.937958639092728, "step": 20800 }, { "epoch": 6.937958639092728, "ref_ce_loss": 0.04334215074777603, "step": 20800 }, { "epoch": 6.9412941961307535, "loss": 0.4666, "step": 20810 }, { "epoch": 6.9412941961307535, "grad_norm": 6.359853267669678, "step": 20810 }, { "epoch": 6.9412941961307535, "learning_rate": 1.3568973546426332e-05, "step": 20810 }, { "epoch": 6.9412941961307535, "loss": 0.9467858076095581, "step": 20810 }, { "ce_loss": 0.06935672461986542, "epoch": 6.9412941961307535, "step": 20810 }, { "distill_loss": 0.4101283848285675, "epoch": 6.9412941961307535, "step": 20810 }, { "epoch": 6.9412941961307535, "ref_ce_loss": 0.06270965188741684, "step": 20810 }, { "epoch": 6.9412941961307535, "loss": 0.2544810473918915, "step": 20810 }, { "ce_loss": 0.015241993591189384, "epoch": 6.9412941961307535, "step": 20810 }, { "distill_loss": 0.14630302786827087, "epoch": 6.9412941961307535, "step": 20810 }, { "epoch": 6.9412941961307535, "ref_ce_loss": 0.046901024878025055, "step": 20810 }, { "epoch": 6.9412941961307535, "loss": 0.3560320734977722, "step": 20810 }, { "ce_loss": 0.00998441968113184, "epoch": 6.9412941961307535, "step": 20810 }, { "distill_loss": 0.2614750266075134, "epoch": 6.9412941961307535, "step": 20810 }, { "epoch": 6.9412941961307535, "ref_ce_loss": 0.028601357713341713, "step": 20810 }, { "epoch": 6.9412941961307535, "loss": 0.5675954818725586, "step": 20810 }, { "ce_loss": 0.027431802824139595, "epoch": 6.9412941961307535, "step": 20810 }, { "distill_loss": 0.36020371317863464, "epoch": 6.9412941961307535, "step": 20810 }, { "epoch": 6.9412941961307535, "ref_ce_loss": 0.06007641553878784, "step": 20810 }, { "epoch": 6.944629753168779, "loss": 0.449, "step": 20820 }, { "epoch": 6.944629753168779, "grad_norm": 4.561250686645508, "step": 20820 }, { "epoch": 6.944629753168779, "learning_rate": 1.34849101991989e-05, "step": 20820 }, { "epoch": 6.944629753168779, "loss": 0.4040166139602661, "step": 20820 }, { "ce_loss": 0.025941571220755577, "epoch": 6.944629753168779, "step": 20820 }, { "distill_loss": 0.20139163732528687, "epoch": 6.944629753168779, "step": 20820 }, { "epoch": 6.944629753168779, "ref_ce_loss": 0.0640379786491394, "step": 20820 }, { "epoch": 6.944629753168779, "loss": 0.30845510959625244, "step": 20820 }, { "ce_loss": 0.04371127858757973, "epoch": 6.944629753168779, "step": 20820 }, { "distill_loss": 0.204574316740036, "epoch": 6.944629753168779, "step": 20820 }, { "epoch": 6.944629753168779, "ref_ce_loss": 0.04178668558597565, "step": 20820 }, { "epoch": 6.944629753168779, "loss": 0.39555543661117554, "step": 20820 }, { "ce_loss": 0.03262713551521301, "epoch": 6.944629753168779, "step": 20820 }, { "distill_loss": 0.2215277999639511, "epoch": 6.944629753168779, "step": 20820 }, { "epoch": 6.944629753168779, "ref_ce_loss": 0.06563069671392441, "step": 20820 }, { "epoch": 6.944629753168779, "loss": 0.45052284002304077, "step": 20820 }, { "ce_loss": 0.11583734303712845, "epoch": 6.944629753168779, "step": 20820 }, { "distill_loss": 0.2701414227485657, "epoch": 6.944629753168779, "step": 20820 }, { "epoch": 6.944629753168779, "ref_ce_loss": 0.06444041430950165, "step": 20820 }, { "epoch": 6.947965310206804, "loss": 0.4061, "step": 20830 }, { "epoch": 6.947965310206804, "grad_norm": 4.293476581573486, "step": 20830 }, { "epoch": 6.947965310206804, "learning_rate": 1.3401095801644462e-05, "step": 20830 }, { "epoch": 6.947965310206804, "loss": 0.2711658775806427, "step": 20830 }, { "ce_loss": 0.06868616491556168, "epoch": 6.947965310206804, "step": 20830 }, { "distill_loss": 0.13750752806663513, "epoch": 6.947965310206804, "step": 20830 }, { "epoch": 6.947965310206804, "ref_ce_loss": 0.06487326323986053, "step": 20830 }, { "epoch": 6.947965310206804, "loss": 0.34549322724342346, "step": 20830 }, { "ce_loss": 0.016219772398471832, "epoch": 6.947965310206804, "step": 20830 }, { "distill_loss": 0.2700633108615875, "epoch": 6.947965310206804, "step": 20830 }, { "epoch": 6.947965310206804, "ref_ce_loss": 0.059070926159620285, "step": 20830 }, { "epoch": 6.947965310206804, "loss": 0.5564180612564087, "step": 20830 }, { "ce_loss": 0.024273596704006195, "epoch": 6.947965310206804, "step": 20830 }, { "distill_loss": 0.3557685315608978, "epoch": 6.947965310206804, "step": 20830 }, { "epoch": 6.947965310206804, "ref_ce_loss": 0.07073020935058594, "step": 20830 }, { "epoch": 6.947965310206804, "loss": 0.3989897072315216, "step": 20830 }, { "ce_loss": 0.06166957691311836, "epoch": 6.947965310206804, "step": 20830 }, { "distill_loss": 0.1681160181760788, "epoch": 6.947965310206804, "step": 20830 }, { "epoch": 6.947965310206804, "ref_ce_loss": 0.04308732599020004, "step": 20830 }, { "epoch": 6.9513008672448295, "loss": 0.4508, "step": 20840 }, { "epoch": 6.9513008672448295, "grad_norm": 4.196435451507568, "step": 20840 }, { "epoch": 6.9513008672448295, "learning_rate": 1.3317530506607405e-05, "step": 20840 }, { "epoch": 6.9513008672448295, "loss": 0.30096614360809326, "step": 20840 }, { "ce_loss": 0.03617022559046745, "epoch": 6.9513008672448295, "step": 20840 }, { "distill_loss": 0.2046215534210205, "epoch": 6.9513008672448295, "step": 20840 }, { "epoch": 6.9513008672448295, "ref_ce_loss": 0.0513840913772583, "step": 20840 }, { "epoch": 6.9513008672448295, "loss": 0.4168870747089386, "step": 20840 }, { "ce_loss": 0.02515067718923092, "epoch": 6.9513008672448295, "step": 20840 }, { "distill_loss": 0.3294127583503723, "epoch": 6.9513008672448295, "step": 20840 }, { "epoch": 6.9513008672448295, "ref_ce_loss": 0.05164066329598427, "step": 20840 }, { "epoch": 6.9513008672448295, "loss": 0.3690844774246216, "step": 20840 }, { "ce_loss": 0.012738215737044811, "epoch": 6.9513008672448295, "step": 20840 }, { "distill_loss": 0.25531935691833496, "epoch": 6.9513008672448295, "step": 20840 }, { "epoch": 6.9513008672448295, "ref_ce_loss": 0.06127219274640083, "step": 20840 }, { "epoch": 6.9513008672448295, "loss": 0.563440203666687, "step": 20840 }, { "ce_loss": 0.023211704567074776, "epoch": 6.9513008672448295, "step": 20840 }, { "distill_loss": 0.3891507387161255, "epoch": 6.9513008672448295, "step": 20840 }, { "epoch": 6.9513008672448295, "ref_ce_loss": 0.06096247583627701, "step": 20840 }, { "epoch": 6.954636424282855, "loss": 0.4044, "step": 20850 }, { "epoch": 6.954636424282855, "grad_norm": 4.023589134216309, "step": 20850 }, { "epoch": 6.954636424282855, "learning_rate": 1.3234214466477877e-05, "step": 20850 }, { "epoch": 6.954636424282855, "loss": 0.28012433648109436, "step": 20850 }, { "ce_loss": 0.03320290520787239, "epoch": 6.954636424282855, "step": 20850 }, { "distill_loss": 0.19837743043899536, "epoch": 6.954636424282855, "step": 20850 }, { "epoch": 6.954636424282855, "ref_ce_loss": 0.037537477910518646, "step": 20850 }, { "epoch": 6.954636424282855, "loss": 0.37420663237571716, "step": 20850 }, { "ce_loss": 0.05524859204888344, "epoch": 6.954636424282855, "step": 20850 }, { "distill_loss": 0.2331634759902954, "epoch": 6.954636424282855, "step": 20850 }, { "epoch": 6.954636424282855, "ref_ce_loss": 0.04749821871519089, "step": 20850 }, { "epoch": 6.954636424282855, "loss": 0.46007564663887024, "step": 20850 }, { "ce_loss": 0.060388337820768356, "epoch": 6.954636424282855, "step": 20850 }, { "distill_loss": 0.3201514184474945, "epoch": 6.954636424282855, "step": 20850 }, { "epoch": 6.954636424282855, "ref_ce_loss": 0.05457288771867752, "step": 20850 }, { "epoch": 6.954636424282855, "loss": 0.2581188380718231, "step": 20850 }, { "ce_loss": 0.03130757808685303, "epoch": 6.954636424282855, "step": 20850 }, { "distill_loss": 0.162839874625206, "epoch": 6.954636424282855, "step": 20850 }, { "epoch": 6.954636424282855, "ref_ce_loss": 0.05107346549630165, "step": 20850 }, { "epoch": 6.95797198132088, "loss": 0.399, "step": 20860 }, { "epoch": 6.95797198132088, "grad_norm": 5.500354766845703, "step": 20860 }, { "epoch": 6.95797198132088, "learning_rate": 1.315114783319146e-05, "step": 20860 }, { "epoch": 6.95797198132088, "loss": 0.8441788554191589, "step": 20860 }, { "ce_loss": 0.12463463097810745, "epoch": 6.95797198132088, "step": 20860 }, { "distill_loss": 0.37730810046195984, "epoch": 6.95797198132088, "step": 20860 }, { "epoch": 6.95797198132088, "ref_ce_loss": 0.08145100623369217, "step": 20860 }, { "epoch": 6.95797198132088, "loss": 0.7428448796272278, "step": 20860 }, { "ce_loss": 0.0943167582154274, "epoch": 6.95797198132088, "step": 20860 }, { "distill_loss": 0.20089185237884521, "epoch": 6.95797198132088, "step": 20860 }, { "epoch": 6.95797198132088, "ref_ce_loss": 0.11132627725601196, "step": 20860 }, { "epoch": 6.95797198132088, "loss": 0.43067431449890137, "step": 20860 }, { "ce_loss": 0.04894329980015755, "epoch": 6.95797198132088, "step": 20860 }, { "distill_loss": 0.3266683518886566, "epoch": 6.95797198132088, "step": 20860 }, { "epoch": 6.95797198132088, "ref_ce_loss": 0.05494893714785576, "step": 20860 }, { "epoch": 6.95797198132088, "loss": 0.4565647840499878, "step": 20860 }, { "ce_loss": 0.06725724786520004, "epoch": 6.95797198132088, "step": 20860 }, { "distill_loss": 0.2948342561721802, "epoch": 6.95797198132088, "step": 20860 }, { "epoch": 6.95797198132088, "ref_ce_loss": 0.06600886583328247, "step": 20860 }, { "epoch": 6.961307538358906, "loss": 0.4097, "step": 20870 }, { "epoch": 6.961307538358906, "grad_norm": 5.8303399085998535, "step": 20870 }, { "epoch": 6.961307538358906, "learning_rate": 1.3068330758228951e-05, "step": 20870 }, { "epoch": 6.961307538358906, "loss": 0.498879075050354, "step": 20870 }, { "ce_loss": 0.044609811156988144, "epoch": 6.961307538358906, "step": 20870 }, { "distill_loss": 0.21364165842533112, "epoch": 6.961307538358906, "step": 20870 }, { "epoch": 6.961307538358906, "ref_ce_loss": 0.07443445175886154, "step": 20870 }, { "epoch": 6.961307538358906, "loss": 0.6392194628715515, "step": 20870 }, { "ce_loss": 0.0752585232257843, "epoch": 6.961307538358906, "step": 20870 }, { "distill_loss": 0.21459460258483887, "epoch": 6.961307538358906, "step": 20870 }, { "epoch": 6.961307538358906, "ref_ce_loss": 0.049674663692712784, "step": 20870 }, { "epoch": 6.961307538358906, "loss": 0.3434360921382904, "step": 20870 }, { "ce_loss": 0.036089323461055756, "epoch": 6.961307538358906, "step": 20870 }, { "distill_loss": 0.17388133704662323, "epoch": 6.961307538358906, "step": 20870 }, { "epoch": 6.961307538358906, "ref_ce_loss": 0.05977735295891762, "step": 20870 }, { "epoch": 6.961307538358906, "loss": 0.6956120133399963, "step": 20870 }, { "ce_loss": 0.08760103583335876, "epoch": 6.961307538358906, "step": 20870 }, { "distill_loss": 0.4955027997493744, "epoch": 6.961307538358906, "step": 20870 }, { "epoch": 6.961307538358906, "ref_ce_loss": 0.0775177851319313, "step": 20870 }, { "epoch": 6.964643095396931, "loss": 0.3955, "step": 20880 }, { "epoch": 6.964643095396931, "grad_norm": 4.173266887664795, "step": 20880 }, { "epoch": 6.964643095396931, "learning_rate": 1.2985763392615972e-05, "step": 20880 }, { "epoch": 6.964643095396931, "loss": 0.21807421743869781, "step": 20880 }, { "ce_loss": 0.03225923329591751, "epoch": 6.964643095396931, "step": 20880 }, { "distill_loss": 0.143633171916008, "epoch": 6.964643095396931, "step": 20880 }, { "epoch": 6.964643095396931, "ref_ce_loss": 0.042038701474666595, "step": 20880 }, { "epoch": 6.964643095396931, "loss": 0.7549881339073181, "step": 20880 }, { "ce_loss": 0.04640970006585121, "epoch": 6.964643095396931, "step": 20880 }, { "distill_loss": 0.24397540092468262, "epoch": 6.964643095396931, "step": 20880 }, { "epoch": 6.964643095396931, "ref_ce_loss": 0.04105795919895172, "step": 20880 }, { "epoch": 6.964643095396931, "loss": 0.28283512592315674, "step": 20880 }, { "ce_loss": 0.057034965604543686, "epoch": 6.964643095396931, "step": 20880 }, { "distill_loss": 0.1521199345588684, "epoch": 6.964643095396931, "step": 20880 }, { "epoch": 6.964643095396931, "ref_ce_loss": 0.048486873507499695, "step": 20880 }, { "epoch": 6.964643095396931, "loss": 0.46666327118873596, "step": 20880 }, { "ce_loss": 0.085448257625103, "epoch": 6.964643095396931, "step": 20880 }, { "distill_loss": 0.228860542178154, "epoch": 6.964643095396931, "step": 20880 }, { "epoch": 6.964643095396931, "ref_ce_loss": 0.06538163125514984, "step": 20880 }, { "epoch": 6.967978652434956, "loss": 0.3826, "step": 20890 }, { "epoch": 6.967978652434956, "grad_norm": 4.615096092224121, "step": 20890 }, { "epoch": 6.967978652434956, "learning_rate": 1.2903445886922863e-05, "step": 20890 }, { "epoch": 6.967978652434956, "loss": 0.47802993655204773, "step": 20890 }, { "ce_loss": 0.07998108118772507, "epoch": 6.967978652434956, "step": 20890 }, { "distill_loss": 0.18245376646518707, "epoch": 6.967978652434956, "step": 20890 }, { "epoch": 6.967978652434956, "ref_ce_loss": 0.09586384892463684, "step": 20890 }, { "epoch": 6.967978652434956, "loss": 0.2591511607170105, "step": 20890 }, { "ce_loss": 0.02041383646428585, "epoch": 6.967978652434956, "step": 20890 }, { "distill_loss": 0.18587136268615723, "epoch": 6.967978652434956, "step": 20890 }, { "epoch": 6.967978652434956, "ref_ce_loss": 0.05260073021054268, "step": 20890 }, { "epoch": 6.967978652434956, "loss": 0.27677682042121887, "step": 20890 }, { "ce_loss": 0.02615499682724476, "epoch": 6.967978652434956, "step": 20890 }, { "distill_loss": 0.14043205976486206, "epoch": 6.967978652434956, "step": 20890 }, { "epoch": 6.967978652434956, "ref_ce_loss": 0.046882420778274536, "step": 20890 }, { "epoch": 6.967978652434956, "loss": 0.3209933936595917, "step": 20890 }, { "ce_loss": 0.03233850374817848, "epoch": 6.967978652434956, "step": 20890 }, { "distill_loss": 0.14990727603435516, "epoch": 6.967978652434956, "step": 20890 }, { "epoch": 6.967978652434956, "ref_ce_loss": 0.052740324288606644, "step": 20890 }, { "epoch": 6.971314209472982, "loss": 0.4014, "step": 20900 }, { "epoch": 6.971314209472982, "grad_norm": 5.587108135223389, "step": 20900 }, { "epoch": 6.971314209472982, "learning_rate": 1.2821378391264282e-05, "step": 20900 }, { "epoch": 6.971314209472982, "loss": 0.3331134617328644, "step": 20900 }, { "ce_loss": 0.013039899058640003, "epoch": 6.971314209472982, "step": 20900 }, { "distill_loss": 0.23820891976356506, "epoch": 6.971314209472982, "step": 20900 }, { "epoch": 6.971314209472982, "ref_ce_loss": 0.05505063757300377, "step": 20900 }, { "epoch": 6.971314209472982, "loss": 0.31397545337677, "step": 20900 }, { "ce_loss": 0.026968948543071747, "epoch": 6.971314209472982, "step": 20900 }, { "distill_loss": 0.10655680298805237, "epoch": 6.971314209472982, "step": 20900 }, { "epoch": 6.971314209472982, "ref_ce_loss": 0.050659481436014175, "step": 20900 }, { "epoch": 6.971314209472982, "loss": 0.3516398072242737, "step": 20900 }, { "ce_loss": 0.018841532990336418, "epoch": 6.971314209472982, "step": 20900 }, { "distill_loss": 0.12434244155883789, "epoch": 6.971314209472982, "step": 20900 }, { "epoch": 6.971314209472982, "ref_ce_loss": 0.04103760048747063, "step": 20900 }, { "epoch": 6.971314209472982, "loss": 0.3670431971549988, "step": 20900 }, { "ce_loss": 0.01649991050362587, "epoch": 6.971314209472982, "step": 20900 }, { "distill_loss": 0.23683831095695496, "epoch": 6.971314209472982, "step": 20900 }, { "epoch": 6.971314209472982, "ref_ce_loss": 0.07243819534778595, "step": 20900 }, { "epoch": 6.974649766511007, "loss": 0.4017, "step": 20910 }, { "epoch": 6.974649766511007, "grad_norm": 3.5513551235198975, "step": 20910 }, { "epoch": 6.974649766511007, "learning_rate": 1.2739561055298975e-05, "step": 20910 }, { "epoch": 6.974649766511007, "loss": 0.4680349826812744, "step": 20910 }, { "ce_loss": 0.06049255281686783, "epoch": 6.974649766511007, "step": 20910 }, { "distill_loss": 0.2663050591945648, "epoch": 6.974649766511007, "step": 20910 }, { "epoch": 6.974649766511007, "ref_ce_loss": 0.0641944482922554, "step": 20910 }, { "epoch": 6.974649766511007, "loss": 0.6693120002746582, "step": 20910 }, { "ce_loss": 0.041746288537979126, "epoch": 6.974649766511007, "step": 20910 }, { "distill_loss": 0.524785578250885, "epoch": 6.974649766511007, "step": 20910 }, { "epoch": 6.974649766511007, "ref_ce_loss": 0.0707206279039383, "step": 20910 }, { "epoch": 6.974649766511007, "loss": 0.5915324687957764, "step": 20910 }, { "ce_loss": 0.045784156769514084, "epoch": 6.974649766511007, "step": 20910 }, { "distill_loss": 0.3815018832683563, "epoch": 6.974649766511007, "step": 20910 }, { "epoch": 6.974649766511007, "ref_ce_loss": 0.08063600212335587, "step": 20910 }, { "epoch": 6.974649766511007, "loss": 0.3671169877052307, "step": 20910 }, { "ce_loss": 0.026919331401586533, "epoch": 6.974649766511007, "step": 20910 }, { "distill_loss": 0.30458635091781616, "epoch": 6.974649766511007, "step": 20910 }, { "epoch": 6.974649766511007, "ref_ce_loss": 0.035399653017520905, "step": 20910 }, { "epoch": 6.977985323549032, "loss": 0.3908, "step": 20920 }, { "epoch": 6.977985323549032, "grad_norm": 3.7871408462524414, "step": 20920 }, { "epoch": 6.977985323549032, "learning_rate": 1.2657994028229496e-05, "step": 20920 }, { "epoch": 6.977985323549032, "loss": 0.522492527961731, "step": 20920 }, { "ce_loss": 0.03481413051486015, "epoch": 6.977985323549032, "step": 20920 }, { "distill_loss": 0.15558190643787384, "epoch": 6.977985323549032, "step": 20920 }, { "epoch": 6.977985323549032, "ref_ce_loss": 0.04664992541074753, "step": 20920 }, { "epoch": 6.977985323549032, "loss": 0.25579845905303955, "step": 20920 }, { "ce_loss": 0.03506955876946449, "epoch": 6.977985323549032, "step": 20920 }, { "distill_loss": 0.13314028084278107, "epoch": 6.977985323549032, "step": 20920 }, { "epoch": 6.977985323549032, "ref_ce_loss": 0.03844781219959259, "step": 20920 }, { "epoch": 6.977985323549032, "loss": 0.3240671455860138, "step": 20920 }, { "ce_loss": 0.03473073989152908, "epoch": 6.977985323549032, "step": 20920 }, { "distill_loss": 0.19969017803668976, "epoch": 6.977985323549032, "step": 20920 }, { "epoch": 6.977985323549032, "ref_ce_loss": 0.06953977793455124, "step": 20920 }, { "epoch": 6.977985323549032, "loss": 0.3197997808456421, "step": 20920 }, { "ce_loss": 0.01574970968067646, "epoch": 6.977985323549032, "step": 20920 }, { "distill_loss": 0.2247905135154724, "epoch": 6.977985323549032, "step": 20920 }, { "epoch": 6.977985323549032, "ref_ce_loss": 0.059356823563575745, "step": 20920 }, { "epoch": 6.981320880587058, "loss": 0.3878, "step": 20930 }, { "epoch": 6.981320880587058, "grad_norm": 5.743426322937012, "step": 20930 }, { "epoch": 6.981320880587058, "learning_rate": 1.2576677458801875e-05, "step": 20930 }, { "epoch": 6.981320880587058, "loss": 0.36012518405914307, "step": 20930 }, { "ce_loss": 0.051608890295028687, "epoch": 6.981320880587058, "step": 20930 }, { "distill_loss": 0.2507593631744385, "epoch": 6.981320880587058, "step": 20930 }, { "epoch": 6.981320880587058, "ref_ce_loss": 0.057602979242801666, "step": 20930 }, { "epoch": 6.981320880587058, "loss": 0.36070674657821655, "step": 20930 }, { "ce_loss": 0.010836776345968246, "epoch": 6.981320880587058, "step": 20930 }, { "distill_loss": 0.22331926226615906, "epoch": 6.981320880587058, "step": 20930 }, { "epoch": 6.981320880587058, "ref_ce_loss": 0.04523332417011261, "step": 20930 }, { "epoch": 6.981320880587058, "loss": 0.3094327449798584, "step": 20930 }, { "ce_loss": 0.0389716662466526, "epoch": 6.981320880587058, "step": 20930 }, { "distill_loss": 0.17145349085330963, "epoch": 6.981320880587058, "step": 20930 }, { "epoch": 6.981320880587058, "ref_ce_loss": 0.05656014755368233, "step": 20930 }, { "epoch": 6.981320880587058, "loss": 0.7913627028465271, "step": 20930 }, { "ce_loss": 0.0768100693821907, "epoch": 6.981320880587058, "step": 20930 }, { "distill_loss": 0.25836631655693054, "epoch": 6.981320880587058, "step": 20930 }, { "epoch": 6.981320880587058, "ref_ce_loss": 0.10429731756448746, "step": 20930 }, { "epoch": 6.984656437625083, "loss": 0.3749, "step": 20940 }, { "epoch": 6.984656437625083, "grad_norm": 3.8919103145599365, "step": 20940 }, { "epoch": 6.984656437625083, "learning_rate": 1.249561149530553e-05, "step": 20940 }, { "epoch": 6.984656437625083, "loss": 0.4512990415096283, "step": 20940 }, { "ce_loss": 0.010347792878746986, "epoch": 6.984656437625083, "step": 20940 }, { "distill_loss": 0.21304087340831757, "epoch": 6.984656437625083, "step": 20940 }, { "epoch": 6.984656437625083, "ref_ce_loss": 0.07783834636211395, "step": 20940 }, { "epoch": 6.984656437625083, "loss": 0.2394438087940216, "step": 20940 }, { "ce_loss": 0.020768312737345695, "epoch": 6.984656437625083, "step": 20940 }, { "distill_loss": 0.17492659389972687, "epoch": 6.984656437625083, "step": 20940 }, { "epoch": 6.984656437625083, "ref_ce_loss": 0.043548326939344406, "step": 20940 }, { "epoch": 6.984656437625083, "loss": 0.45974916219711304, "step": 20940 }, { "ce_loss": 0.05422638729214668, "epoch": 6.984656437625083, "step": 20940 }, { "distill_loss": 0.24236030876636505, "epoch": 6.984656437625083, "step": 20940 }, { "epoch": 6.984656437625083, "ref_ce_loss": 0.04444364830851555, "step": 20940 }, { "epoch": 6.984656437625083, "loss": 0.5540006756782532, "step": 20940 }, { "ce_loss": 0.09376584738492966, "epoch": 6.984656437625083, "step": 20940 }, { "distill_loss": 0.3020194470882416, "epoch": 6.984656437625083, "step": 20940 }, { "epoch": 6.984656437625083, "ref_ce_loss": 0.055338360369205475, "step": 20940 }, { "epoch": 6.987991994663108, "loss": 0.3807, "step": 20950 }, { "epoch": 6.987991994663108, "grad_norm": 3.6167783737182617, "step": 20950 }, { "epoch": 6.987991994663108, "learning_rate": 1.2414796285572704e-05, "step": 20950 }, { "epoch": 6.987991994663108, "loss": 0.32541486620903015, "step": 20950 }, { "ce_loss": 0.0197074506431818, "epoch": 6.987991994663108, "step": 20950 }, { "distill_loss": 0.17429247498512268, "epoch": 6.987991994663108, "step": 20950 }, { "epoch": 6.987991994663108, "ref_ce_loss": 0.05691966041922569, "step": 20950 }, { "epoch": 6.987991994663108, "loss": 0.4718301594257355, "step": 20950 }, { "ce_loss": 0.050296757370233536, "epoch": 6.987991994663108, "step": 20950 }, { "distill_loss": 0.3400103449821472, "epoch": 6.987991994663108, "step": 20950 }, { "epoch": 6.987991994663108, "ref_ce_loss": 0.08132241666316986, "step": 20950 }, { "epoch": 6.987991994663108, "loss": 0.4324634075164795, "step": 20950 }, { "ce_loss": 0.008074936456978321, "epoch": 6.987991994663108, "step": 20950 }, { "distill_loss": 0.24002067744731903, "epoch": 6.987991994663108, "step": 20950 }, { "epoch": 6.987991994663108, "ref_ce_loss": 0.05884374678134918, "step": 20950 }, { "epoch": 6.987991994663108, "loss": 0.42272958159446716, "step": 20950 }, { "ce_loss": 0.03418363630771637, "epoch": 6.987991994663108, "step": 20950 }, { "distill_loss": 0.13855905830860138, "epoch": 6.987991994663108, "step": 20950 }, { "epoch": 6.987991994663108, "ref_ce_loss": 0.04546462744474411, "step": 20950 }, { "epoch": 6.991327551701134, "loss": 0.3778, "step": 20960 }, { "epoch": 6.991327551701134, "grad_norm": 4.432559967041016, "step": 20960 }, { "epoch": 6.991327551701134, "learning_rate": 1.2334231976978543e-05, "step": 20960 }, { "epoch": 6.991327551701134, "loss": 0.45632582902908325, "step": 20960 }, { "ce_loss": 0.06276565790176392, "epoch": 6.991327551701134, "step": 20960 }, { "distill_loss": 0.1557421088218689, "epoch": 6.991327551701134, "step": 20960 }, { "epoch": 6.991327551701134, "ref_ce_loss": 0.08415982872247696, "step": 20960 }, { "epoch": 6.991327551701134, "loss": 0.3706178665161133, "step": 20960 }, { "ce_loss": 0.046508148312568665, "epoch": 6.991327551701134, "step": 20960 }, { "distill_loss": 0.1922816038131714, "epoch": 6.991327551701134, "step": 20960 }, { "epoch": 6.991327551701134, "ref_ce_loss": 0.059828322380781174, "step": 20960 }, { "epoch": 6.991327551701134, "loss": 0.2592690587043762, "step": 20960 }, { "ce_loss": 0.043664492666721344, "epoch": 6.991327551701134, "step": 20960 }, { "distill_loss": 0.16917598247528076, "epoch": 6.991327551701134, "step": 20960 }, { "epoch": 6.991327551701134, "ref_ce_loss": 0.044888705015182495, "step": 20960 }, { "epoch": 6.991327551701134, "loss": 0.3581051826477051, "step": 20960 }, { "ce_loss": 0.02837333455681801, "epoch": 6.991327551701134, "step": 20960 }, { "distill_loss": 0.11634084582328796, "epoch": 6.991327551701134, "step": 20960 }, { "epoch": 6.991327551701134, "ref_ce_loss": 0.04302706569433212, "step": 20960 }, { "epoch": 6.994663108739159, "loss": 0.404, "step": 20970 }, { "epoch": 6.994663108739159, "grad_norm": 3.7893307209014893, "step": 20970 }, { "epoch": 6.994663108739159, "learning_rate": 1.2253918716440574e-05, "step": 20970 }, { "epoch": 6.994663108739159, "loss": 0.3417932689189911, "step": 20970 }, { "ce_loss": 0.00941214244812727, "epoch": 6.994663108739159, "step": 20970 }, { "distill_loss": 0.16308821737766266, "epoch": 6.994663108739159, "step": 20970 }, { "epoch": 6.994663108739159, "ref_ce_loss": 0.05768497660756111, "step": 20970 }, { "epoch": 6.994663108739159, "loss": 0.33781731128692627, "step": 20970 }, { "ce_loss": 0.025083765387535095, "epoch": 6.994663108739159, "step": 20970 }, { "distill_loss": 0.2208465039730072, "epoch": 6.994663108739159, "step": 20970 }, { "epoch": 6.994663108739159, "ref_ce_loss": 0.04458913579583168, "step": 20970 }, { "epoch": 6.994663108739159, "loss": 0.30446338653564453, "step": 20970 }, { "ce_loss": 0.04164633899927139, "epoch": 6.994663108739159, "step": 20970 }, { "distill_loss": 0.18755929172039032, "epoch": 6.994663108739159, "step": 20970 }, { "epoch": 6.994663108739159, "ref_ce_loss": 0.04275144264101982, "step": 20970 }, { "epoch": 6.994663108739159, "loss": 0.49016839265823364, "step": 20970 }, { "ce_loss": 0.03269338980317116, "epoch": 6.994663108739159, "step": 20970 }, { "distill_loss": 0.33109739422798157, "epoch": 6.994663108739159, "step": 20970 }, { "epoch": 6.994663108739159, "ref_ce_loss": 0.07132968306541443, "step": 20970 }, { "epoch": 6.997998665777184, "loss": 0.3758, "step": 20980 }, { "epoch": 6.997998665777184, "grad_norm": 3.27852725982666, "step": 20980 }, { "epoch": 6.997998665777184, "learning_rate": 1.2173856650418445e-05, "step": 20980 }, { "epoch": 6.997998665777184, "loss": 0.5222610235214233, "step": 20980 }, { "ce_loss": 0.050382375717163086, "epoch": 6.997998665777184, "step": 20980 }, { "distill_loss": 0.32159072160720825, "epoch": 6.997998665777184, "step": 20980 }, { "epoch": 6.997998665777184, "ref_ce_loss": 0.11399988830089569, "step": 20980 }, { "epoch": 6.997998665777184, "loss": 0.22158987820148468, "step": 20980 }, { "ce_loss": 0.0016337481793016195, "epoch": 6.997998665777184, "step": 20980 }, { "distill_loss": 0.17573025822639465, "epoch": 6.997998665777184, "step": 20980 }, { "epoch": 6.997998665777184, "ref_ce_loss": 0.0440516471862793, "step": 20980 }, { "epoch": 6.997998665777184, "loss": 0.3978367745876312, "step": 20980 }, { "ce_loss": 0.08592233061790466, "epoch": 6.997998665777184, "step": 20980 }, { "distill_loss": 0.2198159098625183, "epoch": 6.997998665777184, "step": 20980 }, { "epoch": 6.997998665777184, "ref_ce_loss": 0.0556352324783802, "step": 20980 }, { "epoch": 6.997998665777184, "loss": 0.3263167440891266, "step": 20980 }, { "ce_loss": 0.04236403852701187, "epoch": 6.997998665777184, "step": 20980 }, { "distill_loss": 0.1966823935508728, "epoch": 6.997998665777184, "step": 20980 }, { "epoch": 6.997998665777184, "ref_ce_loss": 0.04797091335058212, "step": 20980 }, { "epoch": 7.00133422281521, "loss": 0.3182, "step": 20990 }, { "epoch": 7.00133422281521, "grad_norm": 4.379688262939453, "step": 20990 }, { "epoch": 7.00133422281521, "learning_rate": 1.2094045924913798e-05, "step": 20990 }, { "epoch": 7.00133422281521, "loss": 0.28644007444381714, "step": 20990 }, { "ce_loss": 0.01825113408267498, "epoch": 7.00133422281521, "step": 20990 }, { "distill_loss": 0.19746197760105133, "epoch": 7.00133422281521, "step": 20990 }, { "epoch": 7.00133422281521, "ref_ce_loss": 0.07044369727373123, "step": 20990 }, { "epoch": 7.00133422281521, "loss": 0.21840335428714752, "step": 20990 }, { "ce_loss": 0.020626522600650787, "epoch": 7.00133422281521, "step": 20990 }, { "distill_loss": 0.14074021577835083, "epoch": 7.00133422281521, "step": 20990 }, { "epoch": 7.00133422281521, "ref_ce_loss": 0.035429518669843674, "step": 20990 }, { "epoch": 7.00133422281521, "loss": 0.8605027198791504, "step": 20990 }, { "ce_loss": 0.058410659432411194, "epoch": 7.00133422281521, "step": 20990 }, { "distill_loss": 0.3554646968841553, "epoch": 7.00133422281521, "step": 20990 }, { "epoch": 7.00133422281521, "ref_ce_loss": 0.07566055655479431, "step": 20990 }, { "epoch": 7.00133422281521, "loss": 0.2325642704963684, "step": 20990 }, { "ce_loss": 0.04752802848815918, "epoch": 7.00133422281521, "step": 20990 }, { "distill_loss": 0.13366730511188507, "epoch": 7.00133422281521, "step": 20990 }, { "epoch": 7.00133422281521, "ref_ce_loss": 0.05121288821101189, "step": 20990 }, { "epoch": 7.004669779853235, "loss": 0.3769, "step": 21000 }, { "epoch": 7.004669779853235, "grad_norm": 4.163489818572998, "step": 21000 }, { "epoch": 7.004669779853235, "learning_rate": 1.2014486685469959e-05, "step": 21000 }, { "epoch": 7.004669779853235, "loss": 0.25092020630836487, "step": 21000 }, { "ce_loss": 0.03144634887576103, "epoch": 7.004669779853235, "step": 21000 }, { "distill_loss": 0.16729390621185303, "epoch": 7.004669779853235, "step": 21000 }, { "epoch": 7.004669779853235, "ref_ce_loss": 0.039170920848846436, "step": 21000 }, { "epoch": 7.004669779853235, "loss": 0.18306779861450195, "step": 21000 }, { "ce_loss": 0.004771217238157988, "epoch": 7.004669779853235, "step": 21000 }, { "distill_loss": 0.12938238680362701, "epoch": 7.004669779853235, "step": 21000 }, { "epoch": 7.004669779853235, "ref_ce_loss": 0.04884642735123634, "step": 21000 }, { "epoch": 7.004669779853235, "loss": 0.25768306851387024, "step": 21000 }, { "ce_loss": 0.011849144473671913, "epoch": 7.004669779853235, "step": 21000 }, { "distill_loss": 0.14590494334697723, "epoch": 7.004669779853235, "step": 21000 }, { "epoch": 7.004669779853235, "ref_ce_loss": 0.05942894518375397, "step": 21000 }, { "epoch": 7.004669779853235, "loss": 0.18659117817878723, "step": 21000 }, { "ce_loss": 0.004362636711448431, "epoch": 7.004669779853235, "step": 21000 }, { "distill_loss": 0.15117177367210388, "epoch": 7.004669779853235, "step": 21000 }, { "epoch": 7.004669779853235, "ref_ce_loss": 0.030531061813235283, "step": 21000 }, { "epoch": 7.0080053368912605, "loss": 0.3495, "step": 21010 }, { "epoch": 7.0080053368912605, "grad_norm": 2.965975761413574, "step": 21010 }, { "epoch": 7.0080053368912605, "learning_rate": 1.1935179077171525e-05, "step": 21010 }, { "epoch": 7.0080053368912605, "loss": 0.36496981978416443, "step": 21010 }, { "ce_loss": 0.027796225622296333, "epoch": 7.0080053368912605, "step": 21010 }, { "distill_loss": 0.28838932514190674, "epoch": 7.0080053368912605, "step": 21010 }, { "epoch": 7.0080053368912605, "ref_ce_loss": 0.034264348447322845, "step": 21010 }, { "epoch": 7.0080053368912605, "loss": 0.8337552547454834, "step": 21010 }, { "ce_loss": 0.011524084955453873, "epoch": 7.0080053368912605, "step": 21010 }, { "distill_loss": 0.22686733305454254, "epoch": 7.0080053368912605, "step": 21010 }, { "epoch": 7.0080053368912605, "ref_ce_loss": 0.05295266583561897, "step": 21010 }, { "epoch": 7.0080053368912605, "loss": 0.2895921468734741, "step": 21010 }, { "ce_loss": 0.038470905274152756, "epoch": 7.0080053368912605, "step": 21010 }, { "distill_loss": 0.16852986812591553, "epoch": 7.0080053368912605, "step": 21010 }, { "epoch": 7.0080053368912605, "ref_ce_loss": 0.060499511659145355, "step": 21010 }, { "epoch": 7.0080053368912605, "loss": 0.3831683099269867, "step": 21010 }, { "ce_loss": 0.007674395106732845, "epoch": 7.0080053368912605, "step": 21010 }, { "distill_loss": 0.31148290634155273, "epoch": 7.0080053368912605, "step": 21010 }, { "epoch": 7.0080053368912605, "ref_ce_loss": 0.05108707398176193, "step": 21010 }, { "epoch": 7.011340893929286, "loss": 0.358, "step": 21020 }, { "epoch": 7.011340893929286, "grad_norm": 4.229156494140625, "step": 21020 }, { "epoch": 7.011340893929286, "learning_rate": 1.1856123244644355e-05, "step": 21020 }, { "epoch": 7.011340893929286, "loss": 0.3345952332019806, "step": 21020 }, { "ce_loss": 0.02739858441054821, "epoch": 7.011340893929286, "step": 21020 }, { "distill_loss": 0.15175358951091766, "epoch": 7.011340893929286, "step": 21020 }, { "epoch": 7.011340893929286, "ref_ce_loss": 0.04447855055332184, "step": 21020 }, { "epoch": 7.011340893929286, "loss": 0.46418091654777527, "step": 21020 }, { "ce_loss": 0.08415623754262924, "epoch": 7.011340893929286, "step": 21020 }, { "distill_loss": 0.26705658435821533, "epoch": 7.011340893929286, "step": 21020 }, { "epoch": 7.011340893929286, "ref_ce_loss": 0.06111234799027443, "step": 21020 }, { "epoch": 7.011340893929286, "loss": 0.3672318756580353, "step": 21020 }, { "ce_loss": 0.06163648143410683, "epoch": 7.011340893929286, "step": 21020 }, { "distill_loss": 0.17714300751686096, "epoch": 7.011340893929286, "step": 21020 }, { "epoch": 7.011340893929286, "ref_ce_loss": 0.04045576602220535, "step": 21020 }, { "epoch": 7.011340893929286, "loss": 0.2918590009212494, "step": 21020 }, { "ce_loss": 0.020832950249314308, "epoch": 7.011340893929286, "step": 21020 }, { "distill_loss": 0.21937981247901917, "epoch": 7.011340893929286, "step": 21020 }, { "epoch": 7.011340893929286, "ref_ce_loss": 0.0513937808573246, "step": 21020 }, { "epoch": 7.014676450967311, "loss": 0.3688, "step": 21030 }, { "epoch": 7.014676450967311, "grad_norm": 3.1062397956848145, "step": 21030 }, { "epoch": 7.014676450967311, "learning_rate": 1.1777319332055062e-05, "step": 21030 }, { "epoch": 7.014676450967311, "loss": 0.39231839776039124, "step": 21030 }, { "ce_loss": 0.03197869285941124, "epoch": 7.014676450967311, "step": 21030 }, { "distill_loss": 0.25150421261787415, "epoch": 7.014676450967311, "step": 21030 }, { "epoch": 7.014676450967311, "ref_ce_loss": 0.05561299994587898, "step": 21030 }, { "epoch": 7.014676450967311, "loss": 0.3744685649871826, "step": 21030 }, { "ce_loss": 0.025469930842518806, "epoch": 7.014676450967311, "step": 21030 }, { "distill_loss": 0.1823444664478302, "epoch": 7.014676450967311, "step": 21030 }, { "epoch": 7.014676450967311, "ref_ce_loss": 0.06127699092030525, "step": 21030 }, { "epoch": 7.014676450967311, "loss": 0.1797933280467987, "step": 21030 }, { "ce_loss": 0.010841690935194492, "epoch": 7.014676450967311, "step": 21030 }, { "distill_loss": 0.11612024158239365, "epoch": 7.014676450967311, "step": 21030 }, { "epoch": 7.014676450967311, "ref_ce_loss": 0.02793746255338192, "step": 21030 }, { "epoch": 7.014676450967311, "loss": 0.5318374633789062, "step": 21030 }, { "ce_loss": 0.026757286861538887, "epoch": 7.014676450967311, "step": 21030 }, { "distill_loss": 0.3860778212547302, "epoch": 7.014676450967311, "step": 21030 }, { "epoch": 7.014676450967311, "ref_ce_loss": 0.0411611869931221, "step": 21030 }, { "epoch": 7.0180120080053365, "loss": 0.3613, "step": 21040 }, { "epoch": 7.0180120080053365, "grad_norm": 2.4196865558624268, "step": 21040 }, { "epoch": 7.0180120080053365, "learning_rate": 1.169876748311091e-05, "step": 21040 }, { "epoch": 7.0180120080053365, "loss": 0.31978467106819153, "step": 21040 }, { "ce_loss": 0.01599363051354885, "epoch": 7.0180120080053365, "step": 21040 }, { "distill_loss": 0.18945179879665375, "epoch": 7.0180120080053365, "step": 21040 }, { "epoch": 7.0180120080053365, "ref_ce_loss": 0.0704239159822464, "step": 21040 }, { "epoch": 7.0180120080053365, "loss": 0.24735331535339355, "step": 21040 }, { "ce_loss": 0.024469342082738876, "epoch": 7.0180120080053365, "step": 21040 }, { "distill_loss": 0.1651473045349121, "epoch": 7.0180120080053365, "step": 21040 }, { "epoch": 7.0180120080053365, "ref_ce_loss": 0.04400210455060005, "step": 21040 }, { "epoch": 7.0180120080053365, "loss": 0.5415984988212585, "step": 21040 }, { "ce_loss": 0.037225011736154556, "epoch": 7.0180120080053365, "step": 21040 }, { "distill_loss": 0.1968650221824646, "epoch": 7.0180120080053365, "step": 21040 }, { "epoch": 7.0180120080053365, "ref_ce_loss": 0.07533683627843857, "step": 21040 }, { "epoch": 7.0180120080053365, "loss": 0.652391254901886, "step": 21040 }, { "ce_loss": 0.013494131155312061, "epoch": 7.0180120080053365, "step": 21040 }, { "distill_loss": 0.1403832733631134, "epoch": 7.0180120080053365, "step": 21040 }, { "epoch": 7.0180120080053365, "ref_ce_loss": 0.05307801812887192, "step": 21040 }, { "epoch": 7.021347565043362, "loss": 0.3395, "step": 21050 }, { "epoch": 7.021347565043362, "grad_norm": 3.360076427459717, "step": 21050 }, { "epoch": 7.021347565043362, "learning_rate": 1.1620467841059511e-05, "step": 21050 }, { "epoch": 7.021347565043362, "loss": 0.2819208800792694, "step": 21050 }, { "ce_loss": 0.0377165786921978, "epoch": 7.021347565043362, "step": 21050 }, { "distill_loss": 0.1763533353805542, "epoch": 7.021347565043362, "step": 21050 }, { "epoch": 7.021347565043362, "ref_ce_loss": 0.04387078434228897, "step": 21050 }, { "epoch": 7.021347565043362, "loss": 0.42356985807418823, "step": 21050 }, { "ce_loss": 0.02946825698018074, "epoch": 7.021347565043362, "step": 21050 }, { "distill_loss": 0.149619922041893, "epoch": 7.021347565043362, "step": 21050 }, { "epoch": 7.021347565043362, "ref_ce_loss": 0.04405827820301056, "step": 21050 }, { "epoch": 7.021347565043362, "loss": 0.33829575777053833, "step": 21050 }, { "ce_loss": 0.03332025557756424, "epoch": 7.021347565043362, "step": 21050 }, { "distill_loss": 0.1677481085062027, "epoch": 7.021347565043362, "step": 21050 }, { "epoch": 7.021347565043362, "ref_ce_loss": 0.04332270473241806, "step": 21050 }, { "epoch": 7.021347565043362, "loss": 0.2877388894557953, "step": 21050 }, { "ce_loss": 0.011127806268632412, "epoch": 7.021347565043362, "step": 21050 }, { "distill_loss": 0.236330047249794, "epoch": 7.021347565043362, "step": 21050 }, { "epoch": 7.021347565043362, "ref_ce_loss": 0.040191177278757095, "step": 21050 }, { "epoch": 7.024683122081387, "loss": 0.3552, "step": 21060 }, { "epoch": 7.024683122081387, "grad_norm": 3.0807788372039795, "step": 21060 }, { "epoch": 7.024683122081387, "learning_rate": 1.1542420548688464e-05, "step": 21060 }, { "epoch": 7.024683122081387, "loss": 0.2960524260997772, "step": 21060 }, { "ce_loss": 0.010671052150428295, "epoch": 7.024683122081387, "step": 21060 }, { "distill_loss": 0.1927395910024643, "epoch": 7.024683122081387, "step": 21060 }, { "epoch": 7.024683122081387, "ref_ce_loss": 0.06929319351911545, "step": 21060 }, { "epoch": 7.024683122081387, "loss": 0.3725976347923279, "step": 21060 }, { "ce_loss": 0.010630992241203785, "epoch": 7.024683122081387, "step": 21060 }, { "distill_loss": 0.2733990252017975, "epoch": 7.024683122081387, "step": 21060 }, { "epoch": 7.024683122081387, "ref_ce_loss": 0.03346521779894829, "step": 21060 }, { "epoch": 7.024683122081387, "loss": 0.3648867607116699, "step": 21060 }, { "ce_loss": 0.014998015947639942, "epoch": 7.024683122081387, "step": 21060 }, { "distill_loss": 0.30057311058044434, "epoch": 7.024683122081387, "step": 21060 }, { "epoch": 7.024683122081387, "ref_ce_loss": 0.03862837329506874, "step": 21060 }, { "epoch": 7.024683122081387, "loss": 0.6767865419387817, "step": 21060 }, { "ce_loss": 0.005089592654258013, "epoch": 7.024683122081387, "step": 21060 }, { "distill_loss": 0.3199460804462433, "epoch": 7.024683122081387, "step": 21060 }, { "epoch": 7.024683122081387, "ref_ce_loss": 0.07371282577514648, "step": 21060 }, { "epoch": 7.028018679119413, "loss": 0.3651, "step": 21070 }, { "epoch": 7.028018679119413, "grad_norm": 4.383955001831055, "step": 21070 }, { "epoch": 7.028018679119413, "learning_rate": 1.1464625748325284e-05, "step": 21070 }, { "epoch": 7.028018679119413, "loss": 0.217289537191391, "step": 21070 }, { "ce_loss": 0.02977265790104866, "epoch": 7.028018679119413, "step": 21070 }, { "distill_loss": 0.13479569554328918, "epoch": 7.028018679119413, "step": 21070 }, { "epoch": 7.028018679119413, "ref_ce_loss": 0.052569203078746796, "step": 21070 }, { "epoch": 7.028018679119413, "loss": 0.25972500443458557, "step": 21070 }, { "ce_loss": 0.012066180817782879, "epoch": 7.028018679119413, "step": 21070 }, { "distill_loss": 0.16654083132743835, "epoch": 7.028018679119413, "step": 21070 }, { "epoch": 7.028018679119413, "ref_ce_loss": 0.026700763031840324, "step": 21070 }, { "epoch": 7.028018679119413, "loss": 0.31060537695884705, "step": 21070 }, { "ce_loss": 0.018852874636650085, "epoch": 7.028018679119413, "step": 21070 }, { "distill_loss": 0.1913139373064041, "epoch": 7.028018679119413, "step": 21070 }, { "epoch": 7.028018679119413, "ref_ce_loss": 0.05901574715971947, "step": 21070 }, { "epoch": 7.028018679119413, "loss": 0.35601550340652466, "step": 21070 }, { "ce_loss": 0.05516732484102249, "epoch": 7.028018679119413, "step": 21070 }, { "distill_loss": 0.2630683481693268, "epoch": 7.028018679119413, "step": 21070 }, { "epoch": 7.028018679119413, "ref_ce_loss": 0.037460535764694214, "step": 21070 }, { "epoch": 7.031354236157438, "loss": 0.3844, "step": 21080 }, { "epoch": 7.031354236157438, "grad_norm": 3.6851420402526855, "step": 21080 }, { "epoch": 7.031354236157438, "learning_rate": 1.1387083581836992e-05, "step": 21080 }, { "epoch": 7.031354236157438, "loss": 0.3004406690597534, "step": 21080 }, { "ce_loss": 0.005100657232105732, "epoch": 7.031354236157438, "step": 21080 }, { "distill_loss": 0.2407703995704651, "epoch": 7.031354236157438, "step": 21080 }, { "epoch": 7.031354236157438, "ref_ce_loss": 0.03812128305435181, "step": 21080 }, { "epoch": 7.031354236157438, "loss": 0.2978489398956299, "step": 21080 }, { "ce_loss": 0.016226215288043022, "epoch": 7.031354236157438, "step": 21080 }, { "distill_loss": 0.22501929104328156, "epoch": 7.031354236157438, "step": 21080 }, { "epoch": 7.031354236157438, "ref_ce_loss": 0.056322839111089706, "step": 21080 }, { "epoch": 7.031354236157438, "loss": 0.6905106902122498, "step": 21080 }, { "ce_loss": 0.024305883795022964, "epoch": 7.031354236157438, "step": 21080 }, { "distill_loss": 0.30780029296875, "epoch": 7.031354236157438, "step": 21080 }, { "epoch": 7.031354236157438, "ref_ce_loss": 0.07255929708480835, "step": 21080 }, { "epoch": 7.031354236157438, "loss": 0.23199906945228577, "step": 21080 }, { "ce_loss": 0.04769589379429817, "epoch": 7.031354236157438, "step": 21080 }, { "distill_loss": 0.1294175684452057, "epoch": 7.031354236157438, "step": 21080 }, { "epoch": 7.031354236157438, "ref_ce_loss": 0.040796466171741486, "step": 21080 }, { "epoch": 7.034689793195463, "loss": 0.3744, "step": 21090 }, { "epoch": 7.034689793195463, "grad_norm": 2.6544904708862305, "step": 21090 }, { "epoch": 7.034689793195463, "learning_rate": 1.1309794190629906e-05, "step": 21090 }, { "epoch": 7.034689793195463, "loss": 0.26282796263694763, "step": 21090 }, { "ce_loss": 0.015210578218102455, "epoch": 7.034689793195463, "step": 21090 }, { "distill_loss": 0.20315514504909515, "epoch": 7.034689793195463, "step": 21090 }, { "epoch": 7.034689793195463, "ref_ce_loss": 0.044133421033620834, "step": 21090 }, { "epoch": 7.034689793195463, "loss": 0.48627135157585144, "step": 21090 }, { "ce_loss": 0.04514510929584503, "epoch": 7.034689793195463, "step": 21090 }, { "distill_loss": 0.26357829570770264, "epoch": 7.034689793195463, "step": 21090 }, { "epoch": 7.034689793195463, "ref_ce_loss": 0.0717320367693901, "step": 21090 }, { "epoch": 7.034689793195463, "loss": 0.3367202877998352, "step": 21090 }, { "ce_loss": 0.05366295948624611, "epoch": 7.034689793195463, "step": 21090 }, { "distill_loss": 0.20038925111293793, "epoch": 7.034689793195463, "step": 21090 }, { "epoch": 7.034689793195463, "ref_ce_loss": 0.08253741264343262, "step": 21090 }, { "epoch": 7.034689793195463, "loss": 0.39167726039886475, "step": 21090 }, { "ce_loss": 0.01120414212346077, "epoch": 7.034689793195463, "step": 21090 }, { "distill_loss": 0.1971331387758255, "epoch": 7.034689793195463, "step": 21090 }, { "epoch": 7.034689793195463, "ref_ce_loss": 0.03756158798933029, "step": 21090 }, { "epoch": 7.038025350233489, "loss": 0.3475, "step": 21100 }, { "epoch": 7.038025350233489, "grad_norm": 2.5539391040802, "step": 21100 }, { "epoch": 7.038025350233489, "learning_rate": 1.1232757715649432e-05, "step": 21100 }, { "epoch": 7.038025350233489, "loss": 0.4853425920009613, "step": 21100 }, { "ce_loss": 0.04947700724005699, "epoch": 7.038025350233489, "step": 21100 }, { "distill_loss": 0.3233245611190796, "epoch": 7.038025350233489, "step": 21100 }, { "epoch": 7.038025350233489, "ref_ce_loss": 0.05600043758749962, "step": 21100 }, { "epoch": 7.038025350233489, "loss": 0.363929808139801, "step": 21100 }, { "ce_loss": 0.10695886611938477, "epoch": 7.038025350233489, "step": 21100 }, { "distill_loss": 0.1929464340209961, "epoch": 7.038025350233489, "step": 21100 }, { "epoch": 7.038025350233489, "ref_ce_loss": 0.06395082175731659, "step": 21100 }, { "epoch": 7.038025350233489, "loss": 0.22964845597743988, "step": 21100 }, { "ce_loss": 0.00565209798514843, "epoch": 7.038025350233489, "step": 21100 }, { "distill_loss": 0.1298438310623169, "epoch": 7.038025350233489, "step": 21100 }, { "epoch": 7.038025350233489, "ref_ce_loss": 0.027368027716875076, "step": 21100 }, { "epoch": 7.038025350233489, "loss": 0.5313661098480225, "step": 21100 }, { "ce_loss": 0.03788033872842789, "epoch": 7.038025350233489, "step": 21100 }, { "distill_loss": 0.13044065237045288, "epoch": 7.038025350233489, "step": 21100 }, { "epoch": 7.038025350233489, "ref_ce_loss": 0.03457934409379959, "step": 21100 }, { "epoch": 7.041360907271514, "loss": 0.3571, "step": 21110 }, { "epoch": 7.041360907271514, "grad_norm": 2.350095748901367, "step": 21110 }, { "epoch": 7.041360907271514, "learning_rate": 1.1155974297379644e-05, "step": 21110 }, { "epoch": 7.041360907271514, "loss": 0.3459613025188446, "step": 21110 }, { "ce_loss": 0.00866713747382164, "epoch": 7.041360907271514, "step": 21110 }, { "distill_loss": 0.20660671591758728, "epoch": 7.041360907271514, "step": 21110 }, { "epoch": 7.041360907271514, "ref_ce_loss": 0.044547878205776215, "step": 21110 }, { "epoch": 7.041360907271514, "loss": 0.23378105461597443, "step": 21110 }, { "ce_loss": 0.01803533174097538, "epoch": 7.041360907271514, "step": 21110 }, { "distill_loss": 0.14041326940059662, "epoch": 7.041360907271514, "step": 21110 }, { "epoch": 7.041360907271514, "ref_ce_loss": 0.050297707319259644, "step": 21110 }, { "epoch": 7.041360907271514, "loss": 0.2744632363319397, "step": 21110 }, { "ce_loss": 0.026918690651655197, "epoch": 7.041360907271514, "step": 21110 }, { "distill_loss": 0.10023162513971329, "epoch": 7.041360907271514, "step": 21110 }, { "epoch": 7.041360907271514, "ref_ce_loss": 0.038829028606414795, "step": 21110 }, { "epoch": 7.041360907271514, "loss": 0.5327430367469788, "step": 21110 }, { "ce_loss": 0.092887282371521, "epoch": 7.041360907271514, "step": 21110 }, { "distill_loss": 0.17936775088310242, "epoch": 7.041360907271514, "step": 21110 }, { "epoch": 7.041360907271514, "ref_ce_loss": 0.06346702575683594, "step": 21110 }, { "epoch": 7.044696464309539, "loss": 0.3346, "step": 21120 }, { "epoch": 7.044696464309539, "grad_norm": 3.586826801300049, "step": 21120 }, { "epoch": 7.044696464309539, "learning_rate": 1.1079444075843252e-05, "step": 21120 }, { "epoch": 7.044696464309539, "loss": 0.4072696268558502, "step": 21120 }, { "ce_loss": 0.07858144491910934, "epoch": 7.044696464309539, "step": 21120 }, { "distill_loss": 0.2500748932361603, "epoch": 7.044696464309539, "step": 21120 }, { "epoch": 7.044696464309539, "ref_ce_loss": 0.05024728551506996, "step": 21120 }, { "epoch": 7.044696464309539, "loss": 0.38662242889404297, "step": 21120 }, { "ce_loss": 0.1051878109574318, "epoch": 7.044696464309539, "step": 21120 }, { "distill_loss": 0.19055181741714478, "epoch": 7.044696464309539, "step": 21120 }, { "epoch": 7.044696464309539, "ref_ce_loss": 0.06847601383924484, "step": 21120 }, { "epoch": 7.044696464309539, "loss": 0.37392425537109375, "step": 21120 }, { "ce_loss": 0.04408292844891548, "epoch": 7.044696464309539, "step": 21120 }, { "distill_loss": 0.23299375176429749, "epoch": 7.044696464309539, "step": 21120 }, { "epoch": 7.044696464309539, "ref_ce_loss": 0.061505965888500214, "step": 21120 }, { "epoch": 7.044696464309539, "loss": 0.6570822596549988, "step": 21120 }, { "ce_loss": 0.07716977596282959, "epoch": 7.044696464309539, "step": 21120 }, { "distill_loss": 0.32018449902534485, "epoch": 7.044696464309539, "step": 21120 }, { "epoch": 7.044696464309539, "ref_ce_loss": 0.09262377768754959, "step": 21120 }, { "epoch": 7.048032021347565, "loss": 0.3679, "step": 21130 }, { "epoch": 7.048032021347565, "grad_norm": 3.6819005012512207, "step": 21130 }, { "epoch": 7.048032021347565, "learning_rate": 1.1003167190601153e-05, "step": 21130 }, { "epoch": 7.048032021347565, "loss": 0.5182002186775208, "step": 21130 }, { "ce_loss": 0.019167399033904076, "epoch": 7.048032021347565, "step": 21130 }, { "distill_loss": 0.3869504928588867, "epoch": 7.048032021347565, "step": 21130 }, { "epoch": 7.048032021347565, "ref_ce_loss": 0.07341236621141434, "step": 21130 }, { "epoch": 7.048032021347565, "loss": 0.2740857005119324, "step": 21130 }, { "ce_loss": 0.017344314604997635, "epoch": 7.048032021347565, "step": 21130 }, { "distill_loss": 0.19354546070098877, "epoch": 7.048032021347565, "step": 21130 }, { "epoch": 7.048032021347565, "ref_ce_loss": 0.06311025470495224, "step": 21130 }, { "epoch": 7.048032021347565, "loss": 0.21160180866718292, "step": 21130 }, { "ce_loss": 0.042595986276865005, "epoch": 7.048032021347565, "step": 21130 }, { "distill_loss": 0.12442618608474731, "epoch": 7.048032021347565, "step": 21130 }, { "epoch": 7.048032021347565, "ref_ce_loss": 0.0371774397790432, "step": 21130 }, { "epoch": 7.048032021347565, "loss": 0.20767563581466675, "step": 21130 }, { "ce_loss": 0.016156502068042755, "epoch": 7.048032021347565, "step": 21130 }, { "distill_loss": 0.14854544401168823, "epoch": 7.048032021347565, "step": 21130 }, { "epoch": 7.048032021347565, "ref_ce_loss": 0.04287472739815712, "step": 21130 }, { "epoch": 7.05136757838559, "loss": 0.3631, "step": 21140 }, { "epoch": 7.05136757838559, "grad_norm": 2.9166581630706787, "step": 21140 }, { "epoch": 7.05136757838559, "learning_rate": 1.0927143780752345e-05, "step": 21140 }, { "epoch": 7.05136757838559, "loss": 0.32964104413986206, "step": 21140 }, { "ce_loss": 0.05006510391831398, "epoch": 7.05136757838559, "step": 21140 }, { "distill_loss": 0.21244673430919647, "epoch": 7.05136757838559, "step": 21140 }, { "epoch": 7.05136757838559, "ref_ce_loss": 0.0668855607509613, "step": 21140 }, { "epoch": 7.05136757838559, "loss": 0.22159138321876526, "step": 21140 }, { "ce_loss": 0.03067399002611637, "epoch": 7.05136757838559, "step": 21140 }, { "distill_loss": 0.10590535402297974, "epoch": 7.05136757838559, "step": 21140 }, { "epoch": 7.05136757838559, "ref_ce_loss": 0.05489220470190048, "step": 21140 }, { "epoch": 7.05136757838559, "loss": 0.6037322282791138, "step": 21140 }, { "ce_loss": 0.011360462754964828, "epoch": 7.05136757838559, "step": 21140 }, { "distill_loss": 0.43363645672798157, "epoch": 7.05136757838559, "step": 21140 }, { "epoch": 7.05136757838559, "ref_ce_loss": 0.06793065369129181, "step": 21140 }, { "epoch": 7.05136757838559, "loss": 0.2623229920864105, "step": 21140 }, { "ce_loss": 0.014065076597034931, "epoch": 7.05136757838559, "step": 21140 }, { "distill_loss": 0.19880500435829163, "epoch": 7.05136757838559, "step": 21140 }, { "epoch": 7.05136757838559, "ref_ce_loss": 0.04935702309012413, "step": 21140 }, { "epoch": 7.054703135423615, "loss": 0.3285, "step": 21150 }, { "epoch": 7.054703135423615, "grad_norm": 2.9054787158966064, "step": 21150 }, { "epoch": 7.054703135423615, "learning_rate": 1.0851373984933532e-05, "step": 21150 }, { "epoch": 7.054703135423615, "loss": 0.2806137204170227, "step": 21150 }, { "ce_loss": 0.03232104331254959, "epoch": 7.054703135423615, "step": 21150 }, { "distill_loss": 0.15062671899795532, "epoch": 7.054703135423615, "step": 21150 }, { "epoch": 7.054703135423615, "ref_ce_loss": 0.03469966724514961, "step": 21150 }, { "epoch": 7.054703135423615, "loss": 0.4322219789028168, "step": 21150 }, { "ce_loss": 0.06931765377521515, "epoch": 7.054703135423615, "step": 21150 }, { "distill_loss": 0.26579010486602783, "epoch": 7.054703135423615, "step": 21150 }, { "epoch": 7.054703135423615, "ref_ce_loss": 0.04841863363981247, "step": 21150 }, { "epoch": 7.054703135423615, "loss": 0.2573651075363159, "step": 21150 }, { "ce_loss": 0.004316686186939478, "epoch": 7.054703135423615, "step": 21150 }, { "distill_loss": 0.1615641862154007, "epoch": 7.054703135423615, "step": 21150 }, { "epoch": 7.054703135423615, "ref_ce_loss": 0.04651380330324173, "step": 21150 }, { "epoch": 7.054703135423615, "loss": 0.25774532556533813, "step": 21150 }, { "ce_loss": 0.004958820529282093, "epoch": 7.054703135423615, "step": 21150 }, { "distill_loss": 0.16653427481651306, "epoch": 7.054703135423615, "step": 21150 }, { "epoch": 7.054703135423615, "ref_ce_loss": 0.046766992658376694, "step": 21150 }, { "epoch": 7.058038692461641, "loss": 0.345, "step": 21160 }, { "epoch": 7.058038692461641, "grad_norm": 2.805272340774536, "step": 21160 }, { "epoch": 7.058038692461641, "learning_rate": 1.0775857941318899e-05, "step": 21160 }, { "epoch": 7.058038692461641, "loss": 0.3288522958755493, "step": 21160 }, { "ce_loss": 0.0010662720305845141, "epoch": 7.058038692461641, "step": 21160 }, { "distill_loss": 0.252332866191864, "epoch": 7.058038692461641, "step": 21160 }, { "epoch": 7.058038692461641, "ref_ce_loss": 0.04569484665989876, "step": 21160 }, { "epoch": 7.058038692461641, "loss": 0.3757239580154419, "step": 21160 }, { "ce_loss": 0.003635851666331291, "epoch": 7.058038692461641, "step": 21160 }, { "distill_loss": 0.11291775107383728, "epoch": 7.058038692461641, "step": 21160 }, { "epoch": 7.058038692461641, "ref_ce_loss": 0.06550167500972748, "step": 21160 }, { "epoch": 7.058038692461641, "loss": 0.2398977279663086, "step": 21160 }, { "ce_loss": 0.0565723218023777, "epoch": 7.058038692461641, "step": 21160 }, { "distill_loss": 0.11294666677713394, "epoch": 7.058038692461641, "step": 21160 }, { "epoch": 7.058038692461641, "ref_ce_loss": 0.05807725712656975, "step": 21160 }, { "epoch": 7.058038692461641, "loss": 0.2866376042366028, "step": 21160 }, { "ce_loss": 0.017606956884264946, "epoch": 7.058038692461641, "step": 21160 }, { "distill_loss": 0.18055574595928192, "epoch": 7.058038692461641, "step": 21160 }, { "epoch": 7.058038692461641, "ref_ce_loss": 0.046308476477861404, "step": 21160 }, { "epoch": 7.061374249499666, "loss": 0.3608, "step": 21170 }, { "epoch": 7.061374249499666, "grad_norm": 2.8814284801483154, "step": 21170 }, { "epoch": 7.061374249499666, "learning_rate": 1.0700595787619925e-05, "step": 21170 }, { "epoch": 7.061374249499666, "loss": 0.2647363543510437, "step": 21170 }, { "ce_loss": 0.01880454272031784, "epoch": 7.061374249499666, "step": 21170 }, { "distill_loss": 0.16595271229743958, "epoch": 7.061374249499666, "step": 21170 }, { "epoch": 7.061374249499666, "ref_ce_loss": 0.06280763447284698, "step": 21170 }, { "epoch": 7.061374249499666, "loss": 0.26586365699768066, "step": 21170 }, { "ce_loss": 0.0166045892983675, "epoch": 7.061374249499666, "step": 21170 }, { "distill_loss": 0.18876267969608307, "epoch": 7.061374249499666, "step": 21170 }, { "epoch": 7.061374249499666, "ref_ce_loss": 0.05875850096344948, "step": 21170 }, { "epoch": 7.061374249499666, "loss": 0.3638674318790436, "step": 21170 }, { "ce_loss": 0.026428092271089554, "epoch": 7.061374249499666, "step": 21170 }, { "distill_loss": 0.20158889889717102, "epoch": 7.061374249499666, "step": 21170 }, { "epoch": 7.061374249499666, "ref_ce_loss": 0.03497897461056709, "step": 21170 }, { "epoch": 7.061374249499666, "loss": 0.2749888002872467, "step": 21170 }, { "ce_loss": 0.0217903982847929, "epoch": 7.061374249499666, "step": 21170 }, { "distill_loss": 0.19239500164985657, "epoch": 7.061374249499666, "step": 21170 }, { "epoch": 7.061374249499666, "ref_ce_loss": 0.04525717347860336, "step": 21170 }, { "epoch": 7.064709806537691, "loss": 0.3577, "step": 21180 }, { "epoch": 7.064709806537691, "grad_norm": 4.790896415710449, "step": 21180 }, { "epoch": 7.064709806537691, "learning_rate": 1.0625587661085105e-05, "step": 21180 }, { "epoch": 7.064709806537691, "loss": 0.21709632873535156, "step": 21180 }, { "ce_loss": 0.01828574389219284, "epoch": 7.064709806537691, "step": 21180 }, { "distill_loss": 0.14977601170539856, "epoch": 7.064709806537691, "step": 21180 }, { "epoch": 7.064709806537691, "ref_ce_loss": 0.04868828505277634, "step": 21180 }, { "epoch": 7.064709806537691, "loss": 0.3032727837562561, "step": 21180 }, { "ce_loss": 0.023485232144594193, "epoch": 7.064709806537691, "step": 21180 }, { "distill_loss": 0.19155225157737732, "epoch": 7.064709806537691, "step": 21180 }, { "epoch": 7.064709806537691, "ref_ce_loss": 0.05645429342985153, "step": 21180 }, { "epoch": 7.064709806537691, "loss": 0.39241763949394226, "step": 21180 }, { "ce_loss": 0.047795649617910385, "epoch": 7.064709806537691, "step": 21180 }, { "distill_loss": 0.2212338000535965, "epoch": 7.064709806537691, "step": 21180 }, { "epoch": 7.064709806537691, "ref_ce_loss": 0.06751947104930878, "step": 21180 }, { "epoch": 7.064709806537691, "loss": 0.25130558013916016, "step": 21180 }, { "ce_loss": 0.003252769820392132, "epoch": 7.064709806537691, "step": 21180 }, { "distill_loss": 0.1279112547636032, "epoch": 7.064709806537691, "step": 21180 }, { "epoch": 7.064709806537691, "ref_ce_loss": 0.027870751917362213, "step": 21180 }, { "epoch": 7.068045363575717, "loss": 0.3106, "step": 21190 }, { "epoch": 7.068045363575717, "grad_norm": 3.6147572994232178, "step": 21190 }, { "epoch": 7.068045363575717, "learning_rate": 1.0550833698499688e-05, "step": 21190 }, { "epoch": 7.068045363575717, "loss": 0.3733055591583252, "step": 21190 }, { "ce_loss": 0.06108830124139786, "epoch": 7.068045363575717, "step": 21190 }, { "distill_loss": 0.21531425416469574, "epoch": 7.068045363575717, "step": 21190 }, { "epoch": 7.068045363575717, "ref_ce_loss": 0.06599297374486923, "step": 21190 }, { "epoch": 7.068045363575717, "loss": 0.37621110677719116, "step": 21190 }, { "ce_loss": 0.048682309687137604, "epoch": 7.068045363575717, "step": 21190 }, { "distill_loss": 0.2646140456199646, "epoch": 7.068045363575717, "step": 21190 }, { "epoch": 7.068045363575717, "ref_ce_loss": 0.05080604925751686, "step": 21190 }, { "epoch": 7.068045363575717, "loss": 0.43180280923843384, "step": 21190 }, { "ce_loss": 0.02251204289495945, "epoch": 7.068045363575717, "step": 21190 }, { "distill_loss": 0.1456415057182312, "epoch": 7.068045363575717, "step": 21190 }, { "epoch": 7.068045363575717, "ref_ce_loss": 0.027025721967220306, "step": 21190 }, { "epoch": 7.068045363575717, "loss": 0.40786445140838623, "step": 21190 }, { "ce_loss": 0.06046026200056076, "epoch": 7.068045363575717, "step": 21190 }, { "distill_loss": 0.22351795434951782, "epoch": 7.068045363575717, "step": 21190 }, { "epoch": 7.068045363575717, "ref_ce_loss": 0.03806782141327858, "step": 21190 }, { "epoch": 7.071380920613742, "loss": 0.3688, "step": 21200 }, { "epoch": 7.071380920613742, "grad_norm": 2.596191883087158, "step": 21200 }, { "epoch": 7.071380920613742, "learning_rate": 1.0476334036185413e-05, "step": 21200 }, { "epoch": 7.071380920613742, "loss": 0.26539090275764465, "step": 21200 }, { "ce_loss": 0.06284356117248535, "epoch": 7.071380920613742, "step": 21200 }, { "distill_loss": 0.14580541849136353, "epoch": 7.071380920613742, "step": 21200 }, { "epoch": 7.071380920613742, "ref_ce_loss": 0.037337347865104675, "step": 21200 }, { "epoch": 7.071380920613742, "loss": 0.3034488558769226, "step": 21200 }, { "ce_loss": 0.016640178859233856, "epoch": 7.071380920613742, "step": 21200 }, { "distill_loss": 0.2183038890361786, "epoch": 7.071380920613742, "step": 21200 }, { "epoch": 7.071380920613742, "ref_ce_loss": 0.068434938788414, "step": 21200 }, { "epoch": 7.071380920613742, "loss": 0.2993912398815155, "step": 21200 }, { "ce_loss": 0.02914571389555931, "epoch": 7.071380920613742, "step": 21200 }, { "distill_loss": 0.2163100391626358, "epoch": 7.071380920613742, "step": 21200 }, { "epoch": 7.071380920613742, "ref_ce_loss": 0.03776365891098976, "step": 21200 }, { "epoch": 7.071380920613742, "loss": 0.312997043132782, "step": 21200 }, { "ce_loss": 0.01290181651711464, "epoch": 7.071380920613742, "step": 21200 }, { "distill_loss": 0.20035392045974731, "epoch": 7.071380920613742, "step": 21200 }, { "epoch": 7.071380920613742, "ref_ce_loss": 0.060398805886507034, "step": 21200 }, { "epoch": 7.0747164776517675, "loss": 0.3435, "step": 21210 }, { "epoch": 7.0747164776517675, "grad_norm": 2.9420666694641113, "step": 21210 }, { "epoch": 7.0747164776517675, "learning_rate": 1.0402088810000237e-05, "step": 21210 }, { "epoch": 7.0747164776517675, "loss": 0.1917342245578766, "step": 21210 }, { "ce_loss": 0.007838577963411808, "epoch": 7.0747164776517675, "step": 21210 }, { "distill_loss": 0.1391885131597519, "epoch": 7.0747164776517675, "step": 21210 }, { "epoch": 7.0747164776517675, "ref_ce_loss": 0.026517007499933243, "step": 21210 }, { "epoch": 7.0747164776517675, "loss": 0.2319009006023407, "step": 21210 }, { "ce_loss": 0.013528701849281788, "epoch": 7.0747164776517675, "step": 21210 }, { "distill_loss": 0.16961170732975006, "epoch": 7.0747164776517675, "step": 21210 }, { "epoch": 7.0747164776517675, "ref_ce_loss": 0.04853387549519539, "step": 21210 }, { "epoch": 7.0747164776517675, "loss": 0.4764244556427002, "step": 21210 }, { "ce_loss": 0.020892919972538948, "epoch": 7.0747164776517675, "step": 21210 }, { "distill_loss": 0.2465275526046753, "epoch": 7.0747164776517675, "step": 21210 }, { "epoch": 7.0747164776517675, "ref_ce_loss": 0.05295209959149361, "step": 21210 }, { "epoch": 7.0747164776517675, "loss": 0.25967442989349365, "step": 21210 }, { "ce_loss": 0.0026893108151853085, "epoch": 7.0747164776517675, "step": 21210 }, { "distill_loss": 0.13870014250278473, "epoch": 7.0747164776517675, "step": 21210 }, { "epoch": 7.0747164776517675, "ref_ce_loss": 0.04126835614442825, "step": 21210 }, { "epoch": 7.078052034689793, "loss": 0.3146, "step": 21220 }, { "epoch": 7.078052034689793, "grad_norm": 3.195159435272217, "step": 21220 }, { "epoch": 7.078052034689793, "learning_rate": 1.0328098155338189e-05, "step": 21220 }, { "epoch": 7.078052034689793, "loss": 0.3102601170539856, "step": 21220 }, { "ce_loss": 0.02565469965338707, "epoch": 7.078052034689793, "step": 21220 }, { "distill_loss": 0.16854459047317505, "epoch": 7.078052034689793, "step": 21220 }, { "epoch": 7.078052034689793, "ref_ce_loss": 0.07197984308004379, "step": 21220 }, { "epoch": 7.078052034689793, "loss": 0.3003503084182739, "step": 21220 }, { "ce_loss": 0.08204754441976547, "epoch": 7.078052034689793, "step": 21220 }, { "distill_loss": 0.17015287280082703, "epoch": 7.078052034689793, "step": 21220 }, { "epoch": 7.078052034689793, "ref_ce_loss": 0.047961872071027756, "step": 21220 }, { "epoch": 7.078052034689793, "loss": 0.24385954439640045, "step": 21220 }, { "ce_loss": 0.03164820745587349, "epoch": 7.078052034689793, "step": 21220 }, { "distill_loss": 0.1497935950756073, "epoch": 7.078052034689793, "step": 21220 }, { "epoch": 7.078052034689793, "ref_ce_loss": 0.04439567029476166, "step": 21220 }, { "epoch": 7.078052034689793, "loss": 0.29692065715789795, "step": 21220 }, { "ce_loss": 0.025501569733023643, "epoch": 7.078052034689793, "step": 21220 }, { "distill_loss": 0.19459682703018188, "epoch": 7.078052034689793, "step": 21220 }, { "epoch": 7.078052034689793, "ref_ce_loss": 0.05867702141404152, "step": 21220 }, { "epoch": 7.081387591727818, "loss": 0.3179, "step": 21230 }, { "epoch": 7.081387591727818, "grad_norm": 3.1232728958129883, "step": 21230 }, { "epoch": 7.081387591727818, "learning_rate": 1.0254362207129035e-05, "step": 21230 }, { "epoch": 7.081387591727818, "loss": 0.2356388121843338, "step": 21230 }, { "ce_loss": 0.035304874181747437, "epoch": 7.081387591727818, "step": 21230 }, { "distill_loss": 0.136562317609787, "epoch": 7.081387591727818, "step": 21230 }, { "epoch": 7.081387591727818, "ref_ce_loss": 0.06360061466693878, "step": 21230 }, { "epoch": 7.081387591727818, "loss": 0.3611792027950287, "step": 21230 }, { "ce_loss": 0.04316425696015358, "epoch": 7.081387591727818, "step": 21230 }, { "distill_loss": 0.14863713085651398, "epoch": 7.081387591727818, "step": 21230 }, { "epoch": 7.081387591727818, "ref_ce_loss": 0.031219588592648506, "step": 21230 }, { "epoch": 7.081387591727818, "loss": 0.27983278036117554, "step": 21230 }, { "ce_loss": 0.026451464742422104, "epoch": 7.081387591727818, "step": 21230 }, { "distill_loss": 0.20756125450134277, "epoch": 7.081387591727818, "step": 21230 }, { "epoch": 7.081387591727818, "ref_ce_loss": 0.04571819305419922, "step": 21230 }, { "epoch": 7.081387591727818, "loss": 0.3777647912502289, "step": 21230 }, { "ce_loss": 0.009428311139345169, "epoch": 7.081387591727818, "step": 21230 }, { "distill_loss": 0.21531248092651367, "epoch": 7.081387591727818, "step": 21230 }, { "epoch": 7.081387591727818, "ref_ce_loss": 0.040740445256233215, "step": 21230 }, { "epoch": 7.0847231487658435, "loss": 0.3516, "step": 21240 }, { "epoch": 7.0847231487658435, "grad_norm": 4.481757164001465, "step": 21240 }, { "epoch": 7.0847231487658435, "learning_rate": 1.0180881099838067e-05, "step": 21240 }, { "epoch": 7.0847231487658435, "loss": 0.24451516568660736, "step": 21240 }, { "ce_loss": 0.016348013654351234, "epoch": 7.0847231487658435, "step": 21240 }, { "distill_loss": 0.194583460688591, "epoch": 7.0847231487658435, "step": 21240 }, { "epoch": 7.0847231487658435, "ref_ce_loss": 0.0334811732172966, "step": 21240 }, { "epoch": 7.0847231487658435, "loss": 0.20874151587486267, "step": 21240 }, { "ce_loss": 0.03042174130678177, "epoch": 7.0847231487658435, "step": 21240 }, { "distill_loss": 0.13220363855361938, "epoch": 7.0847231487658435, "step": 21240 }, { "epoch": 7.0847231487658435, "ref_ce_loss": 0.03144841641187668, "step": 21240 }, { "epoch": 7.0847231487658435, "loss": 0.4618191719055176, "step": 21240 }, { "ce_loss": 0.04640379920601845, "epoch": 7.0847231487658435, "step": 21240 }, { "distill_loss": 0.2875906229019165, "epoch": 7.0847231487658435, "step": 21240 }, { "epoch": 7.0847231487658435, "ref_ce_loss": 0.08858513832092285, "step": 21240 }, { "epoch": 7.0847231487658435, "loss": 0.25335830450057983, "step": 21240 }, { "ce_loss": 0.014944043010473251, "epoch": 7.0847231487658435, "step": 21240 }, { "distill_loss": 0.16305804252624512, "epoch": 7.0847231487658435, "step": 21240 }, { "epoch": 7.0847231487658435, "ref_ce_loss": 0.04924383386969566, "step": 21240 }, { "epoch": 7.088058705803869, "loss": 0.3344, "step": 21250 }, { "epoch": 7.088058705803869, "grad_norm": 6.538269996643066, "step": 21250 }, { "epoch": 7.088058705803869, "learning_rate": 1.0107654967465844e-05, "step": 21250 }, { "epoch": 7.088058705803869, "loss": 0.35251274704933167, "step": 21250 }, { "ce_loss": 0.01282446552067995, "epoch": 7.088058705803869, "step": 21250 }, { "distill_loss": 0.16358213126659393, "epoch": 7.088058705803869, "step": 21250 }, { "epoch": 7.088058705803869, "ref_ce_loss": 0.04842758923768997, "step": 21250 }, { "epoch": 7.088058705803869, "loss": 0.15011943876743317, "step": 21250 }, { "ce_loss": 0.00510772131383419, "epoch": 7.088058705803869, "step": 21250 }, { "distill_loss": 0.1163831353187561, "epoch": 7.088058705803869, "step": 21250 }, { "epoch": 7.088058705803869, "ref_ce_loss": 0.028515568003058434, "step": 21250 }, { "epoch": 7.088058705803869, "loss": 0.2913273274898529, "step": 21250 }, { "ce_loss": 0.04327051341533661, "epoch": 7.088058705803869, "step": 21250 }, { "distill_loss": 0.20849546790122986, "epoch": 7.088058705803869, "step": 21250 }, { "epoch": 7.088058705803869, "ref_ce_loss": 0.03915373235940933, "step": 21250 }, { "epoch": 7.088058705803869, "loss": 0.3624514937400818, "step": 21250 }, { "ce_loss": 0.054993703961372375, "epoch": 7.088058705803869, "step": 21250 }, { "distill_loss": 0.22235330939292908, "epoch": 7.088058705803869, "step": 21250 }, { "epoch": 7.088058705803869, "ref_ce_loss": 0.059161581099033356, "step": 21250 }, { "epoch": 7.091394262841894, "loss": 0.3363, "step": 21260 }, { "epoch": 7.091394262841894, "grad_norm": 2.7685844898223877, "step": 21260 }, { "epoch": 7.091394262841894, "learning_rate": 1.0034683943547916e-05, "step": 21260 }, { "epoch": 7.091394262841894, "loss": 0.2722393870353699, "step": 21260 }, { "ce_loss": 0.01598196104168892, "epoch": 7.091394262841894, "step": 21260 }, { "distill_loss": 0.19478975236415863, "epoch": 7.091394262841894, "step": 21260 }, { "epoch": 7.091394262841894, "ref_ce_loss": 0.044273532927036285, "step": 21260 }, { "epoch": 7.091394262841894, "loss": 0.5252133011817932, "step": 21260 }, { "ce_loss": 0.013084584847092628, "epoch": 7.091394262841894, "step": 21260 }, { "distill_loss": 0.2927495837211609, "epoch": 7.091394262841894, "step": 21260 }, { "epoch": 7.091394262841894, "ref_ce_loss": 0.0654245987534523, "step": 21260 }, { "epoch": 7.091394262841894, "loss": 0.3494012951850891, "step": 21260 }, { "ce_loss": 0.030542615801095963, "epoch": 7.091394262841894, "step": 21260 }, { "distill_loss": 0.21779635548591614, "epoch": 7.091394262841894, "step": 21260 }, { "epoch": 7.091394262841894, "ref_ce_loss": 0.04543349891901016, "step": 21260 }, { "epoch": 7.091394262841894, "loss": 0.2055548131465912, "step": 21260 }, { "ce_loss": 0.004060425795614719, "epoch": 7.091394262841894, "step": 21260 }, { "distill_loss": 0.16809694468975067, "epoch": 7.091394262841894, "step": 21260 }, { "epoch": 7.091394262841894, "ref_ce_loss": 0.021780950948596, "step": 21260 }, { "epoch": 7.09472981987992, "loss": 0.3662, "step": 21270 }, { "epoch": 7.09472981987992, "grad_norm": 5.921723365783691, "step": 21270 }, { "epoch": 7.09472981987992, "learning_rate": 9.961968161154653e-06, "step": 21270 }, { "epoch": 7.09472981987992, "loss": 0.6266645193099976, "step": 21270 }, { "ce_loss": 0.06745955348014832, "epoch": 7.09472981987992, "step": 21270 }, { "distill_loss": 0.3753885328769684, "epoch": 7.09472981987992, "step": 21270 }, { "epoch": 7.09472981987992, "ref_ce_loss": 0.07958387583494186, "step": 21270 }, { "epoch": 7.09472981987992, "loss": 0.21966169774532318, "step": 21270 }, { "ce_loss": 0.04526279494166374, "epoch": 7.09472981987992, "step": 21270 }, { "distill_loss": 0.15075144171714783, "epoch": 7.09472981987992, "step": 21270 }, { "epoch": 7.09472981987992, "ref_ce_loss": 0.023584889248013496, "step": 21270 }, { "epoch": 7.09472981987992, "loss": 0.2990386188030243, "step": 21270 }, { "ce_loss": 0.020705245435237885, "epoch": 7.09472981987992, "step": 21270 }, { "distill_loss": 0.20941773056983948, "epoch": 7.09472981987992, "step": 21270 }, { "epoch": 7.09472981987992, "ref_ce_loss": 0.05655130743980408, "step": 21270 }, { "epoch": 7.09472981987992, "loss": 0.2656441628932953, "step": 21270 }, { "ce_loss": 0.01828238181769848, "epoch": 7.09472981987992, "step": 21270 }, { "distill_loss": 0.19571369886398315, "epoch": 7.09472981987992, "step": 21270 }, { "epoch": 7.09472981987992, "ref_ce_loss": 0.05159137025475502, "step": 21270 }, { "epoch": 7.098065376917945, "loss": 0.3966, "step": 21280 }, { "epoch": 7.098065376917945, "grad_norm": 3.6227405071258545, "step": 21280 }, { "epoch": 7.098065376917945, "learning_rate": 9.889507752891019e-06, "step": 21280 }, { "epoch": 7.098065376917945, "loss": 0.3899279236793518, "step": 21280 }, { "ce_loss": 0.04282451048493385, "epoch": 7.098065376917945, "step": 21280 }, { "distill_loss": 0.2592725455760956, "epoch": 7.098065376917945, "step": 21280 }, { "epoch": 7.098065376917945, "ref_ce_loss": 0.058857910335063934, "step": 21280 }, { "epoch": 7.098065376917945, "loss": 0.2215050756931305, "step": 21280 }, { "ce_loss": 0.026519374921917915, "epoch": 7.098065376917945, "step": 21280 }, { "distill_loss": 0.16466079652309418, "epoch": 7.098065376917945, "step": 21280 }, { "epoch": 7.098065376917945, "ref_ce_loss": 0.02995748445391655, "step": 21280 }, { "epoch": 7.098065376917945, "loss": 0.3635097146034241, "step": 21280 }, { "ce_loss": 0.0383116714656353, "epoch": 7.098065376917945, "step": 21280 }, { "distill_loss": 0.14281782507896423, "epoch": 7.098065376917945, "step": 21280 }, { "epoch": 7.098065376917945, "ref_ce_loss": 0.05979880690574646, "step": 21280 }, { "epoch": 7.098065376917945, "loss": 0.6661757230758667, "step": 21280 }, { "ce_loss": 0.008801848627626896, "epoch": 7.098065376917945, "step": 21280 }, { "distill_loss": 0.32869842648506165, "epoch": 7.098065376917945, "step": 21280 }, { "epoch": 7.098065376917945, "ref_ce_loss": 0.048422083258628845, "step": 21280 }, { "epoch": 7.10140093395597, "loss": 0.3354, "step": 21290 }, { "epoch": 7.10140093395597, "grad_norm": 3.7059237957000732, "step": 21290 }, { "epoch": 7.10140093395597, "learning_rate": 9.817302850896092e-06, "step": 21290 }, { "epoch": 7.10140093395597, "loss": 0.7997819185256958, "step": 21290 }, { "ce_loss": 0.057272523641586304, "epoch": 7.10140093395597, "step": 21290 }, { "distill_loss": 0.3218955993652344, "epoch": 7.10140093395597, "step": 21290 }, { "epoch": 7.10140093395597, "ref_ce_loss": 0.06255380809307098, "step": 21290 }, { "epoch": 7.10140093395597, "loss": 0.7139435410499573, "step": 21290 }, { "ce_loss": 0.084070585668087, "epoch": 7.10140093395597, "step": 21290 }, { "distill_loss": 0.23481443524360657, "epoch": 7.10140093395597, "step": 21290 }, { "epoch": 7.10140093395597, "ref_ce_loss": 0.07495547086000443, "step": 21290 }, { "epoch": 7.10140093395597, "loss": 0.35811299085617065, "step": 21290 }, { "ce_loss": 0.031879205256700516, "epoch": 7.10140093395597, "step": 21290 }, { "distill_loss": 0.13502013683319092, "epoch": 7.10140093395597, "step": 21290 }, { "epoch": 7.10140093395597, "ref_ce_loss": 0.052764344960451126, "step": 21290 }, { "epoch": 7.10140093395597, "loss": 0.20150531828403473, "step": 21290 }, { "ce_loss": 0.00755065493285656, "epoch": 7.10140093395597, "step": 21290 }, { "distill_loss": 0.151234433054924, "epoch": 7.10140093395597, "step": 21290 }, { "epoch": 7.10140093395597, "ref_ce_loss": 0.04256344586610794, "step": 21290 }, { "epoch": 7.104736490993996, "loss": 0.3325, "step": 21300 }, { "epoch": 7.104736490993996, "grad_norm": 3.5701239109039307, "step": 21300 }, { "epoch": 7.104736490993996, "learning_rate": 9.74535358684323e-06, "step": 21300 }, { "epoch": 7.104736490993996, "loss": 0.24100618064403534, "step": 21300 }, { "ce_loss": 0.026463542133569717, "epoch": 7.104736490993996, "step": 21300 }, { "distill_loss": 0.16529029607772827, "epoch": 7.104736490993996, "step": 21300 }, { "epoch": 7.104736490993996, "ref_ce_loss": 0.04909892380237579, "step": 21300 }, { "epoch": 7.104736490993996, "loss": 0.2727445960044861, "step": 21300 }, { "ce_loss": 0.022189678624272346, "epoch": 7.104736490993996, "step": 21300 }, { "distill_loss": 0.18847636878490448, "epoch": 7.104736490993996, "step": 21300 }, { "epoch": 7.104736490993996, "ref_ce_loss": 0.04147105664014816, "step": 21300 }, { "epoch": 7.104736490993996, "loss": 0.2807205021381378, "step": 21300 }, { "ce_loss": 0.0023678650613874197, "epoch": 7.104736490993996, "step": 21300 }, { "distill_loss": 0.19518640637397766, "epoch": 7.104736490993996, "step": 21300 }, { "epoch": 7.104736490993996, "ref_ce_loss": 0.06072551757097244, "step": 21300 }, { "epoch": 7.104736490993996, "loss": 0.26328280568122864, "step": 21300 }, { "ce_loss": 0.03507067635655403, "epoch": 7.104736490993996, "step": 21300 }, { "distill_loss": 0.15676164627075195, "epoch": 7.104736490993996, "step": 21300 }, { "epoch": 7.104736490993996, "ref_ce_loss": 0.041791219264268875, "step": 21300 }, { "epoch": 7.108072048032021, "loss": 0.3332, "step": 21310 }, { "epoch": 7.108072048032021, "grad_norm": 3.480285167694092, "step": 21310 }, { "epoch": 7.108072048032021, "learning_rate": 9.673660091939512e-06, "step": 21310 }, { "epoch": 7.108072048032021, "loss": 0.2976704239845276, "step": 21310 }, { "ce_loss": 0.03696581348776817, "epoch": 7.108072048032021, "step": 21310 }, { "distill_loss": 0.20639725029468536, "epoch": 7.108072048032021, "step": 21310 }, { "epoch": 7.108072048032021, "ref_ce_loss": 0.038472164422273636, "step": 21310 }, { "epoch": 7.108072048032021, "loss": 0.4582396149635315, "step": 21310 }, { "ce_loss": 0.032093822956085205, "epoch": 7.108072048032021, "step": 21310 }, { "distill_loss": 0.20050117373466492, "epoch": 7.108072048032021, "step": 21310 }, { "epoch": 7.108072048032021, "ref_ce_loss": 0.05842212215065956, "step": 21310 }, { "epoch": 7.108072048032021, "loss": 0.24652057886123657, "step": 21310 }, { "ce_loss": 0.032380156219005585, "epoch": 7.108072048032021, "step": 21310 }, { "distill_loss": 0.16446007788181305, "epoch": 7.108072048032021, "step": 21310 }, { "epoch": 7.108072048032021, "ref_ce_loss": 0.04951951652765274, "step": 21310 }, { "epoch": 7.108072048032021, "loss": 0.4571993947029114, "step": 21310 }, { "ce_loss": 0.00208325800485909, "epoch": 7.108072048032021, "step": 21310 }, { "distill_loss": 0.2922826409339905, "epoch": 7.108072048032021, "step": 21310 }, { "epoch": 7.108072048032021, "ref_ce_loss": 0.044417742639780045, "step": 21310 }, { "epoch": 7.111407605070046, "loss": 0.3539, "step": 21320 }, { "epoch": 7.111407605070046, "grad_norm": 3.551114082336426, "step": 21320 }, { "epoch": 7.111407605070046, "learning_rate": 9.602222496925537e-06, "step": 21320 }, { "epoch": 7.111407605070046, "loss": 0.24439047276973724, "step": 21320 }, { "ce_loss": 0.010866631753742695, "epoch": 7.111407605070046, "step": 21320 }, { "distill_loss": 0.17475149035453796, "epoch": 7.111407605070046, "step": 21320 }, { "epoch": 7.111407605070046, "ref_ce_loss": 0.029197681695222855, "step": 21320 }, { "epoch": 7.111407605070046, "loss": 0.17844556272029877, "step": 21320 }, { "ce_loss": 0.0042810384184122086, "epoch": 7.111407605070046, "step": 21320 }, { "distill_loss": 0.14426660537719727, "epoch": 7.111407605070046, "step": 21320 }, { "epoch": 7.111407605070046, "ref_ce_loss": 0.029594624415040016, "step": 21320 }, { "epoch": 7.111407605070046, "loss": 0.2958901524543762, "step": 21320 }, { "ce_loss": 0.011121377348899841, "epoch": 7.111407605070046, "step": 21320 }, { "distill_loss": 0.16819752752780914, "epoch": 7.111407605070046, "step": 21320 }, { "epoch": 7.111407605070046, "ref_ce_loss": 0.041761115193367004, "step": 21320 }, { "epoch": 7.111407605070046, "loss": 0.3758298456668854, "step": 21320 }, { "ce_loss": 0.014851349405944347, "epoch": 7.111407605070046, "step": 21320 }, { "distill_loss": 0.26843661069869995, "epoch": 7.111407605070046, "step": 21320 }, { "epoch": 7.111407605070046, "ref_ce_loss": 0.06300424784421921, "step": 21320 }, { "epoch": 7.114743162108072, "loss": 0.3297, "step": 21330 }, { "epoch": 7.114743162108072, "grad_norm": 2.231074571609497, "step": 21330 }, { "epoch": 7.114743162108072, "learning_rate": 9.531040932075352e-06, "step": 21330 }, { "epoch": 7.114743162108072, "loss": 0.1900641918182373, "step": 21330 }, { "ce_loss": 0.0005149010685272515, "epoch": 7.114743162108072, "step": 21330 }, { "distill_loss": 0.11143720895051956, "epoch": 7.114743162108072, "step": 21330 }, { "epoch": 7.114743162108072, "ref_ce_loss": 0.029446803033351898, "step": 21330 }, { "epoch": 7.114743162108072, "loss": 0.3922143876552582, "step": 21330 }, { "ce_loss": 0.016270199790596962, "epoch": 7.114743162108072, "step": 21330 }, { "distill_loss": 0.24628299474716187, "epoch": 7.114743162108072, "step": 21330 }, { "epoch": 7.114743162108072, "ref_ce_loss": 0.04830366000533104, "step": 21330 }, { "epoch": 7.114743162108072, "loss": 0.49055543541908264, "step": 21330 }, { "ce_loss": 0.04130590334534645, "epoch": 7.114743162108072, "step": 21330 }, { "distill_loss": 0.3618186116218567, "epoch": 7.114743162108072, "step": 21330 }, { "epoch": 7.114743162108072, "ref_ce_loss": 0.06838495284318924, "step": 21330 }, { "epoch": 7.114743162108072, "loss": 0.20764322578907013, "step": 21330 }, { "ce_loss": 0.005058347247540951, "epoch": 7.114743162108072, "step": 21330 }, { "distill_loss": 0.14182953536510468, "epoch": 7.114743162108072, "step": 21330 }, { "epoch": 7.114743162108072, "ref_ce_loss": 0.04315049946308136, "step": 21330 }, { "epoch": 7.118078719146097, "loss": 0.3255, "step": 21340 }, { "epoch": 7.118078719146097, "grad_norm": 3.8594162464141846, "step": 21340 }, { "epoch": 7.118078719146097, "learning_rate": 9.460115527195999e-06, "step": 21340 }, { "epoch": 7.118078719146097, "loss": 0.7007927894592285, "step": 21340 }, { "ce_loss": 0.07202707231044769, "epoch": 7.118078719146097, "step": 21340 }, { "distill_loss": 0.16455012559890747, "epoch": 7.118078719146097, "step": 21340 }, { "epoch": 7.118078719146097, "ref_ce_loss": 0.06058318912982941, "step": 21340 }, { "epoch": 7.118078719146097, "loss": 0.22472164034843445, "step": 21340 }, { "ce_loss": 0.013528671115636826, "epoch": 7.118078719146097, "step": 21340 }, { "distill_loss": 0.1718527376651764, "epoch": 7.118078719146097, "step": 21340 }, { "epoch": 7.118078719146097, "ref_ce_loss": 0.0391010157763958, "step": 21340 }, { "epoch": 7.118078719146097, "loss": 0.22745491564273834, "step": 21340 }, { "ce_loss": 0.02853975258767605, "epoch": 7.118078719146097, "step": 21340 }, { "distill_loss": 0.1669045090675354, "epoch": 7.118078719146097, "step": 21340 }, { "epoch": 7.118078719146097, "ref_ce_loss": 0.03188779205083847, "step": 21340 }, { "epoch": 7.118078719146097, "loss": 0.30058276653289795, "step": 21340 }, { "ce_loss": 0.05709119886159897, "epoch": 7.118078719146097, "step": 21340 }, { "distill_loss": 0.18364207446575165, "epoch": 7.118078719146097, "step": 21340 }, { "epoch": 7.118078719146097, "ref_ce_loss": 0.044599033892154694, "step": 21340 }, { "epoch": 7.121414276184122, "loss": 0.2999, "step": 21350 }, { "epoch": 7.121414276184122, "grad_norm": 2.430203437805176, "step": 21350 }, { "epoch": 7.121414276184122, "learning_rate": 9.389446411627439e-06, "step": 21350 }, { "epoch": 7.121414276184122, "loss": 0.4514169991016388, "step": 21350 }, { "ce_loss": 0.04752679169178009, "epoch": 7.121414276184122, "step": 21350 }, { "distill_loss": 0.13268008828163147, "epoch": 7.121414276184122, "step": 21350 }, { "epoch": 7.121414276184122, "ref_ce_loss": 0.04275263473391533, "step": 21350 }, { "epoch": 7.121414276184122, "loss": 0.37929394841194153, "step": 21350 }, { "ce_loss": 0.045660898089408875, "epoch": 7.121414276184122, "step": 21350 }, { "distill_loss": 0.19815610349178314, "epoch": 7.121414276184122, "step": 21350 }, { "epoch": 7.121414276184122, "ref_ce_loss": 0.05834659934043884, "step": 21350 }, { "epoch": 7.121414276184122, "loss": 0.248044952750206, "step": 21350 }, { "ce_loss": 0.01318468526005745, "epoch": 7.121414276184122, "step": 21350 }, { "distill_loss": 0.16702339053153992, "epoch": 7.121414276184122, "step": 21350 }, { "epoch": 7.121414276184122, "ref_ce_loss": 0.0383535698056221, "step": 21350 }, { "epoch": 7.121414276184122, "loss": 0.23754586279392242, "step": 21350 }, { "ce_loss": 0.015385741367936134, "epoch": 7.121414276184122, "step": 21350 }, { "distill_loss": 0.16478316485881805, "epoch": 7.121414276184122, "step": 21350 }, { "epoch": 7.121414276184122, "ref_ce_loss": 0.03582388535141945, "step": 21350 }, { "epoch": 7.124749833222148, "loss": 0.3619, "step": 21360 }, { "epoch": 7.124749833222148, "grad_norm": 4.065296173095703, "step": 21360 }, { "epoch": 7.124749833222148, "learning_rate": 9.319033714242347e-06, "step": 21360 }, { "epoch": 7.124749833222148, "loss": 0.2339756339788437, "step": 21360 }, { "ce_loss": 0.014882135204970837, "epoch": 7.124749833222148, "step": 21360 }, { "distill_loss": 0.16815043985843658, "epoch": 7.124749833222148, "step": 21360 }, { "epoch": 7.124749833222148, "ref_ce_loss": 0.03089192323386669, "step": 21360 }, { "epoch": 7.124749833222148, "loss": 0.4349558353424072, "step": 21360 }, { "ce_loss": 0.026239456608891487, "epoch": 7.124749833222148, "step": 21360 }, { "distill_loss": 0.2093418538570404, "epoch": 7.124749833222148, "step": 21360 }, { "epoch": 7.124749833222148, "ref_ce_loss": 0.07308489829301834, "step": 21360 }, { "epoch": 7.124749833222148, "loss": 0.313965380191803, "step": 21360 }, { "ce_loss": 0.031080765649676323, "epoch": 7.124749833222148, "step": 21360 }, { "distill_loss": 0.19044944643974304, "epoch": 7.124749833222148, "step": 21360 }, { "epoch": 7.124749833222148, "ref_ce_loss": 0.050029199570417404, "step": 21360 }, { "epoch": 7.124749833222148, "loss": 0.2853374183177948, "step": 21360 }, { "ce_loss": 0.01889471895992756, "epoch": 7.124749833222148, "step": 21360 }, { "distill_loss": 0.14720062911510468, "epoch": 7.124749833222148, "step": 21360 }, { "epoch": 7.124749833222148, "ref_ce_loss": 0.044718701392412186, "step": 21360 }, { "epoch": 7.128085390260173, "loss": 0.3224, "step": 21370 }, { "epoch": 7.128085390260173, "grad_norm": 2.7422754764556885, "step": 21370 }, { "epoch": 7.128085390260173, "learning_rate": 9.248877563445611e-06, "step": 21370 }, { "epoch": 7.128085390260173, "loss": 0.3476315140724182, "step": 21370 }, { "ce_loss": 0.06209288164973259, "epoch": 7.128085390260173, "step": 21370 }, { "distill_loss": 0.2215966433286667, "epoch": 7.128085390260173, "step": 21370 }, { "epoch": 7.128085390260173, "ref_ce_loss": 0.04940538853406906, "step": 21370 }, { "epoch": 7.128085390260173, "loss": 0.30568182468414307, "step": 21370 }, { "ce_loss": 0.06676947325468063, "epoch": 7.128085390260173, "step": 21370 }, { "distill_loss": 0.13683325052261353, "epoch": 7.128085390260173, "step": 21370 }, { "epoch": 7.128085390260173, "ref_ce_loss": 0.0813133716583252, "step": 21370 }, { "epoch": 7.128085390260173, "loss": 0.2064388394355774, "step": 21370 }, { "ce_loss": 0.024937504902482033, "epoch": 7.128085390260173, "step": 21370 }, { "distill_loss": 0.09930000454187393, "epoch": 7.128085390260173, "step": 21370 }, { "epoch": 7.128085390260173, "ref_ce_loss": 0.058826789259910583, "step": 21370 }, { "epoch": 7.128085390260173, "loss": 0.3290785849094391, "step": 21370 }, { "ce_loss": 0.02121903747320175, "epoch": 7.128085390260173, "step": 21370 }, { "distill_loss": 0.2270878106355667, "epoch": 7.128085390260173, "step": 21370 }, { "epoch": 7.128085390260173, "ref_ce_loss": 0.031229302287101746, "step": 21370 }, { "epoch": 7.131420947298198, "loss": 0.3363, "step": 21380 }, { "epoch": 7.131420947298198, "grad_norm": 3.2990729808807373, "step": 21380 }, { "epoch": 7.131420947298198, "learning_rate": 9.178978087174426e-06, "step": 21380 }, { "epoch": 7.131420947298198, "loss": 0.27483171224594116, "step": 21380 }, { "ce_loss": 0.0506613627076149, "epoch": 7.131420947298198, "step": 21380 }, { "distill_loss": 0.1558806151151657, "epoch": 7.131420947298198, "step": 21380 }, { "epoch": 7.131420947298198, "ref_ce_loss": 0.04856330528855324, "step": 21380 }, { "epoch": 7.131420947298198, "loss": 0.43359971046447754, "step": 21380 }, { "ce_loss": 0.0775969922542572, "epoch": 7.131420947298198, "step": 21380 }, { "distill_loss": 0.26211804151535034, "epoch": 7.131420947298198, "step": 21380 }, { "epoch": 7.131420947298198, "ref_ce_loss": 0.06571530550718307, "step": 21380 }, { "epoch": 7.131420947298198, "loss": 0.3924413323402405, "step": 21380 }, { "ce_loss": 0.00827367790043354, "epoch": 7.131420947298198, "step": 21380 }, { "distill_loss": 0.18567365407943726, "epoch": 7.131420947298198, "step": 21380 }, { "epoch": 7.131420947298198, "ref_ce_loss": 0.03840828314423561, "step": 21380 }, { "epoch": 7.131420947298198, "loss": 0.3229596018791199, "step": 21380 }, { "ce_loss": 0.030836794525384903, "epoch": 7.131420947298198, "step": 21380 }, { "distill_loss": 0.20113810896873474, "epoch": 7.131420947298198, "step": 21380 }, { "epoch": 7.131420947298198, "ref_ce_loss": 0.04698263481259346, "step": 21380 }, { "epoch": 7.134756504336224, "loss": 0.3376, "step": 21390 }, { "epoch": 7.134756504336224, "grad_norm": 4.786271572113037, "step": 21390 }, { "epoch": 7.134756504336224, "learning_rate": 9.109335412897845e-06, "step": 21390 }, { "epoch": 7.134756504336224, "loss": 0.5202345252037048, "step": 21390 }, { "ce_loss": 0.05993020907044411, "epoch": 7.134756504336224, "step": 21390 }, { "distill_loss": 0.1180260181427002, "epoch": 7.134756504336224, "step": 21390 }, { "epoch": 7.134756504336224, "ref_ce_loss": 0.0760614275932312, "step": 21390 }, { "epoch": 7.134756504336224, "loss": 0.2719026207923889, "step": 21390 }, { "ce_loss": 0.042188555002212524, "epoch": 7.134756504336224, "step": 21390 }, { "distill_loss": 0.11038866639137268, "epoch": 7.134756504336224, "step": 21390 }, { "epoch": 7.134756504336224, "ref_ce_loss": 0.04938900098204613, "step": 21390 }, { "epoch": 7.134756504336224, "loss": 0.6805160641670227, "step": 21390 }, { "ce_loss": 0.027744436636567116, "epoch": 7.134756504336224, "step": 21390 }, { "distill_loss": 0.17196469008922577, "epoch": 7.134756504336224, "step": 21390 }, { "epoch": 7.134756504336224, "ref_ce_loss": 0.05967358127236366, "step": 21390 }, { "epoch": 7.134756504336224, "loss": 0.33298105001449585, "step": 21390 }, { "ce_loss": 0.047904592007398605, "epoch": 7.134756504336224, "step": 21390 }, { "distill_loss": 0.18573933839797974, "epoch": 7.134756504336224, "step": 21390 }, { "epoch": 7.134756504336224, "ref_ce_loss": 0.0671057403087616, "step": 21390 }, { "epoch": 7.138092061374249, "loss": 0.3493, "step": 21400 }, { "epoch": 7.138092061374249, "grad_norm": 5.421943664550781, "step": 21400 }, { "epoch": 7.138092061374249, "learning_rate": 9.039949667616641e-06, "step": 21400 }, { "epoch": 7.138092061374249, "loss": 0.48561030626296997, "step": 21400 }, { "ce_loss": 0.03090047463774681, "epoch": 7.138092061374249, "step": 21400 }, { "distill_loss": 0.32706791162490845, "epoch": 7.138092061374249, "step": 21400 }, { "epoch": 7.138092061374249, "ref_ce_loss": 0.07010676711797714, "step": 21400 }, { "epoch": 7.138092061374249, "loss": 0.3397560119628906, "step": 21400 }, { "ce_loss": 0.03334207087755203, "epoch": 7.138092061374249, "step": 21400 }, { "distill_loss": 0.22810399532318115, "epoch": 7.138092061374249, "step": 21400 }, { "epoch": 7.138092061374249, "ref_ce_loss": 0.05727453902363777, "step": 21400 }, { "epoch": 7.138092061374249, "loss": 0.2319808006286621, "step": 21400 }, { "ce_loss": 0.024934319779276848, "epoch": 7.138092061374249, "step": 21400 }, { "distill_loss": 0.13915522396564484, "epoch": 7.138092061374249, "step": 21400 }, { "epoch": 7.138092061374249, "ref_ce_loss": 0.03444851189851761, "step": 21400 }, { "epoch": 7.138092061374249, "loss": 0.3142017424106598, "step": 21400 }, { "ce_loss": 0.009186441078782082, "epoch": 7.138092061374249, "step": 21400 }, { "distill_loss": 0.220073401927948, "epoch": 7.138092061374249, "step": 21400 }, { "epoch": 7.138092061374249, "ref_ce_loss": 0.05729452520608902, "step": 21400 }, { "epoch": 7.1414276184122745, "loss": 0.3489, "step": 21410 }, { "epoch": 7.1414276184122745, "grad_norm": 2.15388560295105, "step": 21410 }, { "epoch": 7.1414276184122745, "learning_rate": 8.970820977863019e-06, "step": 21410 }, { "epoch": 7.1414276184122745, "loss": 0.1508547067642212, "step": 21410 }, { "ce_loss": 0.01235622726380825, "epoch": 7.1414276184122745, "step": 21410 }, { "distill_loss": 0.09108994156122208, "epoch": 7.1414276184122745, "step": 21410 }, { "epoch": 7.1414276184122745, "ref_ce_loss": 0.03690817207098007, "step": 21410 }, { "epoch": 7.1414276184122745, "loss": 0.5053752660751343, "step": 21410 }, { "ce_loss": 0.016127459704875946, "epoch": 7.1414276184122745, "step": 21410 }, { "distill_loss": 0.14929452538490295, "epoch": 7.1414276184122745, "step": 21410 }, { "epoch": 7.1414276184122745, "ref_ce_loss": 0.018189487978816032, "step": 21410 }, { "epoch": 7.1414276184122745, "loss": 0.3429606258869171, "step": 21410 }, { "ce_loss": 0.017136679962277412, "epoch": 7.1414276184122745, "step": 21410 }, { "distill_loss": 0.2520356774330139, "epoch": 7.1414276184122745, "step": 21410 }, { "epoch": 7.1414276184122745, "ref_ce_loss": 0.047066353261470795, "step": 21410 }, { "epoch": 7.1414276184122745, "loss": 0.3544640839099884, "step": 21410 }, { "ce_loss": 0.027670860290527344, "epoch": 7.1414276184122745, "step": 21410 }, { "distill_loss": 0.24994097650051117, "epoch": 7.1414276184122745, "step": 21410 }, { "epoch": 7.1414276184122745, "ref_ce_loss": 0.05183815583586693, "step": 21410 }, { "epoch": 7.1447631754503, "loss": 0.3671, "step": 21420 }, { "epoch": 7.1447631754503, "grad_norm": 44.177581787109375, "step": 21420 }, { "epoch": 7.1447631754503, "learning_rate": 8.901949469700487e-06, "step": 21420 }, { "epoch": 7.1447631754503, "loss": 0.460884153842926, "step": 21420 }, { "ce_loss": 0.04858725890517235, "epoch": 7.1447631754503, "step": 21420 }, { "distill_loss": 0.17198331654071808, "epoch": 7.1447631754503, "step": 21420 }, { "epoch": 7.1447631754503, "ref_ce_loss": 0.12297511100769043, "step": 21420 }, { "epoch": 7.1447631754503, "loss": 0.40572482347488403, "step": 21420 }, { "ce_loss": 0.029643459245562553, "epoch": 7.1447631754503, "step": 21420 }, { "distill_loss": 0.30070599913597107, "epoch": 7.1447631754503, "step": 21420 }, { "epoch": 7.1447631754503, "ref_ce_loss": 0.05532967299222946, "step": 21420 }, { "epoch": 7.1447631754503, "loss": 0.3806205689907074, "step": 21420 }, { "ce_loss": 0.02531914971768856, "epoch": 7.1447631754503, "step": 21420 }, { "distill_loss": 0.29515233635902405, "epoch": 7.1447631754503, "step": 21420 }, { "epoch": 7.1447631754503, "ref_ce_loss": 0.059911541640758514, "step": 21420 }, { "epoch": 7.1447631754503, "loss": 0.5730224847793579, "step": 21420 }, { "ce_loss": 0.03719313442707062, "epoch": 7.1447631754503, "step": 21420 }, { "distill_loss": 0.14535340666770935, "epoch": 7.1447631754503, "step": 21420 }, { "epoch": 7.1447631754503, "ref_ce_loss": 0.05887611210346222, "step": 21420 }, { "epoch": 7.148098732488325, "loss": 0.3547, "step": 21430 }, { "epoch": 7.148098732488325, "grad_norm": 4.740625381469727, "step": 21430 }, { "epoch": 7.148098732488325, "learning_rate": 8.833335268723462e-06, "step": 21430 }, { "epoch": 7.148098732488325, "loss": 0.20340901613235474, "step": 21430 }, { "ce_loss": 0.012906559742987156, "epoch": 7.148098732488325, "step": 21430 }, { "distill_loss": 0.14369744062423706, "epoch": 7.148098732488325, "step": 21430 }, { "epoch": 7.148098732488325, "ref_ce_loss": 0.04662976786494255, "step": 21430 }, { "epoch": 7.148098732488325, "loss": 0.3858056962490082, "step": 21430 }, { "ce_loss": 0.06574510037899017, "epoch": 7.148098732488325, "step": 21430 }, { "distill_loss": 0.2124141901731491, "epoch": 7.148098732488325, "step": 21430 }, { "epoch": 7.148098732488325, "ref_ce_loss": 0.0829734280705452, "step": 21430 }, { "epoch": 7.148098732488325, "loss": 0.3030683398246765, "step": 21430 }, { "ce_loss": 0.04919762909412384, "epoch": 7.148098732488325, "step": 21430 }, { "distill_loss": 0.1884440779685974, "epoch": 7.148098732488325, "step": 21430 }, { "epoch": 7.148098732488325, "ref_ce_loss": 0.03764420002698898, "step": 21430 }, { "epoch": 7.148098732488325, "loss": 0.43533486127853394, "step": 21430 }, { "ce_loss": 0.02066192403435707, "epoch": 7.148098732488325, "step": 21430 }, { "distill_loss": 0.23798935115337372, "epoch": 7.148098732488325, "step": 21430 }, { "epoch": 7.148098732488325, "ref_ce_loss": 0.058441367000341415, "step": 21430 }, { "epoch": 7.1514342895263505, "loss": 0.3316, "step": 21440 }, { "epoch": 7.1514342895263505, "grad_norm": 2.6973822116851807, "step": 21440 }, { "epoch": 7.1514342895263505, "learning_rate": 8.76497850005724e-06, "step": 21440 }, { "epoch": 7.1514342895263505, "loss": 0.3422290086746216, "step": 21440 }, { "ce_loss": 0.02658429928123951, "epoch": 7.1514342895263505, "step": 21440 }, { "distill_loss": 0.2068696916103363, "epoch": 7.1514342895263505, "step": 21440 }, { "epoch": 7.1514342895263505, "ref_ce_loss": 0.054690875113010406, "step": 21440 }, { "epoch": 7.1514342895263505, "loss": 0.38151055574417114, "step": 21440 }, { "ce_loss": 0.0338323637843132, "epoch": 7.1514342895263505, "step": 21440 }, { "distill_loss": 0.12502741813659668, "epoch": 7.1514342895263505, "step": 21440 }, { "epoch": 7.1514342895263505, "ref_ce_loss": 0.02690565027296543, "step": 21440 }, { "epoch": 7.1514342895263505, "loss": 0.25190258026123047, "step": 21440 }, { "ce_loss": 0.009585889987647533, "epoch": 7.1514342895263505, "step": 21440 }, { "distill_loss": 0.12396883219480515, "epoch": 7.1514342895263505, "step": 21440 }, { "epoch": 7.1514342895263505, "ref_ce_loss": 0.032859060913324356, "step": 21440 }, { "epoch": 7.1514342895263505, "loss": 0.37992632389068604, "step": 21440 }, { "ce_loss": 0.022066691890358925, "epoch": 7.1514342895263505, "step": 21440 }, { "distill_loss": 0.13286367058753967, "epoch": 7.1514342895263505, "step": 21440 }, { "epoch": 7.1514342895263505, "ref_ce_loss": 0.034272920340299606, "step": 21440 }, { "epoch": 7.154769846564376, "loss": 0.3336, "step": 21450 }, { "epoch": 7.154769846564376, "grad_norm": 6.438956260681152, "step": 21450 }, { "epoch": 7.154769846564376, "learning_rate": 8.69687928835754e-06, "step": 21450 }, { "epoch": 7.154769846564376, "loss": 0.1311127245426178, "step": 21450 }, { "ce_loss": 0.0023328508250415325, "epoch": 7.154769846564376, "step": 21450 }, { "distill_loss": 0.09891074895858765, "epoch": 7.154769846564376, "step": 21450 }, { "epoch": 7.154769846564376, "ref_ce_loss": 0.02963155321776867, "step": 21450 }, { "epoch": 7.154769846564376, "loss": 0.48175233602523804, "step": 21450 }, { "ce_loss": 0.01688428781926632, "epoch": 7.154769846564376, "step": 21450 }, { "distill_loss": 0.15957482159137726, "epoch": 7.154769846564376, "step": 21450 }, { "epoch": 7.154769846564376, "ref_ce_loss": 0.023770110681653023, "step": 21450 }, { "epoch": 7.154769846564376, "loss": 0.3389487862586975, "step": 21450 }, { "ce_loss": 0.04547916725277901, "epoch": 7.154769846564376, "step": 21450 }, { "distill_loss": 0.2358999252319336, "epoch": 7.154769846564376, "step": 21450 }, { "epoch": 7.154769846564376, "ref_ce_loss": 0.03925956413149834, "step": 21450 }, { "epoch": 7.154769846564376, "loss": 0.37305766344070435, "step": 21450 }, { "ce_loss": 0.01338366698473692, "epoch": 7.154769846564376, "step": 21450 }, { "distill_loss": 0.2325180470943451, "epoch": 7.154769846564376, "step": 21450 }, { "epoch": 7.154769846564376, "ref_ce_loss": 0.03668252378702164, "step": 21450 }, { "epoch": 7.158105403602401, "loss": 0.3734, "step": 21460 }, { "epoch": 7.158105403602401, "grad_norm": 3.4364991188049316, "step": 21460 }, { "epoch": 7.158105403602401, "learning_rate": 8.629037757810486e-06, "step": 21460 }, { "epoch": 7.158105403602401, "loss": 0.22355008125305176, "step": 21460 }, { "ce_loss": 0.016299467533826828, "epoch": 7.158105403602401, "step": 21460 }, { "distill_loss": 0.11342824250459671, "epoch": 7.158105403602401, "step": 21460 }, { "epoch": 7.158105403602401, "ref_ce_loss": 0.034232400357723236, "step": 21460 }, { "epoch": 7.158105403602401, "loss": 0.44346651434898376, "step": 21460 }, { "ce_loss": 0.035172708332538605, "epoch": 7.158105403602401, "step": 21460 }, { "distill_loss": 0.33047035336494446, "epoch": 7.158105403602401, "step": 21460 }, { "epoch": 7.158105403602401, "ref_ce_loss": 0.06059928610920906, "step": 21460 }, { "epoch": 7.158105403602401, "loss": 0.22185933589935303, "step": 21460 }, { "ce_loss": 0.026147043332457542, "epoch": 7.158105403602401, "step": 21460 }, { "distill_loss": 0.1444932222366333, "epoch": 7.158105403602401, "step": 21460 }, { "epoch": 7.158105403602401, "ref_ce_loss": 0.024612026289105415, "step": 21460 }, { "epoch": 7.158105403602401, "loss": 0.26896658539772034, "step": 21460 }, { "ce_loss": 0.02288634143769741, "epoch": 7.158105403602401, "step": 21460 }, { "distill_loss": 0.12373004108667374, "epoch": 7.158105403602401, "step": 21460 }, { "epoch": 7.158105403602401, "ref_ce_loss": 0.06691116094589233, "step": 21460 }, { "epoch": 7.161440960640427, "loss": 0.3483, "step": 21470 }, { "epoch": 7.161440960640427, "grad_norm": 2.7852957248687744, "step": 21470 }, { "epoch": 7.161440960640427, "learning_rate": 8.561454032132253e-06, "step": 21470 }, { "epoch": 7.161440960640427, "loss": 0.26754996180534363, "step": 21470 }, { "ce_loss": 0.03477302938699722, "epoch": 7.161440960640427, "step": 21470 }, { "distill_loss": 0.1344570368528366, "epoch": 7.161440960640427, "step": 21470 }, { "epoch": 7.161440960640427, "ref_ce_loss": 0.04726096987724304, "step": 21470 }, { "epoch": 7.161440960640427, "loss": 0.2843259274959564, "step": 21470 }, { "ce_loss": 0.022212691605091095, "epoch": 7.161440960640427, "step": 21470 }, { "distill_loss": 0.1965642273426056, "epoch": 7.161440960640427, "step": 21470 }, { "epoch": 7.161440960640427, "ref_ce_loss": 0.033638931810855865, "step": 21470 }, { "epoch": 7.161440960640427, "loss": 0.28304195404052734, "step": 21470 }, { "ce_loss": 0.02403201535344124, "epoch": 7.161440960640427, "step": 21470 }, { "distill_loss": 0.154997318983078, "epoch": 7.161440960640427, "step": 21470 }, { "epoch": 7.161440960640427, "ref_ce_loss": 0.038654860109090805, "step": 21470 }, { "epoch": 7.161440960640427, "loss": 0.3013923168182373, "step": 21470 }, { "ce_loss": 0.03783771023154259, "epoch": 7.161440960640427, "step": 21470 }, { "distill_loss": 0.1492641121149063, "epoch": 7.161440960640427, "step": 21470 }, { "epoch": 7.161440960640427, "ref_ce_loss": 0.07085458189249039, "step": 21470 }, { "epoch": 7.164776517678452, "loss": 0.3155, "step": 21480 }, { "epoch": 7.164776517678452, "grad_norm": 3.168621301651001, "step": 21480 }, { "epoch": 7.164776517678452, "learning_rate": 8.494128234568936e-06, "step": 21480 }, { "epoch": 7.164776517678452, "loss": 0.5053913593292236, "step": 21480 }, { "ce_loss": 0.05008067935705185, "epoch": 7.164776517678452, "step": 21480 }, { "distill_loss": 0.19111387431621552, "epoch": 7.164776517678452, "step": 21480 }, { "epoch": 7.164776517678452, "ref_ce_loss": 0.06695059686899185, "step": 21480 }, { "epoch": 7.164776517678452, "loss": 0.4717422425746918, "step": 21480 }, { "ce_loss": 0.15805310010910034, "epoch": 7.164776517678452, "step": 21480 }, { "distill_loss": 0.16689586639404297, "epoch": 7.164776517678452, "step": 21480 }, { "epoch": 7.164776517678452, "ref_ce_loss": 0.05943808704614639, "step": 21480 }, { "epoch": 7.164776517678452, "loss": 0.4965991973876953, "step": 21480 }, { "ce_loss": 0.026956552639603615, "epoch": 7.164776517678452, "step": 21480 }, { "distill_loss": 0.38682329654693604, "epoch": 7.164776517678452, "step": 21480 }, { "epoch": 7.164776517678452, "ref_ce_loss": 0.047263722866773605, "step": 21480 }, { "epoch": 7.164776517678452, "loss": 0.24691656231880188, "step": 21480 }, { "ce_loss": 0.02406405471265316, "epoch": 7.164776517678452, "step": 21480 }, { "distill_loss": 0.1757323443889618, "epoch": 7.164776517678452, "step": 21480 }, { "epoch": 7.164776517678452, "ref_ce_loss": 0.04702390730381012, "step": 21480 }, { "epoch": 7.168112074716477, "loss": 0.311, "step": 21490 }, { "epoch": 7.168112074716477, "grad_norm": 3.70613956451416, "step": 21490 }, { "epoch": 7.168112074716477, "learning_rate": 8.427060487896209e-06, "step": 21490 }, { "epoch": 7.168112074716477, "loss": 0.6340287923812866, "step": 21490 }, { "ce_loss": 0.026472387835383415, "epoch": 7.168112074716477, "step": 21490 }, { "distill_loss": 0.32688456773757935, "epoch": 7.168112074716477, "step": 21490 }, { "epoch": 7.168112074716477, "ref_ce_loss": 0.06099152937531471, "step": 21490 }, { "epoch": 7.168112074716477, "loss": 0.14026260375976562, "step": 21490 }, { "ce_loss": 0.00750059774145484, "epoch": 7.168112074716477, "step": 21490 }, { "distill_loss": 0.08525346219539642, "epoch": 7.168112074716477, "step": 21490 }, { "epoch": 7.168112074716477, "ref_ce_loss": 0.032193947583436966, "step": 21490 }, { "epoch": 7.168112074716477, "loss": 0.347702294588089, "step": 21490 }, { "ce_loss": 0.06765453517436981, "epoch": 7.168112074716477, "step": 21490 }, { "distill_loss": 0.18379972875118256, "epoch": 7.168112074716477, "step": 21490 }, { "epoch": 7.168112074716477, "ref_ce_loss": 0.059257086366415024, "step": 21490 }, { "epoch": 7.168112074716477, "loss": 0.3984415531158447, "step": 21490 }, { "ce_loss": 0.06973186135292053, "epoch": 7.168112074716477, "step": 21490 }, { "distill_loss": 0.27749377489089966, "epoch": 7.168112074716477, "step": 21490 }, { "epoch": 7.168112074716477, "ref_ce_loss": 0.050319571048021317, "step": 21490 }, { "epoch": 7.171447631754503, "loss": 0.3219, "step": 21500 }, { "epoch": 7.171447631754503, "grad_norm": 4.287350654602051, "step": 21500 }, { "epoch": 7.171447631754503, "learning_rate": 8.36025091441917e-06, "step": 21500 }, { "epoch": 7.171447631754503, "loss": 0.19232136011123657, "step": 21500 }, { "ce_loss": 0.005756628233939409, "epoch": 7.171447631754503, "step": 21500 }, { "distill_loss": 0.11545160412788391, "epoch": 7.171447631754503, "step": 21500 }, { "epoch": 7.171447631754503, "ref_ce_loss": 0.032617710530757904, "step": 21500 }, { "epoch": 7.171447631754503, "loss": 0.3118692636489868, "step": 21500 }, { "ce_loss": 0.0021937473211437464, "epoch": 7.171447631754503, "step": 21500 }, { "distill_loss": 0.11081088334321976, "epoch": 7.171447631754503, "step": 21500 }, { "epoch": 7.171447631754503, "ref_ce_loss": 0.050306838005781174, "step": 21500 }, { "epoch": 7.171447631754503, "loss": 0.2555490732192993, "step": 21500 }, { "ce_loss": 0.027354789897799492, "epoch": 7.171447631754503, "step": 21500 }, { "distill_loss": 0.20176441967487335, "epoch": 7.171447631754503, "step": 21500 }, { "epoch": 7.171447631754503, "ref_ce_loss": 0.026141945272684097, "step": 21500 }, { "epoch": 7.171447631754503, "loss": 0.3627238869667053, "step": 21500 }, { "ce_loss": 0.02147168107330799, "epoch": 7.171447631754503, "step": 21500 }, { "distill_loss": 0.22193299233913422, "epoch": 7.171447631754503, "step": 21500 }, { "epoch": 7.171447631754503, "ref_ce_loss": 0.04372347891330719, "step": 21500 }, { "epoch": 7.174783188792528, "loss": 0.3478, "step": 21510 }, { "epoch": 7.174783188792528, "grad_norm": 4.667346954345703, "step": 21510 }, { "epoch": 7.174783188792528, "learning_rate": 8.293699635972146e-06, "step": 21510 }, { "epoch": 7.174783188792528, "loss": 0.2823328971862793, "step": 21510 }, { "ce_loss": 0.026913125067949295, "epoch": 7.174783188792528, "step": 21510 }, { "distill_loss": 0.18480005860328674, "epoch": 7.174783188792528, "step": 21510 }, { "epoch": 7.174783188792528, "ref_ce_loss": 0.05143596976995468, "step": 21510 }, { "epoch": 7.174783188792528, "loss": 0.2691704034805298, "step": 21510 }, { "ce_loss": 0.06230126693844795, "epoch": 7.174783188792528, "step": 21510 }, { "distill_loss": 0.1483161896467209, "epoch": 7.174783188792528, "step": 21510 }, { "epoch": 7.174783188792528, "ref_ce_loss": 0.05851830169558525, "step": 21510 }, { "epoch": 7.174783188792528, "loss": 0.3138797879219055, "step": 21510 }, { "ce_loss": 0.04803266003727913, "epoch": 7.174783188792528, "step": 21510 }, { "distill_loss": 0.21480245888233185, "epoch": 7.174783188792528, "step": 21510 }, { "epoch": 7.174783188792528, "ref_ce_loss": 0.05064088851213455, "step": 21510 }, { "epoch": 7.174783188792528, "loss": 0.4249471426010132, "step": 21510 }, { "ce_loss": 0.05544541776180267, "epoch": 7.174783188792528, "step": 21510 }, { "distill_loss": 0.1516028344631195, "epoch": 7.174783188792528, "step": 21510 }, { "epoch": 7.174783188792528, "ref_ce_loss": 0.06550660729408264, "step": 21510 }, { "epoch": 7.178118745830553, "loss": 0.327, "step": 21520 }, { "epoch": 7.178118745830553, "grad_norm": 3.7791588306427, "step": 21520 }, { "epoch": 7.178118745830553, "learning_rate": 8.227406773918405e-06, "step": 21520 }, { "epoch": 7.178118745830553, "loss": 0.20280539989471436, "step": 21520 }, { "ce_loss": 0.006557751446962357, "epoch": 7.178118745830553, "step": 21520 }, { "distill_loss": 0.11654480546712875, "epoch": 7.178118745830553, "step": 21520 }, { "epoch": 7.178118745830553, "ref_ce_loss": 0.055110517889261246, "step": 21520 }, { "epoch": 7.178118745830553, "loss": 0.3507617712020874, "step": 21520 }, { "ce_loss": 0.013434783555567265, "epoch": 7.178118745830553, "step": 21520 }, { "distill_loss": 0.25564488768577576, "epoch": 7.178118745830553, "step": 21520 }, { "epoch": 7.178118745830553, "ref_ce_loss": 0.041842278093099594, "step": 21520 }, { "epoch": 7.178118745830553, "loss": 0.2878246605396271, "step": 21520 }, { "ce_loss": 0.03153041750192642, "epoch": 7.178118745830553, "step": 21520 }, { "distill_loss": 0.18612876534461975, "epoch": 7.178118745830553, "step": 21520 }, { "epoch": 7.178118745830553, "ref_ce_loss": 0.049200598150491714, "step": 21520 }, { "epoch": 7.178118745830553, "loss": 0.33751150965690613, "step": 21520 }, { "ce_loss": 0.04821312800049782, "epoch": 7.178118745830553, "step": 21520 }, { "distill_loss": 0.149494931101799, "epoch": 7.178118745830553, "step": 21520 }, { "epoch": 7.178118745830553, "ref_ce_loss": 0.06942351907491684, "step": 21520 }, { "epoch": 7.181454302868579, "loss": 0.3396, "step": 21530 }, { "epoch": 7.181454302868579, "grad_norm": 2.776742696762085, "step": 21530 }, { "epoch": 7.181454302868579, "learning_rate": 8.161372449149994e-06, "step": 21530 }, { "epoch": 7.181454302868579, "loss": 0.2622234523296356, "step": 21530 }, { "ce_loss": 0.04477609694004059, "epoch": 7.181454302868579, "step": 21530 }, { "distill_loss": 0.1622024029493332, "epoch": 7.181454302868579, "step": 21530 }, { "epoch": 7.181454302868579, "ref_ce_loss": 0.03245099261403084, "step": 21530 }, { "epoch": 7.181454302868579, "loss": 0.5293148159980774, "step": 21530 }, { "ce_loss": 0.036752525717020035, "epoch": 7.181454302868579, "step": 21530 }, { "distill_loss": 0.2745060920715332, "epoch": 7.181454302868579, "step": 21530 }, { "epoch": 7.181454302868579, "ref_ce_loss": 0.0655098631978035, "step": 21530 }, { "epoch": 7.181454302868579, "loss": 0.3222862482070923, "step": 21530 }, { "ce_loss": 0.014142888598144054, "epoch": 7.181454302868579, "step": 21530 }, { "distill_loss": 0.09600169211626053, "epoch": 7.181454302868579, "step": 21530 }, { "epoch": 7.181454302868579, "ref_ce_loss": 0.031071700155735016, "step": 21530 }, { "epoch": 7.181454302868579, "loss": 0.24960437417030334, "step": 21530 }, { "ce_loss": 0.017925996333360672, "epoch": 7.181454302868579, "step": 21530 }, { "distill_loss": 0.1352931410074234, "epoch": 7.181454302868579, "step": 21530 }, { "epoch": 7.181454302868579, "ref_ce_loss": 0.07950492948293686, "step": 21530 }, { "epoch": 7.184789859906604, "loss": 0.344, "step": 21540 }, { "epoch": 7.184789859906604, "grad_norm": 2.4601974487304688, "step": 21540 }, { "epoch": 7.184789859906604, "learning_rate": 8.095596782087487e-06, "step": 21540 }, { "epoch": 7.184789859906604, "loss": 0.3111080825328827, "step": 21540 }, { "ce_loss": 0.02254648320376873, "epoch": 7.184789859906604, "step": 21540 }, { "distill_loss": 0.1943889558315277, "epoch": 7.184789859906604, "step": 21540 }, { "epoch": 7.184789859906604, "ref_ce_loss": 0.05962618440389633, "step": 21540 }, { "epoch": 7.184789859906604, "loss": 0.43967175483703613, "step": 21540 }, { "ce_loss": 0.04014963284134865, "epoch": 7.184789859906604, "step": 21540 }, { "distill_loss": 0.1975461095571518, "epoch": 7.184789859906604, "step": 21540 }, { "epoch": 7.184789859906604, "ref_ce_loss": 0.052939631044864655, "step": 21540 }, { "epoch": 7.184789859906604, "loss": 0.20108476281166077, "step": 21540 }, { "ce_loss": 0.038109976798295975, "epoch": 7.184789859906604, "step": 21540 }, { "distill_loss": 0.1050659641623497, "epoch": 7.184789859906604, "step": 21540 }, { "epoch": 7.184789859906604, "ref_ce_loss": 0.045342423021793365, "step": 21540 }, { "epoch": 7.184789859906604, "loss": 0.2912309765815735, "step": 21540 }, { "ce_loss": 0.011822945438325405, "epoch": 7.184789859906604, "step": 21540 }, { "distill_loss": 0.1958990842103958, "epoch": 7.184789859906604, "step": 21540 }, { "epoch": 7.184789859906604, "ref_ce_loss": 0.030265500769019127, "step": 21540 }, { "epoch": 7.188125416944629, "loss": 0.3729, "step": 21550 }, { "epoch": 7.188125416944629, "grad_norm": 3.593996047973633, "step": 21550 }, { "epoch": 7.188125416944629, "learning_rate": 8.030079892679702e-06, "step": 21550 }, { "epoch": 7.188125416944629, "loss": 0.35638487339019775, "step": 21550 }, { "ce_loss": 0.009184126742184162, "epoch": 7.188125416944629, "step": 21550 }, { "distill_loss": 0.15575531125068665, "epoch": 7.188125416944629, "step": 21550 }, { "epoch": 7.188125416944629, "ref_ce_loss": 0.04846501350402832, "step": 21550 }, { "epoch": 7.188125416944629, "loss": 0.26497361063957214, "step": 21550 }, { "ce_loss": 0.016823608428239822, "epoch": 7.188125416944629, "step": 21550 }, { "distill_loss": 0.17821790277957916, "epoch": 7.188125416944629, "step": 21550 }, { "epoch": 7.188125416944629, "ref_ce_loss": 0.05922691524028778, "step": 21550 }, { "epoch": 7.188125416944629, "loss": 0.35575103759765625, "step": 21550 }, { "ce_loss": 0.03228254243731499, "epoch": 7.188125416944629, "step": 21550 }, { "distill_loss": 0.23816150426864624, "epoch": 7.188125416944629, "step": 21550 }, { "epoch": 7.188125416944629, "ref_ce_loss": 0.06303157657384872, "step": 21550 }, { "epoch": 7.188125416944629, "loss": 0.32615143060684204, "step": 21550 }, { "ce_loss": 0.036773085594177246, "epoch": 7.188125416944629, "step": 21550 }, { "distill_loss": 0.15746957063674927, "epoch": 7.188125416944629, "step": 21550 }, { "epoch": 7.188125416944629, "ref_ce_loss": 0.038831427693367004, "step": 21550 }, { "epoch": 7.191460973982655, "loss": 0.3367, "step": 21560 }, { "epoch": 7.191460973982655, "grad_norm": 6.094079971313477, "step": 21560 }, { "epoch": 7.191460973982655, "learning_rate": 7.96482190040365e-06, "step": 21560 }, { "epoch": 7.191460973982655, "loss": 0.46769973635673523, "step": 21560 }, { "ce_loss": 0.04756125435233116, "epoch": 7.191460973982655, "step": 21560 }, { "distill_loss": 0.31982189416885376, "epoch": 7.191460973982655, "step": 21560 }, { "epoch": 7.191460973982655, "ref_ce_loss": 0.08072319626808167, "step": 21560 }, { "epoch": 7.191460973982655, "loss": 0.30835866928100586, "step": 21560 }, { "ce_loss": 0.048588644713163376, "epoch": 7.191460973982655, "step": 21560 }, { "distill_loss": 0.15134979784488678, "epoch": 7.191460973982655, "step": 21560 }, { "epoch": 7.191460973982655, "ref_ce_loss": 0.0634535551071167, "step": 21560 }, { "epoch": 7.191460973982655, "loss": 0.41379672288894653, "step": 21560 }, { "ce_loss": 0.13131098449230194, "epoch": 7.191460973982655, "step": 21560 }, { "distill_loss": 0.19826488196849823, "epoch": 7.191460973982655, "step": 21560 }, { "epoch": 7.191460973982655, "ref_ce_loss": 0.04915918409824371, "step": 21560 }, { "epoch": 7.191460973982655, "loss": 0.3735307455062866, "step": 21560 }, { "ce_loss": 0.005329641047865152, "epoch": 7.191460973982655, "step": 21560 }, { "distill_loss": 0.15917302668094635, "epoch": 7.191460973982655, "step": 21560 }, { "epoch": 7.191460973982655, "ref_ce_loss": 0.05440036952495575, "step": 21560 }, { "epoch": 7.19479653102068, "loss": 0.3176, "step": 21570 }, { "epoch": 7.19479653102068, "grad_norm": 3.168907403945923, "step": 21570 }, { "epoch": 7.19479653102068, "learning_rate": 7.899822924264104e-06, "step": 21570 }, { "epoch": 7.19479653102068, "loss": 0.607210099697113, "step": 21570 }, { "ce_loss": 0.02107110247015953, "epoch": 7.19479653102068, "step": 21570 }, { "distill_loss": 0.14337393641471863, "epoch": 7.19479653102068, "step": 21570 }, { "epoch": 7.19479653102068, "ref_ce_loss": 0.04880295693874359, "step": 21570 }, { "epoch": 7.19479653102068, "loss": 0.2701234519481659, "step": 21570 }, { "ce_loss": 0.024585945531725883, "epoch": 7.19479653102068, "step": 21570 }, { "distill_loss": 0.1722574383020401, "epoch": 7.19479653102068, "step": 21570 }, { "epoch": 7.19479653102068, "ref_ce_loss": 0.030505353584885597, "step": 21570 }, { "epoch": 7.19479653102068, "loss": 0.4172288775444031, "step": 21570 }, { "ce_loss": 0.03714752942323685, "epoch": 7.19479653102068, "step": 21570 }, { "distill_loss": 0.2899611294269562, "epoch": 7.19479653102068, "step": 21570 }, { "epoch": 7.19479653102068, "ref_ce_loss": 0.06813320517539978, "step": 21570 }, { "epoch": 7.19479653102068, "loss": 0.4019289016723633, "step": 21570 }, { "ce_loss": 0.030576540157198906, "epoch": 7.19479653102068, "step": 21570 }, { "distill_loss": 0.28335803747177124, "epoch": 7.19479653102068, "step": 21570 }, { "epoch": 7.19479653102068, "ref_ce_loss": 0.06374424695968628, "step": 21570 }, { "epoch": 7.198132088058705, "loss": 0.3242, "step": 21580 }, { "epoch": 7.198132088058705, "grad_norm": 2.639319896697998, "step": 21580 }, { "epoch": 7.198132088058705, "learning_rate": 7.835083082793614e-06, "step": 21580 }, { "epoch": 7.198132088058705, "loss": 0.4229542016983032, "step": 21580 }, { "ce_loss": 0.04644731432199478, "epoch": 7.198132088058705, "step": 21580 }, { "distill_loss": 0.2560223937034607, "epoch": 7.198132088058705, "step": 21580 }, { "epoch": 7.198132088058705, "ref_ce_loss": 0.07510516792535782, "step": 21580 }, { "epoch": 7.198132088058705, "loss": 0.5707595348358154, "step": 21580 }, { "ce_loss": 0.04119217395782471, "epoch": 7.198132088058705, "step": 21580 }, { "distill_loss": 0.21253474056720734, "epoch": 7.198132088058705, "step": 21580 }, { "epoch": 7.198132088058705, "ref_ce_loss": 0.05649447813630104, "step": 21580 }, { "epoch": 7.198132088058705, "loss": 0.7057649493217468, "step": 21580 }, { "ce_loss": 0.027443043887615204, "epoch": 7.198132088058705, "step": 21580 }, { "distill_loss": 0.19002290070056915, "epoch": 7.198132088058705, "step": 21580 }, { "epoch": 7.198132088058705, "ref_ce_loss": 0.06741243600845337, "step": 21580 }, { "epoch": 7.198132088058705, "loss": 0.18125252425670624, "step": 21580 }, { "ce_loss": 0.012593379244208336, "epoch": 7.198132088058705, "step": 21580 }, { "distill_loss": 0.13432380557060242, "epoch": 7.198132088058705, "step": 21580 }, { "epoch": 7.198132088058705, "ref_ce_loss": 0.03388500586152077, "step": 21580 }, { "epoch": 7.201467645096731, "loss": 0.3757, "step": 21590 }, { "epoch": 7.201467645096731, "grad_norm": 2.6682674884796143, "step": 21590 }, { "epoch": 7.201467645096731, "learning_rate": 7.770602494052124e-06, "step": 21590 }, { "epoch": 7.201467645096731, "loss": 0.44627857208251953, "step": 21590 }, { "ce_loss": 0.07321686297655106, "epoch": 7.201467645096731, "step": 21590 }, { "distill_loss": 0.29737573862075806, "epoch": 7.201467645096731, "step": 21590 }, { "epoch": 7.201467645096731, "ref_ce_loss": 0.05712924897670746, "step": 21590 }, { "epoch": 7.201467645096731, "loss": 0.23906278610229492, "step": 21590 }, { "ce_loss": 0.0435250923037529, "epoch": 7.201467645096731, "step": 21590 }, { "distill_loss": 0.15379908680915833, "epoch": 7.201467645096731, "step": 21590 }, { "epoch": 7.201467645096731, "ref_ce_loss": 0.04168463125824928, "step": 21590 }, { "epoch": 7.201467645096731, "loss": 0.23472881317138672, "step": 21590 }, { "ce_loss": 0.039945337921381, "epoch": 7.201467645096731, "step": 21590 }, { "distill_loss": 0.14018793404102325, "epoch": 7.201467645096731, "step": 21590 }, { "epoch": 7.201467645096731, "ref_ce_loss": 0.05431056767702103, "step": 21590 }, { "epoch": 7.201467645096731, "loss": 0.3228430449962616, "step": 21590 }, { "ce_loss": 0.003946481738239527, "epoch": 7.201467645096731, "step": 21590 }, { "distill_loss": 0.13325072824954987, "epoch": 7.201467645096731, "step": 21590 }, { "epoch": 7.201467645096731, "ref_ce_loss": 0.08239027857780457, "step": 21590 }, { "epoch": 7.204803202134756, "loss": 0.3117, "step": 21600 }, { "epoch": 7.204803202134756, "grad_norm": 2.6161727905273438, "step": 21600 }, { "epoch": 7.204803202134756, "learning_rate": 7.706381275626745e-06, "step": 21600 }, { "epoch": 7.204803202134756, "loss": 0.250211626291275, "step": 21600 }, { "ce_loss": 0.01741625741124153, "epoch": 7.204803202134756, "step": 21600 }, { "distill_loss": 0.12995892763137817, "epoch": 7.204803202134756, "step": 21600 }, { "epoch": 7.204803202134756, "ref_ce_loss": 0.05979376658797264, "step": 21600 }, { "epoch": 7.204803202134756, "loss": 0.27195289731025696, "step": 21600 }, { "ce_loss": 0.03143487870693207, "epoch": 7.204803202134756, "step": 21600 }, { "distill_loss": 0.1863202154636383, "epoch": 7.204803202134756, "step": 21600 }, { "epoch": 7.204803202134756, "ref_ce_loss": 0.0541115365922451, "step": 21600 }, { "epoch": 7.204803202134756, "loss": 0.2668476104736328, "step": 21600 }, { "ce_loss": 0.01905851997435093, "epoch": 7.204803202134756, "step": 21600 }, { "distill_loss": 0.1592845916748047, "epoch": 7.204803202134756, "step": 21600 }, { "epoch": 7.204803202134756, "ref_ce_loss": 0.05389750003814697, "step": 21600 }, { "epoch": 7.204803202134756, "loss": 0.28188711404800415, "step": 21600 }, { "ce_loss": 0.05415372550487518, "epoch": 7.204803202134756, "step": 21600 }, { "distill_loss": 0.15756256878376007, "epoch": 7.204803202134756, "step": 21600 }, { "epoch": 7.204803202134756, "ref_ce_loss": 0.06997683644294739, "step": 21600 }, { "epoch": 7.2081387591727815, "loss": 0.3161, "step": 21610 }, { "epoch": 7.2081387591727815, "grad_norm": 3.0321600437164307, "step": 21610 }, { "epoch": 7.2081387591727815, "learning_rate": 7.642419544631672e-06, "step": 21610 }, { "epoch": 7.2081387591727815, "loss": 0.2609192132949829, "step": 21610 }, { "ce_loss": 0.025224953889846802, "epoch": 7.2081387591727815, "step": 21610 }, { "distill_loss": 0.13914820551872253, "epoch": 7.2081387591727815, "step": 21610 }, { "epoch": 7.2081387591727815, "ref_ce_loss": 0.06633106619119644, "step": 21610 }, { "epoch": 7.2081387591727815, "loss": 0.25504258275032043, "step": 21610 }, { "ce_loss": 0.025933396071195602, "epoch": 7.2081387591727815, "step": 21610 }, { "distill_loss": 0.1570015847682953, "epoch": 7.2081387591727815, "step": 21610 }, { "epoch": 7.2081387591727815, "ref_ce_loss": 0.04250410944223404, "step": 21610 }, { "epoch": 7.2081387591727815, "loss": 0.3043152391910553, "step": 21610 }, { "ce_loss": 0.012817701324820518, "epoch": 7.2081387591727815, "step": 21610 }, { "distill_loss": 0.1668878197669983, "epoch": 7.2081387591727815, "step": 21610 }, { "epoch": 7.2081387591727815, "ref_ce_loss": 0.04772022366523743, "step": 21610 }, { "epoch": 7.2081387591727815, "loss": 0.27410194277763367, "step": 21610 }, { "ce_loss": 0.004894441459327936, "epoch": 7.2081387591727815, "step": 21610 }, { "distill_loss": 0.14274558424949646, "epoch": 7.2081387591727815, "step": 21610 }, { "epoch": 7.2081387591727815, "ref_ce_loss": 0.04445118084549904, "step": 21610 }, { "epoch": 7.211474316210807, "loss": 0.3263, "step": 21620 }, { "epoch": 7.211474316210807, "grad_norm": 2.9612531661987305, "step": 21620 }, { "epoch": 7.211474316210807, "learning_rate": 7.578717417707892e-06, "step": 21620 }, { "epoch": 7.211474316210807, "loss": 0.2413022220134735, "step": 21620 }, { "ce_loss": 0.019957000389695168, "epoch": 7.211474316210807, "step": 21620 }, { "distill_loss": 0.15345555543899536, "epoch": 7.211474316210807, "step": 21620 }, { "epoch": 7.211474316210807, "ref_ce_loss": 0.04753858596086502, "step": 21620 }, { "epoch": 7.211474316210807, "loss": 0.36386004090309143, "step": 21620 }, { "ce_loss": 0.02395096980035305, "epoch": 7.211474316210807, "step": 21620 }, { "distill_loss": 0.28969570994377136, "epoch": 7.211474316210807, "step": 21620 }, { "epoch": 7.211474316210807, "ref_ce_loss": 0.03671019524335861, "step": 21620 }, { "epoch": 7.211474316210807, "loss": 0.4716953635215759, "step": 21620 }, { "ce_loss": 0.039064228534698486, "epoch": 7.211474316210807, "step": 21620 }, { "distill_loss": 0.2576276361942291, "epoch": 7.211474316210807, "step": 21620 }, { "epoch": 7.211474316210807, "ref_ce_loss": 0.04177805408835411, "step": 21620 }, { "epoch": 7.211474316210807, "loss": 0.4198686480522156, "step": 21620 }, { "ce_loss": 0.05144977569580078, "epoch": 7.211474316210807, "step": 21620 }, { "distill_loss": 0.2534531354904175, "epoch": 7.211474316210807, "step": 21620 }, { "epoch": 7.211474316210807, "ref_ce_loss": 0.07417726516723633, "step": 21620 }, { "epoch": 7.214809873248832, "loss": 0.3098, "step": 21630 }, { "epoch": 7.214809873248832, "grad_norm": 3.0390126705169678, "step": 21630 }, { "epoch": 7.214809873248832, "learning_rate": 7.515275011022876e-06, "step": 21630 }, { "epoch": 7.214809873248832, "loss": 0.2712303400039673, "step": 21630 }, { "ce_loss": 0.02337028831243515, "epoch": 7.214809873248832, "step": 21630 }, { "distill_loss": 0.11533728986978531, "epoch": 7.214809873248832, "step": 21630 }, { "epoch": 7.214809873248832, "ref_ce_loss": 0.020700674504041672, "step": 21630 }, { "epoch": 7.214809873248832, "loss": 0.27735042572021484, "step": 21630 }, { "ce_loss": 0.047773364931344986, "epoch": 7.214809873248832, "step": 21630 }, { "distill_loss": 0.17673635482788086, "epoch": 7.214809873248832, "step": 21630 }, { "epoch": 7.214809873248832, "ref_ce_loss": 0.05261456221342087, "step": 21630 }, { "epoch": 7.214809873248832, "loss": 0.2114078551530838, "step": 21630 }, { "ce_loss": 0.032486531883478165, "epoch": 7.214809873248832, "step": 21630 }, { "distill_loss": 0.13928288221359253, "epoch": 7.214809873248832, "step": 21630 }, { "epoch": 7.214809873248832, "ref_ce_loss": 0.02496570721268654, "step": 21630 }, { "epoch": 7.214809873248832, "loss": 0.21625961363315582, "step": 21630 }, { "ce_loss": 0.00532973138615489, "epoch": 7.214809873248832, "step": 21630 }, { "distill_loss": 0.1723240315914154, "epoch": 7.214809873248832, "step": 21630 }, { "epoch": 7.214809873248832, "ref_ce_loss": 0.03841087222099304, "step": 21630 }, { "epoch": 7.2181454302868575, "loss": 0.3027, "step": 21640 }, { "epoch": 7.2181454302868575, "grad_norm": 3.580503225326538, "step": 21640 }, { "epoch": 7.2181454302868575, "learning_rate": 7.452092440270646e-06, "step": 21640 }, { "epoch": 7.2181454302868575, "loss": 0.29037898778915405, "step": 21640 }, { "ce_loss": 0.0321304053068161, "epoch": 7.2181454302868575, "step": 21640 }, { "distill_loss": 0.14601705968379974, "epoch": 7.2181454302868575, "step": 21640 }, { "epoch": 7.2181454302868575, "ref_ce_loss": 0.05759825184941292, "step": 21640 }, { "epoch": 7.2181454302868575, "loss": 0.5272622108459473, "step": 21640 }, { "ce_loss": 0.1542384922504425, "epoch": 7.2181454302868575, "step": 21640 }, { "distill_loss": 0.16737566888332367, "epoch": 7.2181454302868575, "step": 21640 }, { "epoch": 7.2181454302868575, "ref_ce_loss": 0.07182739675045013, "step": 21640 }, { "epoch": 7.2181454302868575, "loss": 0.26996082067489624, "step": 21640 }, { "ce_loss": 0.02072783373296261, "epoch": 7.2181454302868575, "step": 21640 }, { "distill_loss": 0.1619803011417389, "epoch": 7.2181454302868575, "step": 21640 }, { "epoch": 7.2181454302868575, "ref_ce_loss": 0.05954483151435852, "step": 21640 }, { "epoch": 7.2181454302868575, "loss": 0.40388885140419006, "step": 21640 }, { "ce_loss": 0.02044593170285225, "epoch": 7.2181454302868575, "step": 21640 }, { "distill_loss": 0.14890506863594055, "epoch": 7.2181454302868575, "step": 21640 }, { "epoch": 7.2181454302868575, "ref_ce_loss": 0.06344384700059891, "step": 21640 }, { "epoch": 7.221480987324883, "loss": 0.3518, "step": 21650 }, { "epoch": 7.221480987324883, "grad_norm": 3.95499587059021, "step": 21650 }, { "epoch": 7.221480987324883, "learning_rate": 7.38916982067122e-06, "step": 21650 }, { "epoch": 7.221480987324883, "loss": 0.30346325039863586, "step": 21650 }, { "ce_loss": 0.06752192229032516, "epoch": 7.221480987324883, "step": 21650 }, { "distill_loss": 0.15714608132839203, "epoch": 7.221480987324883, "step": 21650 }, { "epoch": 7.221480987324883, "ref_ce_loss": 0.04498041421175003, "step": 21650 }, { "epoch": 7.221480987324883, "loss": 0.3442494869232178, "step": 21650 }, { "ce_loss": 0.016071105375885963, "epoch": 7.221480987324883, "step": 21650 }, { "distill_loss": 0.2575092315673828, "epoch": 7.221480987324883, "step": 21650 }, { "epoch": 7.221480987324883, "ref_ce_loss": 0.051779165863990784, "step": 21650 }, { "epoch": 7.221480987324883, "loss": 0.38894587755203247, "step": 21650 }, { "ce_loss": 0.01721816696226597, "epoch": 7.221480987324883, "step": 21650 }, { "distill_loss": 0.24018141627311707, "epoch": 7.221480987324883, "step": 21650 }, { "epoch": 7.221480987324883, "ref_ce_loss": 0.05405234917998314, "step": 21650 }, { "epoch": 7.221480987324883, "loss": 0.32709309458732605, "step": 21650 }, { "ce_loss": 0.028886908665299416, "epoch": 7.221480987324883, "step": 21650 }, { "distill_loss": 0.16045019030570984, "epoch": 7.221480987324883, "step": 21650 }, { "epoch": 7.221480987324883, "ref_ce_loss": 0.04315178468823433, "step": 21650 }, { "epoch": 7.224816544362908, "loss": 0.3514, "step": 21660 }, { "epoch": 7.224816544362908, "grad_norm": 4.426061630249023, "step": 21660 }, { "epoch": 7.224816544362908, "learning_rate": 7.326507266970677e-06, "step": 21660 }, { "epoch": 7.224816544362908, "loss": 0.26750248670578003, "step": 21660 }, { "ce_loss": 0.023527834564447403, "epoch": 7.224816544362908, "step": 21660 }, { "distill_loss": 0.18002349138259888, "epoch": 7.224816544362908, "step": 21660 }, { "epoch": 7.224816544362908, "ref_ce_loss": 0.04690827801823616, "step": 21660 }, { "epoch": 7.224816544362908, "loss": 0.6288806200027466, "step": 21660 }, { "ce_loss": 0.07400006800889969, "epoch": 7.224816544362908, "step": 21660 }, { "distill_loss": 0.20145976543426514, "epoch": 7.224816544362908, "step": 21660 }, { "epoch": 7.224816544362908, "ref_ce_loss": 0.034665659070014954, "step": 21660 }, { "epoch": 7.224816544362908, "loss": 0.2678120732307434, "step": 21660 }, { "ce_loss": 0.038052450865507126, "epoch": 7.224816544362908, "step": 21660 }, { "distill_loss": 0.16327275335788727, "epoch": 7.224816544362908, "step": 21660 }, { "epoch": 7.224816544362908, "ref_ce_loss": 0.04916460067033768, "step": 21660 }, { "epoch": 7.224816544362908, "loss": 0.2479705512523651, "step": 21660 }, { "ce_loss": 0.01944034732878208, "epoch": 7.224816544362908, "step": 21660 }, { "distill_loss": 0.15548597276210785, "epoch": 7.224816544362908, "step": 21660 }, { "epoch": 7.224816544362908, "ref_ce_loss": 0.05601424351334572, "step": 21660 }, { "epoch": 7.228152101400934, "loss": 0.3284, "step": 21670 }, { "epoch": 7.228152101400934, "grad_norm": 3.802701234817505, "step": 21670 }, { "epoch": 7.228152101400934, "learning_rate": 7.264104893440792e-06, "step": 21670 }, { "epoch": 7.228152101400934, "loss": 0.4184314012527466, "step": 21670 }, { "ce_loss": 0.03756963461637497, "epoch": 7.228152101400934, "step": 21670 }, { "distill_loss": 0.2632547616958618, "epoch": 7.228152101400934, "step": 21670 }, { "epoch": 7.228152101400934, "ref_ce_loss": 0.04559790715575218, "step": 21670 }, { "epoch": 7.228152101400934, "loss": 0.21939358115196228, "step": 21670 }, { "ce_loss": 0.027253881096839905, "epoch": 7.228152101400934, "step": 21670 }, { "distill_loss": 0.09987346082925797, "epoch": 7.228152101400934, "step": 21670 }, { "epoch": 7.228152101400934, "ref_ce_loss": 0.03330458328127861, "step": 21670 }, { "epoch": 7.228152101400934, "loss": 0.7001844644546509, "step": 21670 }, { "ce_loss": 0.03261253982782364, "epoch": 7.228152101400934, "step": 21670 }, { "distill_loss": 0.12179021537303925, "epoch": 7.228152101400934, "step": 21670 }, { "epoch": 7.228152101400934, "ref_ce_loss": 0.03093714825809002, "step": 21670 }, { "epoch": 7.228152101400934, "loss": 0.4727906286716461, "step": 21670 }, { "ce_loss": 0.1115996390581131, "epoch": 7.228152101400934, "step": 21670 }, { "distill_loss": 0.2063658982515335, "epoch": 7.228152101400934, "step": 21670 }, { "epoch": 7.228152101400934, "ref_ce_loss": 0.04917268455028534, "step": 21670 }, { "epoch": 7.231487658438959, "loss": 0.3376, "step": 21680 }, { "epoch": 7.231487658438959, "grad_norm": 2.850815534591675, "step": 21680 }, { "epoch": 7.231487658438959, "learning_rate": 7.201962813878837e-06, "step": 21680 }, { "epoch": 7.231487658438959, "loss": 0.322729229927063, "step": 21680 }, { "ce_loss": 0.013876276090741158, "epoch": 7.231487658438959, "step": 21680 }, { "distill_loss": 0.14954251050949097, "epoch": 7.231487658438959, "step": 21680 }, { "epoch": 7.231487658438959, "ref_ce_loss": 0.057617779821157455, "step": 21680 }, { "epoch": 7.231487658438959, "loss": 0.22298645973205566, "step": 21680 }, { "ce_loss": 0.02905522659420967, "epoch": 7.231487658438959, "step": 21680 }, { "distill_loss": 0.13507865369319916, "epoch": 7.231487658438959, "step": 21680 }, { "epoch": 7.231487658438959, "ref_ce_loss": 0.043622229248285294, "step": 21680 }, { "epoch": 7.231487658438959, "loss": 0.25708311796188354, "step": 21680 }, { "ce_loss": 0.03127783536911011, "epoch": 7.231487658438959, "step": 21680 }, { "distill_loss": 0.14482389390468597, "epoch": 7.231487658438959, "step": 21680 }, { "epoch": 7.231487658438959, "ref_ce_loss": 0.052324190735816956, "step": 21680 }, { "epoch": 7.231487658438959, "loss": 0.4473877549171448, "step": 21680 }, { "ce_loss": 0.034591808915138245, "epoch": 7.231487658438959, "step": 21680 }, { "distill_loss": 0.16112400591373444, "epoch": 7.231487658438959, "step": 21680 }, { "epoch": 7.231487658438959, "ref_ce_loss": 0.05375932529568672, "step": 21680 }, { "epoch": 7.234823215476984, "loss": 0.3232, "step": 21690 }, { "epoch": 7.234823215476984, "grad_norm": 2.6115386486053467, "step": 21690 }, { "epoch": 7.234823215476984, "learning_rate": 7.140081141607479e-06, "step": 21690 }, { "epoch": 7.234823215476984, "loss": 0.2910879850387573, "step": 21690 }, { "ce_loss": 0.02544204518198967, "epoch": 7.234823215476984, "step": 21690 }, { "distill_loss": 0.21739305555820465, "epoch": 7.234823215476984, "step": 21690 }, { "epoch": 7.234823215476984, "ref_ce_loss": 0.04808683693408966, "step": 21690 }, { "epoch": 7.234823215476984, "loss": 0.36207371950149536, "step": 21690 }, { "ce_loss": 0.03599226474761963, "epoch": 7.234823215476984, "step": 21690 }, { "distill_loss": 0.19642367959022522, "epoch": 7.234823215476984, "step": 21690 }, { "epoch": 7.234823215476984, "ref_ce_loss": 0.0867382362484932, "step": 21690 }, { "epoch": 7.234823215476984, "loss": 0.44683051109313965, "step": 21690 }, { "ce_loss": 0.10506901890039444, "epoch": 7.234823215476984, "step": 21690 }, { "distill_loss": 0.2525382936000824, "epoch": 7.234823215476984, "step": 21690 }, { "epoch": 7.234823215476984, "ref_ce_loss": 0.06445559859275818, "step": 21690 }, { "epoch": 7.234823215476984, "loss": 0.28379279375076294, "step": 21690 }, { "ce_loss": 0.024022918194532394, "epoch": 7.234823215476984, "step": 21690 }, { "distill_loss": 0.1875816434621811, "epoch": 7.234823215476984, "step": 21690 }, { "epoch": 7.234823215476984, "ref_ce_loss": 0.025708317756652832, "step": 21690 }, { "epoch": 7.23815877251501, "loss": 0.3608, "step": 21700 }, { "epoch": 7.23815877251501, "grad_norm": 3.2080724239349365, "step": 21700 }, { "epoch": 7.23815877251501, "learning_rate": 7.0784599894745e-06, "step": 21700 }, { "epoch": 7.23815877251501, "loss": 0.34930747747421265, "step": 21700 }, { "ce_loss": 0.052470266819000244, "epoch": 7.23815877251501, "step": 21700 }, { "distill_loss": 0.1867637187242508, "epoch": 7.23815877251501, "step": 21700 }, { "epoch": 7.23815877251501, "ref_ce_loss": 0.04565154388546944, "step": 21700 }, { "epoch": 7.23815877251501, "loss": 0.3086041510105133, "step": 21700 }, { "ce_loss": 0.03938201069831848, "epoch": 7.23815877251501, "step": 21700 }, { "distill_loss": 0.20732665061950684, "epoch": 7.23815877251501, "step": 21700 }, { "epoch": 7.23815877251501, "ref_ce_loss": 0.05089586228132248, "step": 21700 }, { "epoch": 7.23815877251501, "loss": 0.4481806755065918, "step": 21700 }, { "ce_loss": 0.07399517297744751, "epoch": 7.23815877251501, "step": 21700 }, { "distill_loss": 0.14980250597000122, "epoch": 7.23815877251501, "step": 21700 }, { "epoch": 7.23815877251501, "ref_ce_loss": 0.05937043949961662, "step": 21700 }, { "epoch": 7.23815877251501, "loss": 0.24760335683822632, "step": 21700 }, { "ce_loss": 0.009519390761852264, "epoch": 7.23815877251501, "step": 21700 }, { "distill_loss": 0.15549524128437042, "epoch": 7.23815877251501, "step": 21700 }, { "epoch": 7.23815877251501, "ref_ce_loss": 0.056334398686885834, "step": 21700 }, { "epoch": 7.241494329553035, "loss": 0.3446, "step": 21710 }, { "epoch": 7.241494329553035, "grad_norm": 3.619967460632324, "step": 21710 }, { "epoch": 7.241494329553035, "learning_rate": 7.0170994698525274e-06, "step": 21710 }, { "epoch": 7.241494329553035, "loss": 0.28680312633514404, "step": 21710 }, { "ce_loss": 0.05779293179512024, "epoch": 7.241494329553035, "step": 21710 }, { "distill_loss": 0.1770837903022766, "epoch": 7.241494329553035, "step": 21710 }, { "epoch": 7.241494329553035, "ref_ce_loss": 0.03173800930380821, "step": 21710 }, { "epoch": 7.241494329553035, "loss": 0.17028909921646118, "step": 21710 }, { "ce_loss": 0.010208682157099247, "epoch": 7.241494329553035, "step": 21710 }, { "distill_loss": 0.11676836013793945, "epoch": 7.241494329553035, "step": 21710 }, { "epoch": 7.241494329553035, "ref_ce_loss": 0.042866289615631104, "step": 21710 }, { "epoch": 7.241494329553035, "loss": 0.22058066725730896, "step": 21710 }, { "ce_loss": 0.02043846808373928, "epoch": 7.241494329553035, "step": 21710 }, { "distill_loss": 0.1391141414642334, "epoch": 7.241494329553035, "step": 21710 }, { "epoch": 7.241494329553035, "ref_ce_loss": 0.043113306164741516, "step": 21710 }, { "epoch": 7.241494329553035, "loss": 0.2991776168346405, "step": 21710 }, { "ce_loss": 0.019159501418471336, "epoch": 7.241494329553035, "step": 21710 }, { "distill_loss": 0.20416052639484406, "epoch": 7.241494329553035, "step": 21710 }, { "epoch": 7.241494329553035, "ref_ce_loss": 0.03246960788965225, "step": 21710 }, { "epoch": 7.24482988659106, "loss": 0.343, "step": 21720 }, { "epoch": 7.24482988659106, "grad_norm": 4.5923380851745605, "step": 21720 }, { "epoch": 7.24482988659106, "learning_rate": 6.955999694639003e-06, "step": 21720 }, { "epoch": 7.24482988659106, "loss": 0.3403356969356537, "step": 21720 }, { "ce_loss": 0.0032351722475141287, "epoch": 7.24482988659106, "step": 21720 }, { "distill_loss": 0.16471640765666962, "epoch": 7.24482988659106, "step": 21720 }, { "epoch": 7.24482988659106, "ref_ce_loss": 0.059697773307561874, "step": 21720 }, { "epoch": 7.24482988659106, "loss": 0.24258598685264587, "step": 21720 }, { "ce_loss": 0.0037439356092363596, "epoch": 7.24482988659106, "step": 21720 }, { "distill_loss": 0.1754499077796936, "epoch": 7.24482988659106, "step": 21720 }, { "epoch": 7.24482988659106, "ref_ce_loss": 0.030157914385199547, "step": 21720 }, { "epoch": 7.24482988659106, "loss": 0.1979200690984726, "step": 21720 }, { "ce_loss": 0.0022134960163384676, "epoch": 7.24482988659106, "step": 21720 }, { "distill_loss": 0.1415894627571106, "epoch": 7.24482988659106, "step": 21720 }, { "epoch": 7.24482988659106, "ref_ce_loss": 0.023886706680059433, "step": 21720 }, { "epoch": 7.24482988659106, "loss": 0.5604916214942932, "step": 21720 }, { "ce_loss": 0.03827451169490814, "epoch": 7.24482988659106, "step": 21720 }, { "distill_loss": 0.22929227352142334, "epoch": 7.24482988659106, "step": 21720 }, { "epoch": 7.24482988659106, "ref_ce_loss": 0.07590640336275101, "step": 21720 }, { "epoch": 7.248165443629086, "loss": 0.3597, "step": 21730 }, { "epoch": 7.248165443629086, "grad_norm": 3.0626065731048584, "step": 21730 }, { "epoch": 7.248165443629086, "learning_rate": 6.895160775255764e-06, "step": 21730 }, { "epoch": 7.248165443629086, "loss": 0.16732649505138397, "step": 21730 }, { "ce_loss": 0.013631356880068779, "epoch": 7.248165443629086, "step": 21730 }, { "distill_loss": 0.12075482308864594, "epoch": 7.248165443629086, "step": 21730 }, { "epoch": 7.248165443629086, "ref_ce_loss": 0.022803358733654022, "step": 21730 }, { "epoch": 7.248165443629086, "loss": 0.24672572314739227, "step": 21730 }, { "ce_loss": 0.010074594989418983, "epoch": 7.248165443629086, "step": 21730 }, { "distill_loss": 0.16110017895698547, "epoch": 7.248165443629086, "step": 21730 }, { "epoch": 7.248165443629086, "ref_ce_loss": 0.0509311780333519, "step": 21730 }, { "epoch": 7.248165443629086, "loss": 0.8778437376022339, "step": 21730 }, { "ce_loss": 0.0628712847828865, "epoch": 7.248165443629086, "step": 21730 }, { "distill_loss": 0.38136518001556396, "epoch": 7.248165443629086, "step": 21730 }, { "epoch": 7.248165443629086, "ref_ce_loss": 0.06170080602169037, "step": 21730 }, { "epoch": 7.248165443629086, "loss": 0.8466122150421143, "step": 21730 }, { "ce_loss": 0.08081597089767456, "epoch": 7.248165443629086, "step": 21730 }, { "distill_loss": 0.15677189826965332, "epoch": 7.248165443629086, "step": 21730 }, { "epoch": 7.248165443629086, "ref_ce_loss": 0.05595485121011734, "step": 21730 }, { "epoch": 7.251501000667111, "loss": 0.359, "step": 21740 }, { "epoch": 7.251501000667111, "grad_norm": 3.6847281455993652, "step": 21740 }, { "epoch": 7.251501000667111, "learning_rate": 6.834582822649015e-06, "step": 21740 }, { "epoch": 7.251501000667111, "loss": 0.27559083700180054, "step": 21740 }, { "ce_loss": 0.03012896329164505, "epoch": 7.251501000667111, "step": 21740 }, { "distill_loss": 0.13183234632015228, "epoch": 7.251501000667111, "step": 21740 }, { "epoch": 7.251501000667111, "ref_ce_loss": 0.045790113508701324, "step": 21740 }, { "epoch": 7.251501000667111, "loss": 0.4375584125518799, "step": 21740 }, { "ce_loss": 0.0076003712601959705, "epoch": 7.251501000667111, "step": 21740 }, { "distill_loss": 0.1997174620628357, "epoch": 7.251501000667111, "step": 21740 }, { "epoch": 7.251501000667111, "ref_ce_loss": 0.07456426322460175, "step": 21740 }, { "epoch": 7.251501000667111, "loss": 0.2726700007915497, "step": 21740 }, { "ce_loss": 0.03593643754720688, "epoch": 7.251501000667111, "step": 21740 }, { "distill_loss": 0.11179260164499283, "epoch": 7.251501000667111, "step": 21740 }, { "epoch": 7.251501000667111, "ref_ce_loss": 0.04381109029054642, "step": 21740 }, { "epoch": 7.251501000667111, "loss": 0.42131561040878296, "step": 21740 }, { "ce_loss": 0.06356733292341232, "epoch": 7.251501000667111, "step": 21740 }, { "distill_loss": 0.1883009672164917, "epoch": 7.251501000667111, "step": 21740 }, { "epoch": 7.251501000667111, "ref_ce_loss": 0.06666535139083862, "step": 21740 }, { "epoch": 7.254836557705136, "loss": 0.3445, "step": 21750 }, { "epoch": 7.254836557705136, "grad_norm": 3.5137743949890137, "step": 21750 }, { "epoch": 7.254836557705136, "learning_rate": 6.774265947289053e-06, "step": 21750 }, { "epoch": 7.254836557705136, "loss": 0.4460420310497284, "step": 21750 }, { "ce_loss": 0.07431213557720184, "epoch": 7.254836557705136, "step": 21750 }, { "distill_loss": 0.26951563358306885, "epoch": 7.254836557705136, "step": 21750 }, { "epoch": 7.254836557705136, "ref_ce_loss": 0.05519580841064453, "step": 21750 }, { "epoch": 7.254836557705136, "loss": 0.23930570483207703, "step": 21750 }, { "ce_loss": 0.02638310380280018, "epoch": 7.254836557705136, "step": 21750 }, { "distill_loss": 0.11300281435251236, "epoch": 7.254836557705136, "step": 21750 }, { "epoch": 7.254836557705136, "ref_ce_loss": 0.051546212285757065, "step": 21750 }, { "epoch": 7.254836557705136, "loss": 0.41818490624427795, "step": 21750 }, { "ce_loss": 0.019194016233086586, "epoch": 7.254836557705136, "step": 21750 }, { "distill_loss": 0.2620835602283478, "epoch": 7.254836557705136, "step": 21750 }, { "epoch": 7.254836557705136, "ref_ce_loss": 0.05575841665267944, "step": 21750 }, { "epoch": 7.254836557705136, "loss": 0.2405170053243637, "step": 21750 }, { "ce_loss": 0.01714288257062435, "epoch": 7.254836557705136, "step": 21750 }, { "distill_loss": 0.1712266355752945, "epoch": 7.254836557705136, "step": 21750 }, { "epoch": 7.254836557705136, "ref_ce_loss": 0.05204594135284424, "step": 21750 }, { "epoch": 7.258172114743162, "loss": 0.3298, "step": 21760 }, { "epoch": 7.258172114743162, "grad_norm": 2.681328058242798, "step": 21760 }, { "epoch": 7.258172114743162, "learning_rate": 6.7142102591700606e-06, "step": 21760 }, { "epoch": 7.258172114743162, "loss": 0.541475772857666, "step": 21760 }, { "ce_loss": 0.018261613324284554, "epoch": 7.258172114743162, "step": 21760 }, { "distill_loss": 0.3466701805591583, "epoch": 7.258172114743162, "step": 21760 }, { "epoch": 7.258172114743162, "ref_ce_loss": 0.05522928014397621, "step": 21760 }, { "epoch": 7.258172114743162, "loss": 0.33088329434394836, "step": 21760 }, { "ce_loss": 0.018995869904756546, "epoch": 7.258172114743162, "step": 21760 }, { "distill_loss": 0.14749498665332794, "epoch": 7.258172114743162, "step": 21760 }, { "epoch": 7.258172114743162, "ref_ce_loss": 0.03270300105214119, "step": 21760 }, { "epoch": 7.258172114743162, "loss": 0.3270573318004608, "step": 21760 }, { "ce_loss": 0.050677765160799026, "epoch": 7.258172114743162, "step": 21760 }, { "distill_loss": 0.19777001440525055, "epoch": 7.258172114743162, "step": 21760 }, { "epoch": 7.258172114743162, "ref_ce_loss": 0.0531977117061615, "step": 21760 }, { "epoch": 7.258172114743162, "loss": 0.35968199372291565, "step": 21760 }, { "ce_loss": 0.0543878972530365, "epoch": 7.258172114743162, "step": 21760 }, { "distill_loss": 0.1706765741109848, "epoch": 7.258172114743162, "step": 21760 }, { "epoch": 7.258172114743162, "ref_ce_loss": 0.050895582884550095, "step": 21760 }, { "epoch": 7.261507671781187, "loss": 0.3198, "step": 21770 }, { "epoch": 7.261507671781187, "grad_norm": 3.674339532852173, "step": 21770 }, { "epoch": 7.261507671781187, "learning_rate": 6.6544158678099476e-06, "step": 21770 }, { "epoch": 7.261507671781187, "loss": 0.3187209367752075, "step": 21770 }, { "ce_loss": 0.005656755529344082, "epoch": 7.261507671781187, "step": 21770 }, { "distill_loss": 0.20920640230178833, "epoch": 7.261507671781187, "step": 21770 }, { "epoch": 7.261507671781187, "ref_ce_loss": 0.05724534019827843, "step": 21770 }, { "epoch": 7.261507671781187, "loss": 0.6237984895706177, "step": 21770 }, { "ce_loss": 0.007133541628718376, "epoch": 7.261507671781187, "step": 21770 }, { "distill_loss": 0.20287460088729858, "epoch": 7.261507671781187, "step": 21770 }, { "epoch": 7.261507671781187, "ref_ce_loss": 0.05001780763268471, "step": 21770 }, { "epoch": 7.261507671781187, "loss": 0.38897567987442017, "step": 21770 }, { "ce_loss": 0.0849277600646019, "epoch": 7.261507671781187, "step": 21770 }, { "distill_loss": 0.21830356121063232, "epoch": 7.261507671781187, "step": 21770 }, { "epoch": 7.261507671781187, "ref_ce_loss": 0.05569655820727348, "step": 21770 }, { "epoch": 7.261507671781187, "loss": 0.32482588291168213, "step": 21770 }, { "ce_loss": 0.012219560332596302, "epoch": 7.261507671781187, "step": 21770 }, { "distill_loss": 0.19683362543582916, "epoch": 7.261507671781187, "step": 21770 }, { "epoch": 7.261507671781187, "ref_ce_loss": 0.04783269762992859, "step": 21770 }, { "epoch": 7.264843228819212, "loss": 0.3182, "step": 21780 }, { "epoch": 7.264843228819212, "grad_norm": 4.115479946136475, "step": 21780 }, { "epoch": 7.264843228819212, "learning_rate": 6.594882882250041e-06, "step": 21780 }, { "epoch": 7.264843228819212, "loss": 0.2964266538619995, "step": 21780 }, { "ce_loss": 0.03865527734160423, "epoch": 7.264843228819212, "step": 21780 }, { "distill_loss": 0.15016742050647736, "epoch": 7.264843228819212, "step": 21780 }, { "epoch": 7.264843228819212, "ref_ce_loss": 0.03687658905982971, "step": 21780 }, { "epoch": 7.264843228819212, "loss": 0.3021934926509857, "step": 21780 }, { "ce_loss": 0.022084327414631844, "epoch": 7.264843228819212, "step": 21780 }, { "distill_loss": 0.20831188559532166, "epoch": 7.264843228819212, "step": 21780 }, { "epoch": 7.264843228819212, "ref_ce_loss": 0.07171301543712616, "step": 21780 }, { "epoch": 7.264843228819212, "loss": 0.331741601228714, "step": 21780 }, { "ce_loss": 0.055888574570417404, "epoch": 7.264843228819212, "step": 21780 }, { "distill_loss": 0.22407200932502747, "epoch": 7.264843228819212, "step": 21780 }, { "epoch": 7.264843228819212, "ref_ce_loss": 0.051574043929576874, "step": 21780 }, { "epoch": 7.264843228819212, "loss": 0.24614335596561432, "step": 21780 }, { "ce_loss": 0.028283387422561646, "epoch": 7.264843228819212, "step": 21780 }, { "distill_loss": 0.1770130693912506, "epoch": 7.264843228819212, "step": 21780 }, { "epoch": 7.264843228819212, "ref_ce_loss": 0.04076956957578659, "step": 21780 }, { "epoch": 7.268178785857238, "loss": 0.3231, "step": 21790 }, { "epoch": 7.268178785857238, "grad_norm": 3.716766595840454, "step": 21790 }, { "epoch": 7.268178785857238, "learning_rate": 6.535611411055064e-06, "step": 21790 }, { "epoch": 7.268178785857238, "loss": 0.4391646981239319, "step": 21790 }, { "ce_loss": 0.0490754172205925, "epoch": 7.268178785857238, "step": 21790 }, { "distill_loss": 0.27585989236831665, "epoch": 7.268178785857238, "step": 21790 }, { "epoch": 7.268178785857238, "ref_ce_loss": 0.07593761384487152, "step": 21790 }, { "epoch": 7.268178785857238, "loss": 0.3659456968307495, "step": 21790 }, { "ce_loss": 0.03586322441697121, "epoch": 7.268178785857238, "step": 21790 }, { "distill_loss": 0.2670363783836365, "epoch": 7.268178785857238, "step": 21790 }, { "epoch": 7.268178785857238, "ref_ce_loss": 0.06285092234611511, "step": 21790 }, { "epoch": 7.268178785857238, "loss": 0.34236231446266174, "step": 21790 }, { "ce_loss": 0.06531081348657608, "epoch": 7.268178785857238, "step": 21790 }, { "distill_loss": 0.1928083449602127, "epoch": 7.268178785857238, "step": 21790 }, { "epoch": 7.268178785857238, "ref_ce_loss": 0.04548466578125954, "step": 21790 }, { "epoch": 7.268178785857238, "loss": 0.3572752773761749, "step": 21790 }, { "ce_loss": 0.04397887364029884, "epoch": 7.268178785857238, "step": 21790 }, { "distill_loss": 0.16652613878250122, "epoch": 7.268178785857238, "step": 21790 }, { "epoch": 7.268178785857238, "ref_ce_loss": 0.05783737450838089, "step": 21790 }, { "epoch": 7.271514342895263, "loss": 0.3356, "step": 21800 }, { "epoch": 7.271514342895263, "grad_norm": 3.6929001808166504, "step": 21800 }, { "epoch": 7.271514342895263, "learning_rate": 6.476601562312788e-06, "step": 21800 }, { "epoch": 7.271514342895263, "loss": 0.41686001420021057, "step": 21800 }, { "ce_loss": 0.10789791494607925, "epoch": 7.271514342895263, "step": 21800 }, { "distill_loss": 0.2107146978378296, "epoch": 7.271514342895263, "step": 21800 }, { "epoch": 7.271514342895263, "ref_ce_loss": 0.052300117909908295, "step": 21800 }, { "epoch": 7.271514342895263, "loss": 0.3694479465484619, "step": 21800 }, { "ce_loss": 0.03740653023123741, "epoch": 7.271514342895263, "step": 21800 }, { "distill_loss": 0.16590005159378052, "epoch": 7.271514342895263, "step": 21800 }, { "epoch": 7.271514342895263, "ref_ce_loss": 0.07728464156389236, "step": 21800 }, { "epoch": 7.271514342895263, "loss": 0.2800734043121338, "step": 21800 }, { "ce_loss": 0.04555989429354668, "epoch": 7.271514342895263, "step": 21800 }, { "distill_loss": 0.1753097027540207, "epoch": 7.271514342895263, "step": 21800 }, { "epoch": 7.271514342895263, "ref_ce_loss": 0.04043455794453621, "step": 21800 }, { "epoch": 7.271514342895263, "loss": 0.2379513531923294, "step": 21800 }, { "ce_loss": 0.01604171097278595, "epoch": 7.271514342895263, "step": 21800 }, { "distill_loss": 0.13487288355827332, "epoch": 7.271514342895263, "step": 21800 }, { "epoch": 7.271514342895263, "ref_ce_loss": 0.04081754758954048, "step": 21800 }, { "epoch": 7.2748498999332885, "loss": 0.3518, "step": 21810 }, { "epoch": 7.2748498999332885, "grad_norm": 2.9927523136138916, "step": 21810 }, { "epoch": 7.2748498999332885, "learning_rate": 6.417853443633902e-06, "step": 21810 }, { "epoch": 7.2748498999332885, "loss": 0.2745986580848694, "step": 21810 }, { "ce_loss": 0.09028304368257523, "epoch": 7.2748498999332885, "step": 21810 }, { "distill_loss": 0.14296692609786987, "epoch": 7.2748498999332885, "step": 21810 }, { "epoch": 7.2748498999332885, "ref_ce_loss": 0.03005700558423996, "step": 21810 }, { "epoch": 7.2748498999332885, "loss": 0.4135947525501251, "step": 21810 }, { "ce_loss": 0.023570353165268898, "epoch": 7.2748498999332885, "step": 21810 }, { "distill_loss": 0.33251529932022095, "epoch": 7.2748498999332885, "step": 21810 }, { "epoch": 7.2748498999332885, "ref_ce_loss": 0.05725204572081566, "step": 21810 }, { "epoch": 7.2748498999332885, "loss": 0.21228575706481934, "step": 21810 }, { "ce_loss": 0.01579027622938156, "epoch": 7.2748498999332885, "step": 21810 }, { "distill_loss": 0.14295357465744019, "epoch": 7.2748498999332885, "step": 21810 }, { "epoch": 7.2748498999332885, "ref_ce_loss": 0.05342310294508934, "step": 21810 }, { "epoch": 7.2748498999332885, "loss": 0.28802329301834106, "step": 21810 }, { "ce_loss": 0.02745390497148037, "epoch": 7.2748498999332885, "step": 21810 }, { "distill_loss": 0.1880541741847992, "epoch": 7.2748498999332885, "step": 21810 }, { "epoch": 7.2748498999332885, "ref_ce_loss": 0.02855663187801838, "step": 21810 }, { "epoch": 7.278185456971314, "loss": 0.3351, "step": 21820 }, { "epoch": 7.278185456971314, "grad_norm": 3.009716510772705, "step": 21820 }, { "epoch": 7.278185456971314, "learning_rate": 6.359367162151824e-06, "step": 21820 }, { "epoch": 7.278185456971314, "loss": 0.3255745768547058, "step": 21820 }, { "ce_loss": 0.07791402190923691, "epoch": 7.278185456971314, "step": 21820 }, { "distill_loss": 0.17362235486507416, "epoch": 7.278185456971314, "step": 21820 }, { "epoch": 7.278185456971314, "ref_ce_loss": 0.07391360402107239, "step": 21820 }, { "epoch": 7.278185456971314, "loss": 0.3911256492137909, "step": 21820 }, { "ce_loss": 0.027354300022125244, "epoch": 7.278185456971314, "step": 21820 }, { "distill_loss": 0.3045755922794342, "epoch": 7.278185456971314, "step": 21820 }, { "epoch": 7.278185456971314, "ref_ce_loss": 0.03816691413521767, "step": 21820 }, { "epoch": 7.278185456971314, "loss": 0.5183221101760864, "step": 21820 }, { "ce_loss": 0.03627387061715126, "epoch": 7.278185456971314, "step": 21820 }, { "distill_loss": 0.17436179518699646, "epoch": 7.278185456971314, "step": 21820 }, { "epoch": 7.278185456971314, "ref_ce_loss": 0.06194957718253136, "step": 21820 }, { "epoch": 7.278185456971314, "loss": 0.27898895740509033, "step": 21820 }, { "ce_loss": 0.0026024733670055866, "epoch": 7.278185456971314, "step": 21820 }, { "distill_loss": 0.22231397032737732, "epoch": 7.278185456971314, "step": 21820 }, { "epoch": 7.278185456971314, "ref_ce_loss": 0.035759177058935165, "step": 21820 }, { "epoch": 7.281521014009339, "loss": 0.3531, "step": 21830 }, { "epoch": 7.281521014009339, "grad_norm": 3.047001838684082, "step": 21830 }, { "epoch": 7.281521014009339, "learning_rate": 6.30114282452242e-06, "step": 21830 }, { "epoch": 7.281521014009339, "loss": 0.4379478096961975, "step": 21830 }, { "ce_loss": 0.08663278818130493, "epoch": 7.281521014009339, "step": 21830 }, { "distill_loss": 0.305935800075531, "epoch": 7.281521014009339, "step": 21830 }, { "epoch": 7.281521014009339, "ref_ce_loss": 0.04534162953495979, "step": 21830 }, { "epoch": 7.281521014009339, "loss": 0.4171701669692993, "step": 21830 }, { "ce_loss": 0.01691397652029991, "epoch": 7.281521014009339, "step": 21830 }, { "distill_loss": 0.18893980979919434, "epoch": 7.281521014009339, "step": 21830 }, { "epoch": 7.281521014009339, "ref_ce_loss": 0.04799644649028778, "step": 21830 }, { "epoch": 7.281521014009339, "loss": 0.38544657826423645, "step": 21830 }, { "ce_loss": 0.023883184418082237, "epoch": 7.281521014009339, "step": 21830 }, { "distill_loss": 0.256287544965744, "epoch": 7.281521014009339, "step": 21830 }, { "epoch": 7.281521014009339, "ref_ce_loss": 0.046406473964452744, "step": 21830 }, { "epoch": 7.281521014009339, "loss": 0.336747407913208, "step": 21830 }, { "ce_loss": 0.0368281789124012, "epoch": 7.281521014009339, "step": 21830 }, { "distill_loss": 0.22192564606666565, "epoch": 7.281521014009339, "step": 21830 }, { "epoch": 7.281521014009339, "ref_ce_loss": 0.05088139325380325, "step": 21830 }, { "epoch": 7.2848565710473645, "loss": 0.3154, "step": 21840 }, { "epoch": 7.2848565710473645, "grad_norm": 2.9419236183166504, "step": 21840 }, { "epoch": 7.2848565710473645, "learning_rate": 6.243180536923925e-06, "step": 21840 }, { "epoch": 7.2848565710473645, "loss": 0.21947017312049866, "step": 21840 }, { "ce_loss": 0.045658010989427567, "epoch": 7.2848565710473645, "step": 21840 }, { "distill_loss": 0.1260603815317154, "epoch": 7.2848565710473645, "step": 21840 }, { "epoch": 7.2848565710473645, "ref_ce_loss": 0.0476338192820549, "step": 21840 }, { "epoch": 7.2848565710473645, "loss": 0.2900714576244354, "step": 21840 }, { "ce_loss": 0.012831090949475765, "epoch": 7.2848565710473645, "step": 21840 }, { "distill_loss": 0.21750670671463013, "epoch": 7.2848565710473645, "step": 21840 }, { "epoch": 7.2848565710473645, "ref_ce_loss": 0.05956600233912468, "step": 21840 }, { "epoch": 7.2848565710473645, "loss": 0.3997568190097809, "step": 21840 }, { "ce_loss": 0.0886455699801445, "epoch": 7.2848565710473645, "step": 21840 }, { "distill_loss": 0.20969116687774658, "epoch": 7.2848565710473645, "step": 21840 }, { "epoch": 7.2848565710473645, "ref_ce_loss": 0.07083108276128769, "step": 21840 }, { "epoch": 7.2848565710473645, "loss": 0.4463040232658386, "step": 21840 }, { "ce_loss": 0.08025004714727402, "epoch": 7.2848565710473645, "step": 21840 }, { "distill_loss": 0.2879204750061035, "epoch": 7.2848565710473645, "step": 21840 }, { "epoch": 7.2848565710473645, "ref_ce_loss": 0.06017610430717468, "step": 21840 }, { "epoch": 7.28819212808539, "loss": 0.3874, "step": 21850 }, { "epoch": 7.28819212808539, "grad_norm": 3.621523380279541, "step": 21850 }, { "epoch": 7.28819212808539, "learning_rate": 6.185480405056686e-06, "step": 21850 }, { "epoch": 7.28819212808539, "loss": 0.3507860004901886, "step": 21850 }, { "ce_loss": 0.007834719493985176, "epoch": 7.28819212808539, "step": 21850 }, { "distill_loss": 0.12249279022216797, "epoch": 7.28819212808539, "step": 21850 }, { "epoch": 7.28819212808539, "ref_ce_loss": 0.06363532692193985, "step": 21850 }, { "epoch": 7.28819212808539, "loss": 0.39529716968536377, "step": 21850 }, { "ce_loss": 0.03328666463494301, "epoch": 7.28819212808539, "step": 21850 }, { "distill_loss": 0.1468905508518219, "epoch": 7.28819212808539, "step": 21850 }, { "epoch": 7.28819212808539, "ref_ce_loss": 0.0494842529296875, "step": 21850 }, { "epoch": 7.28819212808539, "loss": 0.28177881240844727, "step": 21850 }, { "ce_loss": 0.07029473036527634, "epoch": 7.28819212808539, "step": 21850 }, { "distill_loss": 0.16273248195648193, "epoch": 7.28819212808539, "step": 21850 }, { "epoch": 7.28819212808539, "ref_ce_loss": 0.04860760644078255, "step": 21850 }, { "epoch": 7.28819212808539, "loss": 0.16786929965019226, "step": 21850 }, { "ce_loss": 0.0027700222562998533, "epoch": 7.28819212808539, "step": 21850 }, { "distill_loss": 0.12942123413085938, "epoch": 7.28819212808539, "step": 21850 }, { "epoch": 7.28819212808539, "ref_ce_loss": 0.035599589347839355, "step": 21850 }, { "epoch": 7.291527685123415, "loss": 0.3397, "step": 21860 }, { "epoch": 7.291527685123415, "grad_norm": 3.5767362117767334, "step": 21860 }, { "epoch": 7.291527685123415, "learning_rate": 6.128042534143002e-06, "step": 21860 }, { "epoch": 7.291527685123415, "loss": 0.3591082692146301, "step": 21860 }, { "ce_loss": 0.012450255453586578, "epoch": 7.291527685123415, "step": 21860 }, { "distill_loss": 0.20948486030101776, "epoch": 7.291527685123415, "step": 21860 }, { "epoch": 7.291527685123415, "ref_ce_loss": 0.042204998433589935, "step": 21860 }, { "epoch": 7.291527685123415, "loss": 0.3279259204864502, "step": 21860 }, { "ce_loss": 0.04447029158473015, "epoch": 7.291527685123415, "step": 21860 }, { "distill_loss": 0.21123427152633667, "epoch": 7.291527685123415, "step": 21860 }, { "epoch": 7.291527685123415, "ref_ce_loss": 0.05294876918196678, "step": 21860 }, { "epoch": 7.291527685123415, "loss": 0.3567955195903778, "step": 21860 }, { "ce_loss": 0.08325055241584778, "epoch": 7.291527685123415, "step": 21860 }, { "distill_loss": 0.16381686925888062, "epoch": 7.291527685123415, "step": 21860 }, { "epoch": 7.291527685123415, "ref_ce_loss": 0.07084712386131287, "step": 21860 }, { "epoch": 7.291527685123415, "loss": 0.22216692566871643, "step": 21860 }, { "ce_loss": 0.03289766609668732, "epoch": 7.291527685123415, "step": 21860 }, { "distill_loss": 0.15384438633918762, "epoch": 7.291527685123415, "step": 21860 }, { "epoch": 7.291527685123415, "ref_ce_loss": 0.02933252416551113, "step": 21860 }, { "epoch": 7.2948632421614406, "loss": 0.3305, "step": 21870 }, { "epoch": 7.2948632421614406, "grad_norm": 5.346068859100342, "step": 21870 }, { "epoch": 7.2948632421614406, "learning_rate": 6.070867028926868e-06, "step": 21870 }, { "epoch": 7.2948632421614406, "loss": 0.42858025431632996, "step": 21870 }, { "ce_loss": 0.023396633565425873, "epoch": 7.2948632421614406, "step": 21870 }, { "distill_loss": 0.30239880084991455, "epoch": 7.2948632421614406, "step": 21870 }, { "epoch": 7.2948632421614406, "ref_ce_loss": 0.07331034541130066, "step": 21870 }, { "epoch": 7.2948632421614406, "loss": 0.3239220678806305, "step": 21870 }, { "ce_loss": 0.009180337190628052, "epoch": 7.2948632421614406, "step": 21870 }, { "distill_loss": 0.13598200678825378, "epoch": 7.2948632421614406, "step": 21870 }, { "epoch": 7.2948632421614406, "ref_ce_loss": 0.05417673662304878, "step": 21870 }, { "epoch": 7.2948632421614406, "loss": 0.21557603776454926, "step": 21870 }, { "ce_loss": 0.012299394235014915, "epoch": 7.2948632421614406, "step": 21870 }, { "distill_loss": 0.12474885582923889, "epoch": 7.2948632421614406, "step": 21870 }, { "epoch": 7.2948632421614406, "ref_ce_loss": 0.05441064387559891, "step": 21870 }, { "epoch": 7.2948632421614406, "loss": 0.32964056730270386, "step": 21870 }, { "ce_loss": 0.05307295173406601, "epoch": 7.2948632421614406, "step": 21870 }, { "distill_loss": 0.17791429162025452, "epoch": 7.2948632421614406, "step": 21870 }, { "epoch": 7.2948632421614406, "ref_ce_loss": 0.042303744703531265, "step": 21870 }, { "epoch": 7.298198799199466, "loss": 0.327, "step": 21880 }, { "epoch": 7.298198799199466, "grad_norm": 2.8577730655670166, "step": 21880 }, { "epoch": 7.298198799199466, "learning_rate": 6.0139539936738975e-06, "step": 21880 }, { "epoch": 7.298198799199466, "loss": 0.3614603877067566, "step": 21880 }, { "ce_loss": 0.1079586073756218, "epoch": 7.298198799199466, "step": 21880 }, { "distill_loss": 0.1908852756023407, "epoch": 7.298198799199466, "step": 21880 }, { "epoch": 7.298198799199466, "ref_ce_loss": 0.04335471987724304, "step": 21880 }, { "epoch": 7.298198799199466, "loss": 0.2224392294883728, "step": 21880 }, { "ce_loss": 0.04111739993095398, "epoch": 7.298198799199466, "step": 21880 }, { "distill_loss": 0.12654203176498413, "epoch": 7.298198799199466, "step": 21880 }, { "epoch": 7.298198799199466, "ref_ce_loss": 0.0440577007830143, "step": 21880 }, { "epoch": 7.298198799199466, "loss": 0.24377888441085815, "step": 21880 }, { "ce_loss": 0.02872404456138611, "epoch": 7.298198799199466, "step": 21880 }, { "distill_loss": 0.11825037002563477, "epoch": 7.298198799199466, "step": 21880 }, { "epoch": 7.298198799199466, "ref_ce_loss": 0.03949522599577904, "step": 21880 }, { "epoch": 7.298198799199466, "loss": 0.6034325957298279, "step": 21880 }, { "ce_loss": 0.023845089599490166, "epoch": 7.298198799199466, "step": 21880 }, { "distill_loss": 0.22611646354198456, "epoch": 7.298198799199466, "step": 21880 }, { "epoch": 7.298198799199466, "ref_ce_loss": 0.04899800941348076, "step": 21880 }, { "epoch": 7.301534356237491, "loss": 0.3174, "step": 21890 }, { "epoch": 7.301534356237491, "grad_norm": 2.451704740524292, "step": 21890 }, { "epoch": 7.301534356237491, "learning_rate": 5.9573035321709535e-06, "step": 21890 }, { "epoch": 7.301534356237491, "loss": 0.254489928483963, "step": 21890 }, { "ce_loss": 0.017234718427062035, "epoch": 7.301534356237491, "step": 21890 }, { "distill_loss": 0.18709203600883484, "epoch": 7.301534356237491, "step": 21890 }, { "epoch": 7.301534356237491, "ref_ce_loss": 0.03838713467121124, "step": 21890 }, { "epoch": 7.301534356237491, "loss": 0.5210539102554321, "step": 21890 }, { "ce_loss": 0.010104808956384659, "epoch": 7.301534356237491, "step": 21890 }, { "distill_loss": 0.1786583811044693, "epoch": 7.301534356237491, "step": 21890 }, { "epoch": 7.301534356237491, "ref_ce_loss": 0.08645079284906387, "step": 21890 }, { "epoch": 7.301534356237491, "loss": 0.5077769756317139, "step": 21890 }, { "ce_loss": 0.04721241444349289, "epoch": 7.301534356237491, "step": 21890 }, { "distill_loss": 0.12380172312259674, "epoch": 7.301534356237491, "step": 21890 }, { "epoch": 7.301534356237491, "ref_ce_loss": 0.06811454147100449, "step": 21890 }, { "epoch": 7.301534356237491, "loss": 0.4189784526824951, "step": 21890 }, { "ce_loss": 0.024810247123241425, "epoch": 7.301534356237491, "step": 21890 }, { "distill_loss": 0.3204716444015503, "epoch": 7.301534356237491, "step": 21890 }, { "epoch": 7.301534356237491, "ref_ce_loss": 0.04616758972406387, "step": 21890 }, { "epoch": 7.304869913275517, "loss": 0.3187, "step": 21900 }, { "epoch": 7.304869913275517, "grad_norm": 2.8359055519104004, "step": 21900 }, { "epoch": 7.304869913275517, "learning_rate": 5.900915747726182e-06, "step": 21900 }, { "epoch": 7.304869913275517, "loss": 0.2653408348560333, "step": 21900 }, { "ce_loss": 0.034685708582401276, "epoch": 7.304869913275517, "step": 21900 }, { "distill_loss": 0.1382124423980713, "epoch": 7.304869913275517, "step": 21900 }, { "epoch": 7.304869913275517, "ref_ce_loss": 0.05031518265604973, "step": 21900 }, { "epoch": 7.304869913275517, "loss": 0.3756208121776581, "step": 21900 }, { "ce_loss": 0.062474705278873444, "epoch": 7.304869913275517, "step": 21900 }, { "distill_loss": 0.1996249258518219, "epoch": 7.304869913275517, "step": 21900 }, { "epoch": 7.304869913275517, "ref_ce_loss": 0.06248785927891731, "step": 21900 }, { "epoch": 7.304869913275517, "loss": 0.3942401111125946, "step": 21900 }, { "ce_loss": 0.017181461676955223, "epoch": 7.304869913275517, "step": 21900 }, { "distill_loss": 0.26439082622528076, "epoch": 7.304869913275517, "step": 21900 }, { "epoch": 7.304869913275517, "ref_ce_loss": 0.0879964753985405, "step": 21900 }, { "epoch": 7.304869913275517, "loss": 0.29367902874946594, "step": 21900 }, { "ce_loss": 0.021005744114518166, "epoch": 7.304869913275517, "step": 21900 }, { "distill_loss": 0.1865503340959549, "epoch": 7.304869913275517, "step": 21900 }, { "epoch": 7.304869913275517, "ref_ce_loss": 0.06075814738869667, "step": 21900 }, { "epoch": 7.308205470313542, "loss": 0.3228, "step": 21910 }, { "epoch": 7.308205470313542, "grad_norm": 5.696154594421387, "step": 21910 }, { "epoch": 7.308205470313542, "learning_rate": 5.844790743168593e-06, "step": 21910 }, { "epoch": 7.308205470313542, "loss": 0.5747356414794922, "step": 21910 }, { "ce_loss": 0.04167019948363304, "epoch": 7.308205470313542, "step": 21910 }, { "distill_loss": 0.376578688621521, "epoch": 7.308205470313542, "step": 21910 }, { "epoch": 7.308205470313542, "ref_ce_loss": 0.061236023902893066, "step": 21910 }, { "epoch": 7.308205470313542, "loss": 0.2941112816333771, "step": 21910 }, { "ce_loss": 0.03991208225488663, "epoch": 7.308205470313542, "step": 21910 }, { "distill_loss": 0.17364008724689484, "epoch": 7.308205470313542, "step": 21910 }, { "epoch": 7.308205470313542, "ref_ce_loss": 0.04996495693922043, "step": 21910 }, { "epoch": 7.308205470313542, "loss": 0.5141149759292603, "step": 21910 }, { "ce_loss": 0.021073229610919952, "epoch": 7.308205470313542, "step": 21910 }, { "distill_loss": 0.308474600315094, "epoch": 7.308205470313542, "step": 21910 }, { "epoch": 7.308205470313542, "ref_ce_loss": 0.03844856843352318, "step": 21910 }, { "epoch": 7.308205470313542, "loss": 0.3157500922679901, "step": 21910 }, { "ce_loss": 0.0359286367893219, "epoch": 7.308205470313542, "step": 21910 }, { "distill_loss": 0.1874678134918213, "epoch": 7.308205470313542, "step": 21910 }, { "epoch": 7.308205470313542, "ref_ce_loss": 0.04697670415043831, "step": 21910 }, { "epoch": 7.311541027351567, "loss": 0.3232, "step": 21920 }, { "epoch": 7.311541027351567, "grad_norm": 3.316659688949585, "step": 21920 }, { "epoch": 7.311541027351567, "learning_rate": 5.788928620848115e-06, "step": 21920 }, { "epoch": 7.311541027351567, "loss": 0.3719853460788727, "step": 21920 }, { "ce_loss": 0.046290088444948196, "epoch": 7.311541027351567, "step": 21920 }, { "distill_loss": 0.19583165645599365, "epoch": 7.311541027351567, "step": 21920 }, { "epoch": 7.311541027351567, "ref_ce_loss": 0.06114131957292557, "step": 21920 }, { "epoch": 7.311541027351567, "loss": 0.3432145118713379, "step": 21920 }, { "ce_loss": 0.02922457829117775, "epoch": 7.311541027351567, "step": 21920 }, { "distill_loss": 0.2272334098815918, "epoch": 7.311541027351567, "step": 21920 }, { "epoch": 7.311541027351567, "ref_ce_loss": 0.0575200691819191, "step": 21920 }, { "epoch": 7.311541027351567, "loss": 0.3053845763206482, "step": 21920 }, { "ce_loss": 0.010052576661109924, "epoch": 7.311541027351567, "step": 21920 }, { "distill_loss": 0.15204855799674988, "epoch": 7.311541027351567, "step": 21920 }, { "epoch": 7.311541027351567, "ref_ce_loss": 0.043563369661569595, "step": 21920 }, { "epoch": 7.311541027351567, "loss": 0.3522624671459198, "step": 21920 }, { "ce_loss": 0.04388388246297836, "epoch": 7.311541027351567, "step": 21920 }, { "distill_loss": 0.2134374976158142, "epoch": 7.311541027351567, "step": 21920 }, { "epoch": 7.311541027351567, "ref_ce_loss": 0.06627210229635239, "step": 21920 }, { "epoch": 7.314876584389593, "loss": 0.3748, "step": 21930 }, { "epoch": 7.314876584389593, "grad_norm": 3.1502254009246826, "step": 21930 }, { "epoch": 7.314876584389593, "learning_rate": 5.73332948263523e-06, "step": 21930 }, { "epoch": 7.314876584389593, "loss": 0.6507969498634338, "step": 21930 }, { "ce_loss": 0.005436472594738007, "epoch": 7.314876584389593, "step": 21930 }, { "distill_loss": 0.13547836244106293, "epoch": 7.314876584389593, "step": 21930 }, { "epoch": 7.314876584389593, "ref_ce_loss": 0.03680480644106865, "step": 21930 }, { "epoch": 7.314876584389593, "loss": 0.26195141673088074, "step": 21930 }, { "ce_loss": 0.026195021346211433, "epoch": 7.314876584389593, "step": 21930 }, { "distill_loss": 0.1941198855638504, "epoch": 7.314876584389593, "step": 21930 }, { "epoch": 7.314876584389593, "ref_ce_loss": 0.0410786047577858, "step": 21930 }, { "epoch": 7.314876584389593, "loss": 0.27653250098228455, "step": 21930 }, { "ce_loss": 0.0419037900865078, "epoch": 7.314876584389593, "step": 21930 }, { "distill_loss": 0.18454256653785706, "epoch": 7.314876584389593, "step": 21930 }, { "epoch": 7.314876584389593, "ref_ce_loss": 0.027803665027022362, "step": 21930 }, { "epoch": 7.314876584389593, "loss": 0.4033946096897125, "step": 21930 }, { "ce_loss": 0.0186313409358263, "epoch": 7.314876584389593, "step": 21930 }, { "distill_loss": 0.2735311686992645, "epoch": 7.314876584389593, "step": 21930 }, { "epoch": 7.314876584389593, "ref_ce_loss": 0.05757341533899307, "step": 21930 }, { "epoch": 7.318212141427618, "loss": 0.3289, "step": 21940 }, { "epoch": 7.318212141427618, "grad_norm": 3.537044048309326, "step": 21940 }, { "epoch": 7.318212141427618, "learning_rate": 5.677993429920796e-06, "step": 21940 }, { "epoch": 7.318212141427618, "loss": 0.28274813294410706, "step": 21940 }, { "ce_loss": 0.025311071425676346, "epoch": 7.318212141427618, "step": 21940 }, { "distill_loss": 0.20105645060539246, "epoch": 7.318212141427618, "step": 21940 }, { "epoch": 7.318212141427618, "ref_ce_loss": 0.05622454360127449, "step": 21940 }, { "epoch": 7.318212141427618, "loss": 0.3809490501880646, "step": 21940 }, { "ce_loss": 0.012795322574675083, "epoch": 7.318212141427618, "step": 21940 }, { "distill_loss": 0.27297529578208923, "epoch": 7.318212141427618, "step": 21940 }, { "epoch": 7.318212141427618, "ref_ce_loss": 0.06003205478191376, "step": 21940 }, { "epoch": 7.318212141427618, "loss": 0.16590841114521027, "step": 21940 }, { "ce_loss": 0.026391826570034027, "epoch": 7.318212141427618, "step": 21940 }, { "distill_loss": 0.0870550125837326, "epoch": 7.318212141427618, "step": 21940 }, { "epoch": 7.318212141427618, "ref_ce_loss": 0.033449750393629074, "step": 21940 }, { "epoch": 7.318212141427618, "loss": 0.36872372031211853, "step": 21940 }, { "ce_loss": 0.02109898068010807, "epoch": 7.318212141427618, "step": 21940 }, { "distill_loss": 0.25131669640541077, "epoch": 7.318212141427618, "step": 21940 }, { "epoch": 7.318212141427618, "ref_ce_loss": 0.03654967620968819, "step": 21940 }, { "epoch": 7.321547698465643, "loss": 0.3165, "step": 21950 }, { "epoch": 7.321547698465643, "grad_norm": 2.954684257507324, "step": 21950 }, { "epoch": 7.321547698465643, "learning_rate": 5.6229205636159794e-06, "step": 21950 }, { "epoch": 7.321547698465643, "loss": 0.44153302907943726, "step": 21950 }, { "ce_loss": 0.015158475376665592, "epoch": 7.321547698465643, "step": 21950 }, { "distill_loss": 0.1195288747549057, "epoch": 7.321547698465643, "step": 21950 }, { "epoch": 7.321547698465643, "ref_ce_loss": 0.0664001852273941, "step": 21950 }, { "epoch": 7.321547698465643, "loss": 0.22423836588859558, "step": 21950 }, { "ce_loss": 0.011930732056498528, "epoch": 7.321547698465643, "step": 21950 }, { "distill_loss": 0.17623679339885712, "epoch": 7.321547698465643, "step": 21950 }, { "epoch": 7.321547698465643, "ref_ce_loss": 0.0360279381275177, "step": 21950 }, { "epoch": 7.321547698465643, "loss": 0.2697838544845581, "step": 21950 }, { "ce_loss": 0.010306882672011852, "epoch": 7.321547698465643, "step": 21950 }, { "distill_loss": 0.19858577847480774, "epoch": 7.321547698465643, "step": 21950 }, { "epoch": 7.321547698465643, "ref_ce_loss": 0.047593116760253906, "step": 21950 }, { "epoch": 7.321547698465643, "loss": 0.2993145287036896, "step": 21950 }, { "ce_loss": 0.055308952927589417, "epoch": 7.321547698465643, "step": 21950 }, { "distill_loss": 0.15778449177742004, "epoch": 7.321547698465643, "step": 21950 }, { "epoch": 7.321547698465643, "ref_ce_loss": 0.06056550145149231, "step": 21950 }, { "epoch": 7.324883255503669, "loss": 0.3201, "step": 21960 }, { "epoch": 7.324883255503669, "grad_norm": 2.9763293266296387, "step": 21960 }, { "epoch": 7.324883255503669, "learning_rate": 5.568110984151925e-06, "step": 21960 }, { "epoch": 7.324883255503669, "loss": 0.40113523602485657, "step": 21960 }, { "ce_loss": 0.04013872146606445, "epoch": 7.324883255503669, "step": 21960 }, { "distill_loss": 0.23629052937030792, "epoch": 7.324883255503669, "step": 21960 }, { "epoch": 7.324883255503669, "ref_ce_loss": 0.055267203599214554, "step": 21960 }, { "epoch": 7.324883255503669, "loss": 0.2200232297182083, "step": 21960 }, { "ce_loss": 0.006899657659232616, "epoch": 7.324883255503669, "step": 21960 }, { "distill_loss": 0.13835325837135315, "epoch": 7.324883255503669, "step": 21960 }, { "epoch": 7.324883255503669, "ref_ce_loss": 0.03750453144311905, "step": 21960 }, { "epoch": 7.324883255503669, "loss": 0.34387919306755066, "step": 21960 }, { "ce_loss": 0.05290165916085243, "epoch": 7.324883255503669, "step": 21960 }, { "distill_loss": 0.1321592777967453, "epoch": 7.324883255503669, "step": 21960 }, { "epoch": 7.324883255503669, "ref_ce_loss": 0.07124420255422592, "step": 21960 }, { "epoch": 7.324883255503669, "loss": 0.364957332611084, "step": 21960 }, { "ce_loss": 0.03829869255423546, "epoch": 7.324883255503669, "step": 21960 }, { "distill_loss": 0.15634045004844666, "epoch": 7.324883255503669, "step": 21960 }, { "epoch": 7.324883255503669, "ref_ce_loss": 0.044923074543476105, "step": 21960 }, { "epoch": 7.328218812541694, "loss": 0.3256, "step": 21970 }, { "epoch": 7.328218812541694, "grad_norm": 4.486271381378174, "step": 21970 }, { "epoch": 7.328218812541694, "learning_rate": 5.513564791479697e-06, "step": 21970 }, { "epoch": 7.328218812541694, "loss": 0.28296107053756714, "step": 21970 }, { "ce_loss": 0.02217467688024044, "epoch": 7.328218812541694, "step": 21970 }, { "distill_loss": 0.20849867165088654, "epoch": 7.328218812541694, "step": 21970 }, { "epoch": 7.328218812541694, "ref_ce_loss": 0.04285474866628647, "step": 21970 }, { "epoch": 7.328218812541694, "loss": 0.35586079955101013, "step": 21970 }, { "ce_loss": 0.03993077948689461, "epoch": 7.328218812541694, "step": 21970 }, { "distill_loss": 0.20565329492092133, "epoch": 7.328218812541694, "step": 21970 }, { "epoch": 7.328218812541694, "ref_ce_loss": 0.056240327656269073, "step": 21970 }, { "epoch": 7.328218812541694, "loss": 0.41194766759872437, "step": 21970 }, { "ce_loss": 0.08907749503850937, "epoch": 7.328218812541694, "step": 21970 }, { "distill_loss": 0.21700620651245117, "epoch": 7.328218812541694, "step": 21970 }, { "epoch": 7.328218812541694, "ref_ce_loss": 0.060150161385536194, "step": 21970 }, { "epoch": 7.328218812541694, "loss": 0.36081603169441223, "step": 21970 }, { "ce_loss": 0.010890948586165905, "epoch": 7.328218812541694, "step": 21970 }, { "distill_loss": 0.12495280802249908, "epoch": 7.328218812541694, "step": 21970 }, { "epoch": 7.328218812541694, "ref_ce_loss": 0.03179723769426346, "step": 21970 }, { "epoch": 7.331554369579719, "loss": 0.3436, "step": 21980 }, { "epoch": 7.331554369579719, "grad_norm": 2.896735429763794, "step": 21980 }, { "epoch": 7.331554369579719, "learning_rate": 5.45928208507006e-06, "step": 21980 }, { "epoch": 7.331554369579719, "loss": 0.38351812958717346, "step": 21980 }, { "ce_loss": 0.01840709149837494, "epoch": 7.331554369579719, "step": 21980 }, { "distill_loss": 0.24827256798744202, "epoch": 7.331554369579719, "step": 21980 }, { "epoch": 7.331554369579719, "ref_ce_loss": 0.05634456127882004, "step": 21980 }, { "epoch": 7.331554369579719, "loss": 0.25412923097610474, "step": 21980 }, { "ce_loss": 0.03049309179186821, "epoch": 7.331554369579719, "step": 21980 }, { "distill_loss": 0.15272663533687592, "epoch": 7.331554369579719, "step": 21980 }, { "epoch": 7.331554369579719, "ref_ce_loss": 0.038149669766426086, "step": 21980 }, { "epoch": 7.331554369579719, "loss": 0.40757080912590027, "step": 21980 }, { "ce_loss": 0.029884718358516693, "epoch": 7.331554369579719, "step": 21980 }, { "distill_loss": 0.2894759476184845, "epoch": 7.331554369579719, "step": 21980 }, { "epoch": 7.331554369579719, "ref_ce_loss": 0.05772984027862549, "step": 21980 }, { "epoch": 7.331554369579719, "loss": 0.3815244436264038, "step": 21980 }, { "ce_loss": 0.024049028754234314, "epoch": 7.331554369579719, "step": 21980 }, { "distill_loss": 0.1359594762325287, "epoch": 7.331554369579719, "step": 21980 }, { "epoch": 7.331554369579719, "ref_ce_loss": 0.05487595498561859, "step": 21980 }, { "epoch": 7.334889926617745, "loss": 0.3177, "step": 21990 }, { "epoch": 7.334889926617745, "grad_norm": 3.9288063049316406, "step": 21990 }, { "epoch": 7.334889926617745, "learning_rate": 5.405262963913231e-06, "step": 21990 }, { "epoch": 7.334889926617745, "loss": 0.3045029044151306, "step": 21990 }, { "ce_loss": 0.018765542656183243, "epoch": 7.334889926617745, "step": 21990 }, { "distill_loss": 0.19060693681240082, "epoch": 7.334889926617745, "step": 21990 }, { "epoch": 7.334889926617745, "ref_ce_loss": 0.05052812024950981, "step": 21990 }, { "epoch": 7.334889926617745, "loss": 0.37354931235313416, "step": 21990 }, { "ce_loss": 0.03803478553891182, "epoch": 7.334889926617745, "step": 21990 }, { "distill_loss": 0.15719890594482422, "epoch": 7.334889926617745, "step": 21990 }, { "epoch": 7.334889926617745, "ref_ce_loss": 0.044708650559186935, "step": 21990 }, { "epoch": 7.334889926617745, "loss": 0.25138357281684875, "step": 21990 }, { "ce_loss": 0.03792501613497734, "epoch": 7.334889926617745, "step": 21990 }, { "distill_loss": 0.13845518231391907, "epoch": 7.334889926617745, "step": 21990 }, { "epoch": 7.334889926617745, "ref_ce_loss": 0.04259892553091049, "step": 21990 }, { "epoch": 7.334889926617745, "loss": 0.30000004172325134, "step": 21990 }, { "ce_loss": 0.016089096665382385, "epoch": 7.334889926617745, "step": 21990 }, { "distill_loss": 0.21034321188926697, "epoch": 7.334889926617745, "step": 21990 }, { "epoch": 7.334889926617745, "ref_ce_loss": 0.050738152116537094, "step": 21990 }, { "epoch": 7.33822548365577, "loss": 0.329, "step": 22000 }, { "epoch": 7.33822548365577, "grad_norm": 3.6049487590789795, "step": 22000 }, { "epoch": 7.33822548365577, "learning_rate": 5.351507526518811e-06, "step": 22000 }, { "epoch": 7.33822548365577, "loss": 0.7393406629562378, "step": 22000 }, { "ce_loss": 0.03983442857861519, "epoch": 7.33822548365577, "step": 22000 }, { "distill_loss": 0.23520348966121674, "epoch": 7.33822548365577, "step": 22000 }, { "epoch": 7.33822548365577, "ref_ce_loss": 0.04483520984649658, "step": 22000 }, { "epoch": 7.33822548365577, "loss": 0.24591264128684998, "step": 22000 }, { "ce_loss": 0.029901692643761635, "epoch": 7.33822548365577, "step": 22000 }, { "distill_loss": 0.15099608898162842, "epoch": 7.33822548365577, "step": 22000 }, { "epoch": 7.33822548365577, "ref_ce_loss": 0.046401675790548325, "step": 22000 }, { "epoch": 7.33822548365577, "loss": 0.15313398838043213, "step": 22000 }, { "ce_loss": 0.013905913569033146, "epoch": 7.33822548365577, "step": 22000 }, { "distill_loss": 0.08073040097951889, "epoch": 7.33822548365577, "step": 22000 }, { "epoch": 7.33822548365577, "ref_ce_loss": 0.022350676357746124, "step": 22000 }, { "epoch": 7.33822548365577, "loss": 0.13868626952171326, "step": 22000 }, { "ce_loss": 0.0033345171250402927, "epoch": 7.33822548365577, "step": 22000 }, { "distill_loss": 0.08276716619729996, "epoch": 7.33822548365577, "step": 22000 }, { "epoch": 7.33822548365577, "ref_ce_loss": 0.03727690130472183, "step": 22000 }, { "epoch": 7.3415610406937954, "loss": 0.3285, "step": 22010 }, { "epoch": 7.3415610406937954, "grad_norm": 2.786442518234253, "step": 22010 }, { "epoch": 7.3415610406937954, "learning_rate": 5.2980158709154504e-06, "step": 22010 }, { "epoch": 7.3415610406937954, "loss": 0.20294231176376343, "step": 22010 }, { "ce_loss": 0.02882452681660652, "epoch": 7.3415610406937954, "step": 22010 }, { "distill_loss": 0.11589177697896957, "epoch": 7.3415610406937954, "step": 22010 }, { "epoch": 7.3415610406937954, "ref_ce_loss": 0.058139339089393616, "step": 22010 }, { "epoch": 7.3415610406937954, "loss": 0.3361784517765045, "step": 22010 }, { "ce_loss": 0.0275823213160038, "epoch": 7.3415610406937954, "step": 22010 }, { "distill_loss": 0.21154193580150604, "epoch": 7.3415610406937954, "step": 22010 }, { "epoch": 7.3415610406937954, "ref_ce_loss": 0.07388574630022049, "step": 22010 }, { "epoch": 7.3415610406937954, "loss": 0.4463978707790375, "step": 22010 }, { "ce_loss": 0.03035353124141693, "epoch": 7.3415610406937954, "step": 22010 }, { "distill_loss": 0.24757204949855804, "epoch": 7.3415610406937954, "step": 22010 }, { "epoch": 7.3415610406937954, "ref_ce_loss": 0.054496344178915024, "step": 22010 }, { "epoch": 7.3415610406937954, "loss": 0.3473254442214966, "step": 22010 }, { "ce_loss": 0.03475075587630272, "epoch": 7.3415610406937954, "step": 22010 }, { "distill_loss": 0.27547967433929443, "epoch": 7.3415610406937954, "step": 22010 }, { "epoch": 7.3415610406937954, "ref_ce_loss": 0.036626383662223816, "step": 22010 }, { "epoch": 7.344896597731822, "loss": 0.324, "step": 22020 }, { "epoch": 7.344896597731822, "grad_norm": 2.6016852855682373, "step": 22020 }, { "epoch": 7.344896597731822, "learning_rate": 5.244788094650887e-06, "step": 22020 }, { "epoch": 7.344896597731822, "loss": 0.22592085599899292, "step": 22020 }, { "ce_loss": 0.006041758228093386, "epoch": 7.344896597731822, "step": 22020 }, { "distill_loss": 0.16178952157497406, "epoch": 7.344896597731822, "step": 22020 }, { "epoch": 7.344896597731822, "ref_ce_loss": 0.03367564082145691, "step": 22020 }, { "epoch": 7.344896597731822, "loss": 0.3396591544151306, "step": 22020 }, { "ce_loss": 0.033659134060144424, "epoch": 7.344896597731822, "step": 22020 }, { "distill_loss": 0.13412310183048248, "epoch": 7.344896597731822, "step": 22020 }, { "epoch": 7.344896597731822, "ref_ce_loss": 0.044457755982875824, "step": 22020 }, { "epoch": 7.344896597731822, "loss": 0.34963589906692505, "step": 22020 }, { "ce_loss": 0.021980000659823418, "epoch": 7.344896597731822, "step": 22020 }, { "distill_loss": 0.2693895101547241, "epoch": 7.344896597731822, "step": 22020 }, { "epoch": 7.344896597731822, "ref_ce_loss": 0.058126404881477356, "step": 22020 }, { "epoch": 7.344896597731822, "loss": 0.3431185483932495, "step": 22020 }, { "ce_loss": 0.026764724403619766, "epoch": 7.344896597731822, "step": 22020 }, { "distill_loss": 0.15950065851211548, "epoch": 7.344896597731822, "step": 22020 }, { "epoch": 7.344896597731822, "ref_ce_loss": 0.04904036968946457, "step": 22020 }, { "epoch": 7.348232154769846, "loss": 0.3301, "step": 22030 }, { "epoch": 7.348232154769846, "grad_norm": 2.480509042739868, "step": 22030 }, { "epoch": 7.348232154769846, "learning_rate": 5.191824294791558e-06, "step": 22030 }, { "epoch": 7.348232154769846, "loss": 0.5115179419517517, "step": 22030 }, { "ce_loss": 0.05640953779220581, "epoch": 7.348232154769846, "step": 22030 }, { "distill_loss": 0.2712617814540863, "epoch": 7.348232154769846, "step": 22030 }, { "epoch": 7.348232154769846, "ref_ce_loss": 0.06187128275632858, "step": 22030 }, { "epoch": 7.348232154769846, "loss": 0.22394618391990662, "step": 22030 }, { "ce_loss": 0.013718812726438046, "epoch": 7.348232154769846, "step": 22030 }, { "distill_loss": 0.1504044383764267, "epoch": 7.348232154769846, "step": 22030 }, { "epoch": 7.348232154769846, "ref_ce_loss": 0.03647022321820259, "step": 22030 }, { "epoch": 7.348232154769846, "loss": 0.2851472795009613, "step": 22030 }, { "ce_loss": 0.05637501925230026, "epoch": 7.348232154769846, "step": 22030 }, { "distill_loss": 0.14488068222999573, "epoch": 7.348232154769846, "step": 22030 }, { "epoch": 7.348232154769846, "ref_ce_loss": 0.05806015431880951, "step": 22030 }, { "epoch": 7.348232154769846, "loss": 0.2885356545448303, "step": 22030 }, { "ce_loss": 0.06287986040115356, "epoch": 7.348232154769846, "step": 22030 }, { "distill_loss": 0.15702137351036072, "epoch": 7.348232154769846, "step": 22030 }, { "epoch": 7.348232154769846, "ref_ce_loss": 0.06848686933517456, "step": 22030 }, { "epoch": 7.351567711807872, "loss": 0.3485, "step": 22040 }, { "epoch": 7.351567711807872, "grad_norm": 5.346456050872803, "step": 22040 }, { "epoch": 7.351567711807872, "learning_rate": 5.139124567922553e-06, "step": 22040 }, { "epoch": 7.351567711807872, "loss": 0.3215425908565521, "step": 22040 }, { "ce_loss": 0.03501839190721512, "epoch": 7.351567711807872, "step": 22040 }, { "distill_loss": 0.25432682037353516, "epoch": 7.351567711807872, "step": 22040 }, { "epoch": 7.351567711807872, "ref_ce_loss": 0.032068297266960144, "step": 22040 }, { "epoch": 7.351567711807872, "loss": 0.24832525849342346, "step": 22040 }, { "ce_loss": 0.01957583799958229, "epoch": 7.351567711807872, "step": 22040 }, { "distill_loss": 0.14824563264846802, "epoch": 7.351567711807872, "step": 22040 }, { "epoch": 7.351567711807872, "ref_ce_loss": 0.03626695275306702, "step": 22040 }, { "epoch": 7.351567711807872, "loss": 0.22427628934383392, "step": 22040 }, { "ce_loss": 0.023650500923395157, "epoch": 7.351567711807872, "step": 22040 }, { "distill_loss": 0.15130510926246643, "epoch": 7.351567711807872, "step": 22040 }, { "epoch": 7.351567711807872, "ref_ce_loss": 0.04904693365097046, "step": 22040 }, { "epoch": 7.351567711807872, "loss": 0.4551049768924713, "step": 22040 }, { "ce_loss": 0.02893562614917755, "epoch": 7.351567711807872, "step": 22040 }, { "distill_loss": 0.24910324811935425, "epoch": 7.351567711807872, "step": 22040 }, { "epoch": 7.351567711807872, "ref_ce_loss": 0.04241586849093437, "step": 22040 }, { "epoch": 7.354903268845897, "loss": 0.3493, "step": 22050 }, { "epoch": 7.354903268845897, "grad_norm": 3.2416086196899414, "step": 22050 }, { "epoch": 7.354903268845897, "learning_rate": 5.0866890101473826e-06, "step": 22050 }, { "epoch": 7.354903268845897, "loss": 0.2246338427066803, "step": 22050 }, { "ce_loss": 0.016258342191576958, "epoch": 7.354903268845897, "step": 22050 }, { "distill_loss": 0.14530493319034576, "epoch": 7.354903268845897, "step": 22050 }, { "epoch": 7.354903268845897, "ref_ce_loss": 0.06291282176971436, "step": 22050 }, { "epoch": 7.354903268845897, "loss": 0.2104296237230301, "step": 22050 }, { "ce_loss": 0.026468923315405846, "epoch": 7.354903268845897, "step": 22050 }, { "distill_loss": 0.12647053599357605, "epoch": 7.354903268845897, "step": 22050 }, { "epoch": 7.354903268845897, "ref_ce_loss": 0.04710124433040619, "step": 22050 }, { "epoch": 7.354903268845897, "loss": 0.4000055491924286, "step": 22050 }, { "ce_loss": 0.05000147968530655, "epoch": 7.354903268845897, "step": 22050 }, { "distill_loss": 0.2810991108417511, "epoch": 7.354903268845897, "step": 22050 }, { "epoch": 7.354903268845897, "ref_ce_loss": 0.042414627969264984, "step": 22050 }, { "epoch": 7.354903268845897, "loss": 0.32178622484207153, "step": 22050 }, { "ce_loss": 0.01044121477752924, "epoch": 7.354903268845897, "step": 22050 }, { "distill_loss": 0.21580266952514648, "epoch": 7.354903268845897, "step": 22050 }, { "epoch": 7.354903268845897, "ref_ce_loss": 0.0623142309486866, "step": 22050 }, { "epoch": 7.358238825883923, "loss": 0.3604, "step": 22060 }, { "epoch": 7.358238825883923, "grad_norm": 5.3949360847473145, "step": 22060 }, { "epoch": 7.358238825883923, "learning_rate": 5.034517717087838e-06, "step": 22060 }, { "epoch": 7.358238825883923, "loss": 0.25852417945861816, "step": 22060 }, { "ce_loss": 0.04160953685641289, "epoch": 7.358238825883923, "step": 22060 }, { "distill_loss": 0.14878787100315094, "epoch": 7.358238825883923, "step": 22060 }, { "epoch": 7.358238825883923, "ref_ce_loss": 0.04364071413874626, "step": 22060 }, { "epoch": 7.358238825883923, "loss": 0.20833426713943481, "step": 22060 }, { "ce_loss": 0.01812390238046646, "epoch": 7.358238825883923, "step": 22060 }, { "distill_loss": 0.13836698234081268, "epoch": 7.358238825883923, "step": 22060 }, { "epoch": 7.358238825883923, "ref_ce_loss": 0.051717013120651245, "step": 22060 }, { "epoch": 7.358238825883923, "loss": 0.3156091570854187, "step": 22060 }, { "ce_loss": 0.07016031444072723, "epoch": 7.358238825883923, "step": 22060 }, { "distill_loss": 0.18352413177490234, "epoch": 7.358238825883923, "step": 22060 }, { "epoch": 7.358238825883923, "ref_ce_loss": 0.041767437011003494, "step": 22060 }, { "epoch": 7.358238825883923, "loss": 0.24763324856758118, "step": 22060 }, { "ce_loss": 0.019852010533213615, "epoch": 7.358238825883923, "step": 22060 }, { "distill_loss": 0.1532268226146698, "epoch": 7.358238825883923, "step": 22060 }, { "epoch": 7.358238825883923, "ref_ce_loss": 0.044450026005506516, "step": 22060 }, { "epoch": 7.3615743829219475, "loss": 0.3264, "step": 22070 }, { "epoch": 7.3615743829219475, "grad_norm": 3.7023305892944336, "step": 22070 }, { "epoch": 7.3615743829219475, "learning_rate": 4.98261078388375e-06, "step": 22070 }, { "epoch": 7.3615743829219475, "loss": 0.39206361770629883, "step": 22070 }, { "ce_loss": 0.0862056091427803, "epoch": 7.3615743829219475, "step": 22070 }, { "distill_loss": 0.20357592403888702, "epoch": 7.3615743829219475, "step": 22070 }, { "epoch": 7.3615743829219475, "ref_ce_loss": 0.04326269030570984, "step": 22070 }, { "epoch": 7.3615743829219475, "loss": 0.25698763132095337, "step": 22070 }, { "ce_loss": 0.0048144906759262085, "epoch": 7.3615743829219475, "step": 22070 }, { "distill_loss": 0.10615068674087524, "epoch": 7.3615743829219475, "step": 22070 }, { "epoch": 7.3615743829219475, "ref_ce_loss": 0.04334083944559097, "step": 22070 }, { "epoch": 7.3615743829219475, "loss": 0.5344517230987549, "step": 22070 }, { "ce_loss": 0.10412358492612839, "epoch": 7.3615743829219475, "step": 22070 }, { "distill_loss": 0.1765112578868866, "epoch": 7.3615743829219475, "step": 22070 }, { "epoch": 7.3615743829219475, "ref_ce_loss": 0.06247731298208237, "step": 22070 }, { "epoch": 7.3615743829219475, "loss": 0.3683743476867676, "step": 22070 }, { "ce_loss": 0.030895305797457695, "epoch": 7.3615743829219475, "step": 22070 }, { "distill_loss": 0.16962184011936188, "epoch": 7.3615743829219475, "step": 22070 }, { "epoch": 7.3615743829219475, "ref_ce_loss": 0.04016808047890663, "step": 22070 }, { "epoch": 7.364909939959974, "loss": 0.3374, "step": 22080 }, { "epoch": 7.364909939959974, "grad_norm": 2.693723678588867, "step": 22080 }, { "epoch": 7.364909939959974, "learning_rate": 4.9309683051929e-06, "step": 22080 }, { "epoch": 7.364909939959974, "loss": 0.38984400033950806, "step": 22080 }, { "ce_loss": 0.010099812410771847, "epoch": 7.364909939959974, "step": 22080 }, { "distill_loss": 0.17128121852874756, "epoch": 7.364909939959974, "step": 22080 }, { "epoch": 7.364909939959974, "ref_ce_loss": 0.03249981254339218, "step": 22080 }, { "epoch": 7.364909939959974, "loss": 0.29879438877105713, "step": 22080 }, { "ce_loss": 0.07127712666988373, "epoch": 7.364909939959974, "step": 22080 }, { "distill_loss": 0.15073591470718384, "epoch": 7.364909939959974, "step": 22080 }, { "epoch": 7.364909939959974, "ref_ce_loss": 0.06147244572639465, "step": 22080 }, { "epoch": 7.364909939959974, "loss": 0.41210445761680603, "step": 22080 }, { "ce_loss": 0.02065237984061241, "epoch": 7.364909939959974, "step": 22080 }, { "distill_loss": 0.2004043161869049, "epoch": 7.364909939959974, "step": 22080 }, { "epoch": 7.364909939959974, "ref_ce_loss": 0.03282521665096283, "step": 22080 }, { "epoch": 7.364909939959974, "loss": 1.1021695137023926, "step": 22080 }, { "ce_loss": 0.03629375249147415, "epoch": 7.364909939959974, "step": 22080 }, { "distill_loss": 0.23419694602489471, "epoch": 7.364909939959974, "step": 22080 }, { "epoch": 7.364909939959974, "ref_ce_loss": 0.03863668441772461, "step": 22080 }, { "epoch": 7.368245496997998, "loss": 0.3415, "step": 22090 }, { "epoch": 7.368245496997998, "grad_norm": 3.0293538570404053, "step": 22090 }, { "epoch": 7.368245496997998, "learning_rate": 4.879590375190789e-06, "step": 22090 }, { "epoch": 7.368245496997998, "loss": 0.4525725543498993, "step": 22090 }, { "ce_loss": 0.06142522767186165, "epoch": 7.368245496997998, "step": 22090 }, { "distill_loss": 0.23353439569473267, "epoch": 7.368245496997998, "step": 22090 }, { "epoch": 7.368245496997998, "ref_ce_loss": 0.09087929874658585, "step": 22090 }, { "epoch": 7.368245496997998, "loss": 0.2484576404094696, "step": 22090 }, { "ce_loss": 0.05308183655142784, "epoch": 7.368245496997998, "step": 22090 }, { "distill_loss": 0.1403602808713913, "epoch": 7.368245496997998, "step": 22090 }, { "epoch": 7.368245496997998, "ref_ce_loss": 0.03931579738855362, "step": 22090 }, { "epoch": 7.368245496997998, "loss": 0.2212214171886444, "step": 22090 }, { "ce_loss": 0.006685642991214991, "epoch": 7.368245496997998, "step": 22090 }, { "distill_loss": 0.11087189614772797, "epoch": 7.368245496997998, "step": 22090 }, { "epoch": 7.368245496997998, "ref_ce_loss": 0.05651544779539108, "step": 22090 }, { "epoch": 7.368245496997998, "loss": 0.2329418957233429, "step": 22090 }, { "ce_loss": 0.021505938842892647, "epoch": 7.368245496997998, "step": 22090 }, { "distill_loss": 0.15570920705795288, "epoch": 7.368245496997998, "step": 22090 }, { "epoch": 7.368245496997998, "ref_ce_loss": 0.03630523011088371, "step": 22090 }, { "epoch": 7.3715810540360245, "loss": 0.3003, "step": 22100 }, { "epoch": 7.3715810540360245, "grad_norm": 2.885834217071533, "step": 22100 }, { "epoch": 7.3715810540360245, "learning_rate": 4.82847708757052e-06, "step": 22100 }, { "epoch": 7.3715810540360245, "loss": 0.4009764492511749, "step": 22100 }, { "ce_loss": 0.03690299391746521, "epoch": 7.3715810540360245, "step": 22100 }, { "distill_loss": 0.17693805694580078, "epoch": 7.3715810540360245, "step": 22100 }, { "epoch": 7.3715810540360245, "ref_ce_loss": 0.04862113296985626, "step": 22100 }, { "epoch": 7.3715810540360245, "loss": 0.36756935715675354, "step": 22100 }, { "ce_loss": 0.017550576478242874, "epoch": 7.3715810540360245, "step": 22100 }, { "distill_loss": 0.26459965109825134, "epoch": 7.3715810540360245, "step": 22100 }, { "epoch": 7.3715810540360245, "ref_ce_loss": 0.056327540427446365, "step": 22100 }, { "epoch": 7.3715810540360245, "loss": 0.1626485288143158, "step": 22100 }, { "ce_loss": 0.005254819057881832, "epoch": 7.3715810540360245, "step": 22100 }, { "distill_loss": 0.10461093485355377, "epoch": 7.3715810540360245, "step": 22100 }, { "epoch": 7.3715810540360245, "ref_ce_loss": 0.03123287670314312, "step": 22100 }, { "epoch": 7.3715810540360245, "loss": 0.2599450647830963, "step": 22100 }, { "ce_loss": 0.029316769912838936, "epoch": 7.3715810540360245, "step": 22100 }, { "distill_loss": 0.1351774036884308, "epoch": 7.3715810540360245, "step": 22100 }, { "epoch": 7.3715810540360245, "ref_ce_loss": 0.04561307653784752, "step": 22100 }, { "epoch": 7.374916611074049, "loss": 0.3431, "step": 22110 }, { "epoch": 7.374916611074049, "grad_norm": 4.652987957000732, "step": 22110 }, { "epoch": 7.374916611074049, "learning_rate": 4.777628535542549e-06, "step": 22110 }, { "epoch": 7.374916611074049, "loss": 0.38078269362449646, "step": 22110 }, { "ce_loss": 0.039288006722927094, "epoch": 7.374916611074049, "step": 22110 }, { "distill_loss": 0.24713261425495148, "epoch": 7.374916611074049, "step": 22110 }, { "epoch": 7.374916611074049, "ref_ce_loss": 0.06356287002563477, "step": 22110 }, { "epoch": 7.374916611074049, "loss": 0.35073116421699524, "step": 22110 }, { "ce_loss": 0.04300399497151375, "epoch": 7.374916611074049, "step": 22110 }, { "distill_loss": 0.22265471518039703, "epoch": 7.374916611074049, "step": 22110 }, { "epoch": 7.374916611074049, "ref_ce_loss": 0.08492851257324219, "step": 22110 }, { "epoch": 7.374916611074049, "loss": 0.47575217485427856, "step": 22110 }, { "ce_loss": 0.0735011100769043, "epoch": 7.374916611074049, "step": 22110 }, { "distill_loss": 0.1917407512664795, "epoch": 7.374916611074049, "step": 22110 }, { "epoch": 7.374916611074049, "ref_ce_loss": 0.07961657643318176, "step": 22110 }, { "epoch": 7.374916611074049, "loss": 0.2863282561302185, "step": 22110 }, { "ce_loss": 0.04646497592329979, "epoch": 7.374916611074049, "step": 22110 }, { "distill_loss": 0.1628018170595169, "epoch": 7.374916611074049, "step": 22110 }, { "epoch": 7.374916611074049, "ref_ce_loss": 0.052869487553834915, "step": 22110 }, { "epoch": 7.378252168112075, "loss": 0.3514, "step": 22120 }, { "epoch": 7.378252168112075, "grad_norm": 3.9610838890075684, "step": 22120 }, { "epoch": 7.378252168112075, "learning_rate": 4.727044811834585e-06, "step": 22120 }, { "epoch": 7.378252168112075, "loss": 0.2892042398452759, "step": 22120 }, { "ce_loss": 0.03907979279756546, "epoch": 7.378252168112075, "step": 22120 }, { "distill_loss": 0.1598028540611267, "epoch": 7.378252168112075, "step": 22120 }, { "epoch": 7.378252168112075, "ref_ce_loss": 0.07168059796094894, "step": 22120 }, { "epoch": 7.378252168112075, "loss": 0.20294763147830963, "step": 22120 }, { "ce_loss": 0.009141350165009499, "epoch": 7.378252168112075, "step": 22120 }, { "distill_loss": 0.13888469338417053, "epoch": 7.378252168112075, "step": 22120 }, { "epoch": 7.378252168112075, "ref_ce_loss": 0.03615092858672142, "step": 22120 }, { "epoch": 7.378252168112075, "loss": 0.43699246644973755, "step": 22120 }, { "ce_loss": 0.07229337841272354, "epoch": 7.378252168112075, "step": 22120 }, { "distill_loss": 0.26068443059921265, "epoch": 7.378252168112075, "step": 22120 }, { "epoch": 7.378252168112075, "ref_ce_loss": 0.05587449297308922, "step": 22120 }, { "epoch": 7.378252168112075, "loss": 0.2592012584209442, "step": 22120 }, { "ce_loss": 0.058118805289268494, "epoch": 7.378252168112075, "step": 22120 }, { "distill_loss": 0.1439739167690277, "epoch": 7.378252168112075, "step": 22120 }, { "epoch": 7.378252168112075, "ref_ce_loss": 0.037676967680454254, "step": 22120 }, { "epoch": 7.3815877251501, "loss": 0.3062, "step": 22130 }, { "epoch": 7.3815877251501, "grad_norm": 3.771886110305786, "step": 22130 }, { "epoch": 7.3815877251501, "learning_rate": 4.676726008691356e-06, "step": 22130 }, { "epoch": 7.3815877251501, "loss": 0.32454532384872437, "step": 22130 }, { "ce_loss": 0.045760221779346466, "epoch": 7.3815877251501, "step": 22130 }, { "distill_loss": 0.16390490531921387, "epoch": 7.3815877251501, "step": 22130 }, { "epoch": 7.3815877251501, "ref_ce_loss": 0.07490749657154083, "step": 22130 }, { "epoch": 7.3815877251501, "loss": 0.275404691696167, "step": 22130 }, { "ce_loss": 0.033711016178131104, "epoch": 7.3815877251501, "step": 22130 }, { "distill_loss": 0.12929511070251465, "epoch": 7.3815877251501, "step": 22130 }, { "epoch": 7.3815877251501, "ref_ce_loss": 0.04097050428390503, "step": 22130 }, { "epoch": 7.3815877251501, "loss": 0.5244038105010986, "step": 22130 }, { "ce_loss": 0.010484430938959122, "epoch": 7.3815877251501, "step": 22130 }, { "distill_loss": 0.2396150678396225, "epoch": 7.3815877251501, "step": 22130 }, { "epoch": 7.3815877251501, "ref_ce_loss": 0.04648149758577347, "step": 22130 }, { "epoch": 7.3815877251501, "loss": 0.3090200424194336, "step": 22130 }, { "ce_loss": 0.03500140830874443, "epoch": 7.3815877251501, "step": 22130 }, { "distill_loss": 0.18160416185855865, "epoch": 7.3815877251501, "step": 22130 }, { "epoch": 7.3815877251501, "ref_ce_loss": 0.0565800704061985, "step": 22130 }, { "epoch": 7.384923282188126, "loss": 0.31, "step": 22140 }, { "epoch": 7.384923282188126, "grad_norm": 2.869490623474121, "step": 22140 }, { "epoch": 7.384923282188126, "learning_rate": 4.626672217874544e-06, "step": 22140 }, { "epoch": 7.384923282188126, "loss": 0.21312899887561798, "step": 22140 }, { "ce_loss": 0.0147785022854805, "epoch": 7.384923282188126, "step": 22140 }, { "distill_loss": 0.12025727331638336, "epoch": 7.384923282188126, "step": 22140 }, { "epoch": 7.384923282188126, "ref_ce_loss": 0.07750562578439713, "step": 22140 }, { "epoch": 7.384923282188126, "loss": 0.15995611250400543, "step": 22140 }, { "ce_loss": 0.010545612312853336, "epoch": 7.384923282188126, "step": 22140 }, { "distill_loss": 0.10653532296419144, "epoch": 7.384923282188126, "step": 22140 }, { "epoch": 7.384923282188126, "ref_ce_loss": 0.04272819682955742, "step": 22140 }, { "epoch": 7.384923282188126, "loss": 0.2929874360561371, "step": 22140 }, { "ce_loss": 0.004137550480663776, "epoch": 7.384923282188126, "step": 22140 }, { "distill_loss": 0.23183076083660126, "epoch": 7.384923282188126, "step": 22140 }, { "epoch": 7.384923282188126, "ref_ce_loss": 0.04496045410633087, "step": 22140 }, { "epoch": 7.384923282188126, "loss": 0.3128769099712372, "step": 22140 }, { "ce_loss": 0.022112419828772545, "epoch": 7.384923282188126, "step": 22140 }, { "distill_loss": 0.22397132217884064, "epoch": 7.384923282188126, "step": 22140 }, { "epoch": 7.384923282188126, "ref_ce_loss": 0.054382920265197754, "step": 22140 }, { "epoch": 7.38825883922615, "loss": 0.3406, "step": 22150 }, { "epoch": 7.38825883922615, "grad_norm": 3.3012917041778564, "step": 22150 }, { "epoch": 7.38825883922615, "learning_rate": 4.576883530662517e-06, "step": 22150 }, { "epoch": 7.38825883922615, "loss": 0.4942885935306549, "step": 22150 }, { "ce_loss": 0.05201977118849754, "epoch": 7.38825883922615, "step": 22150 }, { "distill_loss": 0.29401320219039917, "epoch": 7.38825883922615, "step": 22150 }, { "epoch": 7.38825883922615, "ref_ce_loss": 0.03955161198973656, "step": 22150 }, { "epoch": 7.38825883922615, "loss": 0.25629130005836487, "step": 22150 }, { "ce_loss": 0.03334483876824379, "epoch": 7.38825883922615, "step": 22150 }, { "distill_loss": 0.1529396027326584, "epoch": 7.38825883922615, "step": 22150 }, { "epoch": 7.38825883922615, "ref_ce_loss": 0.04761706292629242, "step": 22150 }, { "epoch": 7.38825883922615, "loss": 0.3196744918823242, "step": 22150 }, { "ce_loss": 0.032249510288238525, "epoch": 7.38825883922615, "step": 22150 }, { "distill_loss": 0.24185246229171753, "epoch": 7.38825883922615, "step": 22150 }, { "epoch": 7.38825883922615, "ref_ce_loss": 0.03860660269856453, "step": 22150 }, { "epoch": 7.38825883922615, "loss": 0.22365950047969818, "step": 22150 }, { "ce_loss": 0.0067135305143892765, "epoch": 7.38825883922615, "step": 22150 }, { "distill_loss": 0.1155950278043747, "epoch": 7.38825883922615, "step": 22150 }, { "epoch": 7.38825883922615, "ref_ce_loss": 0.0510890819132328, "step": 22150 }, { "epoch": 7.391594396264177, "loss": 0.3196, "step": 22160 }, { "epoch": 7.391594396264177, "grad_norm": 3.433950185775757, "step": 22160 }, { "epoch": 7.391594396264177, "learning_rate": 4.527360037850197e-06, "step": 22160 }, { "epoch": 7.391594396264177, "loss": 0.3733089566230774, "step": 22160 }, { "ce_loss": 0.045424215495586395, "epoch": 7.391594396264177, "step": 22160 }, { "distill_loss": 0.23835572600364685, "epoch": 7.391594396264177, "step": 22160 }, { "epoch": 7.391594396264177, "ref_ce_loss": 0.04281904175877571, "step": 22160 }, { "epoch": 7.391594396264177, "loss": 0.21638333797454834, "step": 22160 }, { "ce_loss": 0.03292544558644295, "epoch": 7.391594396264177, "step": 22160 }, { "distill_loss": 0.10522749274969101, "epoch": 7.391594396264177, "step": 22160 }, { "epoch": 7.391594396264177, "ref_ce_loss": 0.05115901306271553, "step": 22160 }, { "epoch": 7.391594396264177, "loss": 0.19880634546279907, "step": 22160 }, { "ce_loss": 0.025685761123895645, "epoch": 7.391594396264177, "step": 22160 }, { "distill_loss": 0.09403679519891739, "epoch": 7.391594396264177, "step": 22160 }, { "epoch": 7.391594396264177, "ref_ce_loss": 0.030222853645682335, "step": 22160 }, { "epoch": 7.391594396264177, "loss": 0.37925201654434204, "step": 22160 }, { "ce_loss": 0.024915773421525955, "epoch": 7.391594396264177, "step": 22160 }, { "distill_loss": 0.2479330599308014, "epoch": 7.391594396264177, "step": 22160 }, { "epoch": 7.391594396264177, "ref_ce_loss": 0.0947517454624176, "step": 22160 }, { "epoch": 7.394929953302201, "loss": 0.3212, "step": 22170 }, { "epoch": 7.394929953302201, "grad_norm": 2.706461191177368, "step": 22170 }, { "epoch": 7.394929953302201, "learning_rate": 4.4781018297488755e-06, "step": 22170 }, { "epoch": 7.394929953302201, "loss": 0.3403114676475525, "step": 22170 }, { "ce_loss": 0.030661532655358315, "epoch": 7.394929953302201, "step": 22170 }, { "distill_loss": 0.21051430702209473, "epoch": 7.394929953302201, "step": 22170 }, { "epoch": 7.394929953302201, "ref_ce_loss": 0.042628444731235504, "step": 22170 }, { "epoch": 7.394929953302201, "loss": 0.27130600810050964, "step": 22170 }, { "ce_loss": 0.042654745280742645, "epoch": 7.394929953302201, "step": 22170 }, { "distill_loss": 0.14102618396282196, "epoch": 7.394929953302201, "step": 22170 }, { "epoch": 7.394929953302201, "ref_ce_loss": 0.060014378279447556, "step": 22170 }, { "epoch": 7.394929953302201, "loss": 0.3314165472984314, "step": 22170 }, { "ce_loss": 0.026579681783914566, "epoch": 7.394929953302201, "step": 22170 }, { "distill_loss": 0.19740203022956848, "epoch": 7.394929953302201, "step": 22170 }, { "epoch": 7.394929953302201, "ref_ce_loss": 0.04624663665890694, "step": 22170 }, { "epoch": 7.394929953302201, "loss": 0.1601991355419159, "step": 22170 }, { "ce_loss": 0.010934227146208286, "epoch": 7.394929953302201, "step": 22170 }, { "distill_loss": 0.10959579795598984, "epoch": 7.394929953302201, "step": 22170 }, { "epoch": 7.394929953302201, "ref_ce_loss": 0.026321228593587875, "step": 22170 }, { "epoch": 7.398265510340227, "loss": 0.3253, "step": 22180 }, { "epoch": 7.398265510340227, "grad_norm": 3.5024940967559814, "step": 22180 }, { "epoch": 7.398265510340227, "learning_rate": 4.429108996186115e-06, "step": 22180 }, { "epoch": 7.398265510340227, "loss": 0.3466987609863281, "step": 22180 }, { "ce_loss": 0.018985463306307793, "epoch": 7.398265510340227, "step": 22180 }, { "distill_loss": 0.19944104552268982, "epoch": 7.398265510340227, "step": 22180 }, { "epoch": 7.398265510340227, "ref_ce_loss": 0.06336013227701187, "step": 22180 }, { "epoch": 7.398265510340227, "loss": 0.38137781620025635, "step": 22180 }, { "ce_loss": 0.07608304917812347, "epoch": 7.398265510340227, "step": 22180 }, { "distill_loss": 0.15607169270515442, "epoch": 7.398265510340227, "step": 22180 }, { "epoch": 7.398265510340227, "ref_ce_loss": 0.06901339441537857, "step": 22180 }, { "epoch": 7.398265510340227, "loss": 0.34570735692977905, "step": 22180 }, { "ce_loss": 0.04311949387192726, "epoch": 7.398265510340227, "step": 22180 }, { "distill_loss": 0.202956423163414, "epoch": 7.398265510340227, "step": 22180 }, { "epoch": 7.398265510340227, "ref_ce_loss": 0.04889204725623131, "step": 22180 }, { "epoch": 7.398265510340227, "loss": 0.30262553691864014, "step": 22180 }, { "ce_loss": 0.027611130848526955, "epoch": 7.398265510340227, "step": 22180 }, { "distill_loss": 0.1811426728963852, "epoch": 7.398265510340227, "step": 22180 }, { "epoch": 7.398265510340227, "ref_ce_loss": 0.06274111568927765, "step": 22180 }, { "epoch": 7.401601067378252, "loss": 0.3435, "step": 22190 }, { "epoch": 7.401601067378252, "grad_norm": 2.390714645385742, "step": 22190 }, { "epoch": 7.401601067378252, "learning_rate": 4.380381626505514e-06, "step": 22190 }, { "epoch": 7.401601067378252, "loss": 0.1738487333059311, "step": 22190 }, { "ce_loss": 0.012540126219391823, "epoch": 7.401601067378252, "step": 22190 }, { "distill_loss": 0.10962365567684174, "epoch": 7.401601067378252, "step": 22190 }, { "epoch": 7.401601067378252, "ref_ce_loss": 0.05156319588422775, "step": 22190 }, { "epoch": 7.401601067378252, "loss": 0.4952594041824341, "step": 22190 }, { "ce_loss": 0.04229583218693733, "epoch": 7.401601067378252, "step": 22190 }, { "distill_loss": 0.2744204103946686, "epoch": 7.401601067378252, "step": 22190 }, { "epoch": 7.401601067378252, "ref_ce_loss": 0.07085690647363663, "step": 22190 }, { "epoch": 7.401601067378252, "loss": 0.2841060161590576, "step": 22190 }, { "ce_loss": 0.007605770602822304, "epoch": 7.401601067378252, "step": 22190 }, { "distill_loss": 0.20285113155841827, "epoch": 7.401601067378252, "step": 22190 }, { "epoch": 7.401601067378252, "ref_ce_loss": 0.04578419402241707, "step": 22190 }, { "epoch": 7.401601067378252, "loss": 0.3293168246746063, "step": 22190 }, { "ce_loss": 0.05847109109163284, "epoch": 7.401601067378252, "step": 22190 }, { "distill_loss": 0.204945370554924, "epoch": 7.401601067378252, "step": 22190 }, { "epoch": 7.401601067378252, "ref_ce_loss": 0.03360971435904503, "step": 22190 }, { "epoch": 7.404936624416278, "loss": 0.3205, "step": 22200 }, { "epoch": 7.404936624416278, "grad_norm": 2.24493670463562, "step": 22200 }, { "epoch": 7.404936624416278, "learning_rate": 4.3319198095665915e-06, "step": 22200 }, { "epoch": 7.404936624416278, "loss": 0.18313585221767426, "step": 22200 }, { "ce_loss": 0.03454110771417618, "epoch": 7.404936624416278, "step": 22200 }, { "distill_loss": 0.10332349687814713, "epoch": 7.404936624416278, "step": 22200 }, { "epoch": 7.404936624416278, "ref_ce_loss": 0.0450221411883831, "step": 22200 }, { "epoch": 7.404936624416278, "loss": 0.34247058629989624, "step": 22200 }, { "ce_loss": 0.07478012144565582, "epoch": 7.404936624416278, "step": 22200 }, { "distill_loss": 0.2062947154045105, "epoch": 7.404936624416278, "step": 22200 }, { "epoch": 7.404936624416278, "ref_ce_loss": 0.06096559762954712, "step": 22200 }, { "epoch": 7.404936624416278, "loss": 0.26020678877830505, "step": 22200 }, { "ce_loss": 0.05754929408431053, "epoch": 7.404936624416278, "step": 22200 }, { "distill_loss": 0.15126144886016846, "epoch": 7.404936624416278, "step": 22200 }, { "epoch": 7.404936624416278, "ref_ce_loss": 0.02520419843494892, "step": 22200 }, { "epoch": 7.404936624416278, "loss": 0.48407548666000366, "step": 22200 }, { "ce_loss": 0.06056468188762665, "epoch": 7.404936624416278, "step": 22200 }, { "distill_loss": 0.2092147171497345, "epoch": 7.404936624416278, "step": 22200 }, { "epoch": 7.404936624416278, "ref_ce_loss": 0.10852415859699249, "step": 22200 }, { "epoch": 7.408272181454302, "loss": 0.3367, "step": 22210 }, { "epoch": 7.408272181454302, "grad_norm": 3.42170786857605, "step": 22210 }, { "epoch": 7.408272181454302, "learning_rate": 4.283723633744557e-06, "step": 22210 }, { "epoch": 7.408272181454302, "loss": 0.1436109095811844, "step": 22210 }, { "ce_loss": 0.0023982953280210495, "epoch": 7.408272181454302, "step": 22210 }, { "distill_loss": 0.09932367503643036, "epoch": 7.408272181454302, "step": 22210 }, { "epoch": 7.408272181454302, "ref_ce_loss": 0.029441095888614655, "step": 22210 }, { "epoch": 7.408272181454302, "loss": 0.28712987899780273, "step": 22210 }, { "ce_loss": 0.01837264373898506, "epoch": 7.408272181454302, "step": 22210 }, { "distill_loss": 0.14383924007415771, "epoch": 7.408272181454302, "step": 22210 }, { "epoch": 7.408272181454302, "ref_ce_loss": 0.04897449538111687, "step": 22210 }, { "epoch": 7.408272181454302, "loss": 0.31028202176094055, "step": 22210 }, { "ce_loss": 0.024862175807356834, "epoch": 7.408272181454302, "step": 22210 }, { "distill_loss": 0.1748422533273697, "epoch": 7.408272181454302, "step": 22210 }, { "epoch": 7.408272181454302, "ref_ce_loss": 0.055731479078531265, "step": 22210 }, { "epoch": 7.408272181454302, "loss": 0.2121187001466751, "step": 22210 }, { "ce_loss": 0.016437353566288948, "epoch": 7.408272181454302, "step": 22210 }, { "distill_loss": 0.13269154727458954, "epoch": 7.408272181454302, "step": 22210 }, { "epoch": 7.408272181454302, "ref_ce_loss": 0.04025792330503464, "step": 22210 }, { "epoch": 7.411607738492329, "loss": 0.2707, "step": 22220 }, { "epoch": 7.411607738492329, "grad_norm": 2.6476309299468994, "step": 22220 }, { "epoch": 7.411607738492329, "learning_rate": 4.235793186930237e-06, "step": 22220 }, { "epoch": 7.411607738492329, "loss": 0.2639855146408081, "step": 22220 }, { "ce_loss": 0.042686786502599716, "epoch": 7.411607738492329, "step": 22220 }, { "distill_loss": 0.17029103636741638, "epoch": 7.411607738492329, "step": 22220 }, { "epoch": 7.411607738492329, "ref_ce_loss": 0.050944022834300995, "step": 22220 }, { "epoch": 7.411607738492329, "loss": 0.4206048548221588, "step": 22220 }, { "ce_loss": 0.03519580513238907, "epoch": 7.411607738492329, "step": 22220 }, { "distill_loss": 0.3233287036418915, "epoch": 7.411607738492329, "step": 22220 }, { "epoch": 7.411607738492329, "ref_ce_loss": 0.061676353216171265, "step": 22220 }, { "epoch": 7.411607738492329, "loss": 0.5416637659072876, "step": 22220 }, { "ce_loss": 0.03354215249419212, "epoch": 7.411607738492329, "step": 22220 }, { "distill_loss": 0.13695743680000305, "epoch": 7.411607738492329, "step": 22220 }, { "epoch": 7.411607738492329, "ref_ce_loss": 0.03787124529480934, "step": 22220 }, { "epoch": 7.411607738492329, "loss": 0.3412051200866699, "step": 22220 }, { "ce_loss": 0.01873054727911949, "epoch": 7.411607738492329, "step": 22220 }, { "distill_loss": 0.2385825216770172, "epoch": 7.411607738492329, "step": 22220 }, { "epoch": 7.411607738492329, "ref_ce_loss": 0.04805649444460869, "step": 22220 }, { "epoch": 7.414943295530353, "loss": 0.3411, "step": 22230 }, { "epoch": 7.414943295530353, "grad_norm": 2.7999589443206787, "step": 22230 }, { "epoch": 7.414943295530353, "learning_rate": 4.188128556529846e-06, "step": 22230 }, { "epoch": 7.414943295530353, "loss": 0.26248857378959656, "step": 22230 }, { "ce_loss": 0.017941398546099663, "epoch": 7.414943295530353, "step": 22230 }, { "distill_loss": 0.17652080953121185, "epoch": 7.414943295530353, "step": 22230 }, { "epoch": 7.414943295530353, "ref_ce_loss": 0.028682641685009003, "step": 22230 }, { "epoch": 7.414943295530353, "loss": 0.18539687991142273, "step": 22230 }, { "ce_loss": 0.026324966922402382, "epoch": 7.414943295530353, "step": 22230 }, { "distill_loss": 0.11990171670913696, "epoch": 7.414943295530353, "step": 22230 }, { "epoch": 7.414943295530353, "ref_ce_loss": 0.038930267095565796, "step": 22230 }, { "epoch": 7.414943295530353, "loss": 0.6258493065834045, "step": 22230 }, { "ce_loss": 0.06917796283960342, "epoch": 7.414943295530353, "step": 22230 }, { "distill_loss": 0.18112300336360931, "epoch": 7.414943295530353, "step": 22230 }, { "epoch": 7.414943295530353, "ref_ce_loss": 0.07729977369308472, "step": 22230 }, { "epoch": 7.414943295530353, "loss": 0.3908163905143738, "step": 22230 }, { "ce_loss": 0.08195307105779648, "epoch": 7.414943295530353, "step": 22230 }, { "distill_loss": 0.18970994651317596, "epoch": 7.414943295530353, "step": 22230 }, { "epoch": 7.414943295530353, "ref_ce_loss": 0.052481528371572495, "step": 22230 }, { "epoch": 7.418278852568379, "loss": 0.3054, "step": 22240 }, { "epoch": 7.418278852568379, "grad_norm": 2.6394460201263428, "step": 22240 }, { "epoch": 7.418278852568379, "learning_rate": 4.1407298294649064e-06, "step": 22240 }, { "epoch": 7.418278852568379, "loss": 0.3083247244358063, "step": 22240 }, { "ce_loss": 0.023212047293782234, "epoch": 7.418278852568379, "step": 22240 }, { "distill_loss": 0.19049547612667084, "epoch": 7.418278852568379, "step": 22240 }, { "epoch": 7.418278852568379, "ref_ce_loss": 0.07450911402702332, "step": 22240 }, { "epoch": 7.418278852568379, "loss": 0.27266043424606323, "step": 22240 }, { "ce_loss": 0.048310212790966034, "epoch": 7.418278852568379, "step": 22240 }, { "distill_loss": 0.15052545070648193, "epoch": 7.418278852568379, "step": 22240 }, { "epoch": 7.418278852568379, "ref_ce_loss": 0.06026710197329521, "step": 22240 }, { "epoch": 7.418278852568379, "loss": 0.20550397038459778, "step": 22240 }, { "ce_loss": 0.002684173174202442, "epoch": 7.418278852568379, "step": 22240 }, { "distill_loss": 0.157025545835495, "epoch": 7.418278852568379, "step": 22240 }, { "epoch": 7.418278852568379, "ref_ce_loss": 0.045708607882261276, "step": 22240 }, { "epoch": 7.418278852568379, "loss": 0.3732839822769165, "step": 22240 }, { "ce_loss": 0.048869553953409195, "epoch": 7.418278852568379, "step": 22240 }, { "distill_loss": 0.26047688722610474, "epoch": 7.418278852568379, "step": 22240 }, { "epoch": 7.418278852568379, "ref_ce_loss": 0.04291224107146263, "step": 22240 }, { "epoch": 7.421614409606404, "loss": 0.338, "step": 22250 }, { "epoch": 7.421614409606404, "grad_norm": 4.603501319885254, "step": 22250 }, { "epoch": 7.421614409606404, "learning_rate": 4.093597092171941e-06, "step": 22250 }, { "epoch": 7.421614409606404, "loss": 0.4850652813911438, "step": 22250 }, { "ce_loss": 0.014903169125318527, "epoch": 7.421614409606404, "step": 22250 }, { "distill_loss": 0.15682287514209747, "epoch": 7.421614409606404, "step": 22250 }, { "epoch": 7.421614409606404, "ref_ce_loss": 0.05811598151922226, "step": 22250 }, { "epoch": 7.421614409606404, "loss": 0.29419952630996704, "step": 22250 }, { "ce_loss": 0.01383958663791418, "epoch": 7.421614409606404, "step": 22250 }, { "distill_loss": 0.16313058137893677, "epoch": 7.421614409606404, "step": 22250 }, { "epoch": 7.421614409606404, "ref_ce_loss": 0.032282061874866486, "step": 22250 }, { "epoch": 7.421614409606404, "loss": 0.3315328359603882, "step": 22250 }, { "ce_loss": 0.0404106043279171, "epoch": 7.421614409606404, "step": 22250 }, { "distill_loss": 0.19743554294109344, "epoch": 7.421614409606404, "step": 22250 }, { "epoch": 7.421614409606404, "ref_ce_loss": 0.07366403937339783, "step": 22250 }, { "epoch": 7.421614409606404, "loss": 0.29002076387405396, "step": 22250 }, { "ce_loss": 0.024073943495750427, "epoch": 7.421614409606404, "step": 22250 }, { "distill_loss": 0.1868630051612854, "epoch": 7.421614409606404, "step": 22250 }, { "epoch": 7.421614409606404, "ref_ce_loss": 0.04270247742533684, "step": 22250 }, { "epoch": 7.42494996664443, "loss": 0.3519, "step": 22260 }, { "epoch": 7.42494996664443, "grad_norm": 2.3166913986206055, "step": 22260 }, { "epoch": 7.42494996664443, "learning_rate": 4.0467304306025125e-06, "step": 22260 }, { "epoch": 7.42494996664443, "loss": 0.3277677595615387, "step": 22260 }, { "ce_loss": 0.021930452436208725, "epoch": 7.42494996664443, "step": 22260 }, { "distill_loss": 0.16001535952091217, "epoch": 7.42494996664443, "step": 22260 }, { "epoch": 7.42494996664443, "ref_ce_loss": 0.050045643001794815, "step": 22260 }, { "epoch": 7.42494996664443, "loss": 0.40044260025024414, "step": 22260 }, { "ce_loss": 0.03517067804932594, "epoch": 7.42494996664443, "step": 22260 }, { "distill_loss": 0.16368120908737183, "epoch": 7.42494996664443, "step": 22260 }, { "epoch": 7.42494996664443, "ref_ce_loss": 0.05305195599794388, "step": 22260 }, { "epoch": 7.42494996664443, "loss": 0.38687756657600403, "step": 22260 }, { "ce_loss": 0.06107117980718613, "epoch": 7.42494996664443, "step": 22260 }, { "distill_loss": 0.1886368691921234, "epoch": 7.42494996664443, "step": 22260 }, { "epoch": 7.42494996664443, "ref_ce_loss": 0.06111163645982742, "step": 22260 }, { "epoch": 7.42494996664443, "loss": 0.3298104405403137, "step": 22260 }, { "ce_loss": 0.050029948353767395, "epoch": 7.42494996664443, "step": 22260 }, { "distill_loss": 0.13902169466018677, "epoch": 7.42494996664443, "step": 22260 }, { "epoch": 7.42494996664443, "ref_ce_loss": 0.056891169399023056, "step": 22260 }, { "epoch": 7.4282855236824545, "loss": 0.3252, "step": 22270 }, { "epoch": 7.4282855236824545, "grad_norm": 2.7258355617523193, "step": 22270 }, { "epoch": 7.4282855236824545, "learning_rate": 4.000129930222906e-06, "step": 22270 }, { "epoch": 7.4282855236824545, "loss": 0.2858698070049286, "step": 22270 }, { "ce_loss": 0.019824549555778503, "epoch": 7.4282855236824545, "step": 22270 }, { "distill_loss": 0.15271005034446716, "epoch": 7.4282855236824545, "step": 22270 }, { "epoch": 7.4282855236824545, "ref_ce_loss": 0.06389021873474121, "step": 22270 }, { "epoch": 7.4282855236824545, "loss": 0.2244950234889984, "step": 22270 }, { "ce_loss": 0.01788237690925598, "epoch": 7.4282855236824545, "step": 22270 }, { "distill_loss": 0.11896365880966187, "epoch": 7.4282855236824545, "step": 22270 }, { "epoch": 7.4282855236824545, "ref_ce_loss": 0.053938981145620346, "step": 22270 }, { "epoch": 7.4282855236824545, "loss": 0.2784402370452881, "step": 22270 }, { "ce_loss": 0.02266446314752102, "epoch": 7.4282855236824545, "step": 22270 }, { "distill_loss": 0.12554806470870972, "epoch": 7.4282855236824545, "step": 22270 }, { "epoch": 7.4282855236824545, "ref_ce_loss": 0.056586507707834244, "step": 22270 }, { "epoch": 7.4282855236824545, "loss": 0.2199580818414688, "step": 22270 }, { "ce_loss": 0.01635870523750782, "epoch": 7.4282855236824545, "step": 22270 }, { "distill_loss": 0.16090208292007446, "epoch": 7.4282855236824545, "step": 22270 }, { "epoch": 7.4282855236824545, "ref_ce_loss": 0.03102097287774086, "step": 22270 }, { "epoch": 7.431621080720481, "loss": 0.2772, "step": 22280 }, { "epoch": 7.431621080720481, "grad_norm": 3.1755130290985107, "step": 22280 }, { "epoch": 7.431621080720481, "learning_rate": 3.95379567601406e-06, "step": 22280 }, { "epoch": 7.431621080720481, "loss": 0.25050878524780273, "step": 22280 }, { "ce_loss": 0.041921354830265045, "epoch": 7.431621080720481, "step": 22280 }, { "distill_loss": 0.13335081934928894, "epoch": 7.431621080720481, "step": 22280 }, { "epoch": 7.431621080720481, "ref_ce_loss": 0.05765026807785034, "step": 22280 }, { "epoch": 7.431621080720481, "loss": 0.24195145070552826, "step": 22280 }, { "ce_loss": 0.06510787457227707, "epoch": 7.431621080720481, "step": 22280 }, { "distill_loss": 0.13477462530136108, "epoch": 7.431621080720481, "step": 22280 }, { "epoch": 7.431621080720481, "ref_ce_loss": 0.03168313577771187, "step": 22280 }, { "epoch": 7.431621080720481, "loss": 0.3614014983177185, "step": 22280 }, { "ce_loss": 0.06072500720620155, "epoch": 7.431621080720481, "step": 22280 }, { "distill_loss": 0.2287401407957077, "epoch": 7.431621080720481, "step": 22280 }, { "epoch": 7.431621080720481, "ref_ce_loss": 0.07151369005441666, "step": 22280 }, { "epoch": 7.431621080720481, "loss": 0.24856488406658173, "step": 22280 }, { "ce_loss": 0.009639018215239048, "epoch": 7.431621080720481, "step": 22280 }, { "distill_loss": 0.16313186287879944, "epoch": 7.431621080720481, "step": 22280 }, { "epoch": 7.431621080720481, "ref_ce_loss": 0.048035651445388794, "step": 22280 }, { "epoch": 7.434956637758505, "loss": 0.3219, "step": 22290 }, { "epoch": 7.434956637758505, "grad_norm": 2.436527729034424, "step": 22290 }, { "epoch": 7.434956637758505, "learning_rate": 3.9077277524714015e-06, "step": 22290 }, { "epoch": 7.434956637758505, "loss": 0.2814217805862427, "step": 22290 }, { "ce_loss": 0.03163493424654007, "epoch": 7.434956637758505, "step": 22290 }, { "distill_loss": 0.14511892199516296, "epoch": 7.434956637758505, "step": 22290 }, { "epoch": 7.434956637758505, "ref_ce_loss": 0.033653974533081055, "step": 22290 }, { "epoch": 7.434956637758505, "loss": 0.22175005078315735, "step": 22290 }, { "ce_loss": 0.03425311669707298, "epoch": 7.434956637758505, "step": 22290 }, { "distill_loss": 0.132270947098732, "epoch": 7.434956637758505, "step": 22290 }, { "epoch": 7.434956637758505, "ref_ce_loss": 0.03548649325966835, "step": 22290 }, { "epoch": 7.434956637758505, "loss": 0.2341788113117218, "step": 22290 }, { "ce_loss": 0.017922593280673027, "epoch": 7.434956637758505, "step": 22290 }, { "distill_loss": 0.18605180084705353, "epoch": 7.434956637758505, "step": 22290 }, { "epoch": 7.434956637758505, "ref_ce_loss": 0.030135784298181534, "step": 22290 }, { "epoch": 7.434956637758505, "loss": 0.24607765674591064, "step": 22290 }, { "ce_loss": 0.022272905334830284, "epoch": 7.434956637758505, "step": 22290 }, { "distill_loss": 0.12093356996774673, "epoch": 7.434956637758505, "step": 22290 }, { "epoch": 7.434956637758505, "ref_ce_loss": 0.03579564765095711, "step": 22290 }, { "epoch": 7.4382921947965315, "loss": 0.3158, "step": 22300 }, { "epoch": 7.4382921947965315, "grad_norm": 3.7742974758148193, "step": 22300 }, { "epoch": 7.4382921947965315, "learning_rate": 3.861926243604596e-06, "step": 22300 }, { "epoch": 7.4382921947965315, "loss": 0.14824345707893372, "step": 22300 }, { "ce_loss": 0.00874150637537241, "epoch": 7.4382921947965315, "step": 22300 }, { "distill_loss": 0.0849747583270073, "epoch": 7.4382921947965315, "step": 22300 }, { "epoch": 7.4382921947965315, "ref_ce_loss": 0.04229980707168579, "step": 22300 }, { "epoch": 7.4382921947965315, "loss": 0.25204765796661377, "step": 22300 }, { "ce_loss": 0.06930907815694809, "epoch": 7.4382921947965315, "step": 22300 }, { "distill_loss": 0.13660091161727905, "epoch": 7.4382921947965315, "step": 22300 }, { "epoch": 7.4382921947965315, "ref_ce_loss": 0.046087075024843216, "step": 22300 }, { "epoch": 7.4382921947965315, "loss": 0.2791946530342102, "step": 22300 }, { "ce_loss": 0.0685519129037857, "epoch": 7.4382921947965315, "step": 22300 }, { "distill_loss": 0.1319180130958557, "epoch": 7.4382921947965315, "step": 22300 }, { "epoch": 7.4382921947965315, "ref_ce_loss": 0.054504599422216415, "step": 22300 }, { "epoch": 7.4382921947965315, "loss": 0.2951914072036743, "step": 22300 }, { "ce_loss": 0.009827563539147377, "epoch": 7.4382921947965315, "step": 22300 }, { "distill_loss": 0.21003487706184387, "epoch": 7.4382921947965315, "step": 22300 }, { "epoch": 7.4382921947965315, "ref_ce_loss": 0.04733233526349068, "step": 22300 }, { "epoch": 7.441627751834556, "loss": 0.3302, "step": 22310 }, { "epoch": 7.441627751834556, "grad_norm": 4.366042137145996, "step": 22310 }, { "epoch": 7.441627751834556, "learning_rate": 3.816391232937549e-06, "step": 22310 }, { "epoch": 7.441627751834556, "loss": 0.33673936128616333, "step": 22310 }, { "ce_loss": 0.01169640477746725, "epoch": 7.441627751834556, "step": 22310 }, { "distill_loss": 0.1271243542432785, "epoch": 7.441627751834556, "step": 22310 }, { "epoch": 7.441627751834556, "ref_ce_loss": 0.067172110080719, "step": 22310 }, { "epoch": 7.441627751834556, "loss": 0.2582665681838989, "step": 22310 }, { "ce_loss": 0.043444760143756866, "epoch": 7.441627751834556, "step": 22310 }, { "distill_loss": 0.13749757409095764, "epoch": 7.441627751834556, "step": 22310 }, { "epoch": 7.441627751834556, "ref_ce_loss": 0.061841681599617004, "step": 22310 }, { "epoch": 7.441627751834556, "loss": 0.23331327736377716, "step": 22310 }, { "ce_loss": 0.015050852671265602, "epoch": 7.441627751834556, "step": 22310 }, { "distill_loss": 0.11916429549455643, "epoch": 7.441627751834556, "step": 22310 }, { "epoch": 7.441627751834556, "ref_ce_loss": 0.047242291271686554, "step": 22310 }, { "epoch": 7.441627751834556, "loss": 0.278873085975647, "step": 22310 }, { "ce_loss": 0.02831905707716942, "epoch": 7.441627751834556, "step": 22310 }, { "distill_loss": 0.17653679847717285, "epoch": 7.441627751834556, "step": 22310 }, { "epoch": 7.441627751834556, "ref_ce_loss": 0.03725513815879822, "step": 22310 }, { "epoch": 7.444963308872582, "loss": 0.3126, "step": 22320 }, { "epoch": 7.444963308872582, "grad_norm": 2.6882822513580322, "step": 22320 }, { "epoch": 7.444963308872582, "learning_rate": 3.7711228035081863e-06, "step": 22320 }, { "epoch": 7.444963308872582, "loss": 0.16673119366168976, "step": 22320 }, { "ce_loss": 0.0011848431313410401, "epoch": 7.444963308872582, "step": 22320 }, { "distill_loss": 0.13170509040355682, "epoch": 7.444963308872582, "step": 22320 }, { "epoch": 7.444963308872582, "ref_ce_loss": 0.018590757623314857, "step": 22320 }, { "epoch": 7.444963308872582, "loss": 0.27042847871780396, "step": 22320 }, { "ce_loss": 0.030981097370386124, "epoch": 7.444963308872582, "step": 22320 }, { "distill_loss": 0.12473780661821365, "epoch": 7.444963308872582, "step": 22320 }, { "epoch": 7.444963308872582, "ref_ce_loss": 0.03478186950087547, "step": 22320 }, { "epoch": 7.444963308872582, "loss": 0.16557539999485016, "step": 22320 }, { "ce_loss": 0.005286859814077616, "epoch": 7.444963308872582, "step": 22320 }, { "distill_loss": 0.10098185390233994, "epoch": 7.444963308872582, "step": 22320 }, { "epoch": 7.444963308872582, "ref_ce_loss": 0.028873972594738007, "step": 22320 }, { "epoch": 7.444963308872582, "loss": 0.2681327760219574, "step": 22320 }, { "ce_loss": 0.028545308858156204, "epoch": 7.444963308872582, "step": 22320 }, { "distill_loss": 0.16974365711212158, "epoch": 7.444963308872582, "step": 22320 }, { "epoch": 7.444963308872582, "ref_ce_loss": 0.029505878686904907, "step": 22320 }, { "epoch": 7.448298865910607, "loss": 0.2978, "step": 22330 }, { "epoch": 7.448298865910607, "grad_norm": 3.4482717514038086, "step": 22330 }, { "epoch": 7.448298865910607, "learning_rate": 3.7261210378682238e-06, "step": 22330 }, { "epoch": 7.448298865910607, "loss": 0.38903453946113586, "step": 22330 }, { "ce_loss": 0.056097060441970825, "epoch": 7.448298865910607, "step": 22330 }, { "distill_loss": 0.23296581208705902, "epoch": 7.448298865910607, "step": 22330 }, { "epoch": 7.448298865910607, "ref_ce_loss": 0.06274658441543579, "step": 22330 }, { "epoch": 7.448298865910607, "loss": 0.36393213272094727, "step": 22330 }, { "ce_loss": 0.06572960317134857, "epoch": 7.448298865910607, "step": 22330 }, { "distill_loss": 0.2237122654914856, "epoch": 7.448298865910607, "step": 22330 }, { "epoch": 7.448298865910607, "ref_ce_loss": 0.05479707568883896, "step": 22330 }, { "epoch": 7.448298865910607, "loss": 0.19030554592609406, "step": 22330 }, { "ce_loss": 0.0062269787304103374, "epoch": 7.448298865910607, "step": 22330 }, { "distill_loss": 0.1253552883863449, "epoch": 7.448298865910607, "step": 22330 }, { "epoch": 7.448298865910607, "ref_ce_loss": 0.03615328297019005, "step": 22330 }, { "epoch": 7.448298865910607, "loss": 0.5729780793190002, "step": 22330 }, { "ce_loss": 0.09128619730472565, "epoch": 7.448298865910607, "step": 22330 }, { "distill_loss": 0.1655218005180359, "epoch": 7.448298865910607, "step": 22330 }, { "epoch": 7.448298865910607, "ref_ce_loss": 0.07864798605442047, "step": 22330 }, { "epoch": 7.451634422948633, "loss": 0.3321, "step": 22340 }, { "epoch": 7.451634422948633, "grad_norm": 4.633417129516602, "step": 22340 }, { "epoch": 7.451634422948633, "learning_rate": 3.6813860180831824e-06, "step": 22340 }, { "epoch": 7.451634422948633, "loss": 0.20244018733501434, "step": 22340 }, { "ce_loss": 0.01871795393526554, "epoch": 7.451634422948633, "step": 22340 }, { "distill_loss": 0.12031986564397812, "epoch": 7.451634422948633, "step": 22340 }, { "epoch": 7.451634422948633, "ref_ce_loss": 0.036138541996479034, "step": 22340 }, { "epoch": 7.451634422948633, "loss": 0.2242555469274521, "step": 22340 }, { "ce_loss": 0.03619326651096344, "epoch": 7.451634422948633, "step": 22340 }, { "distill_loss": 0.1221376582980156, "epoch": 7.451634422948633, "step": 22340 }, { "epoch": 7.451634422948633, "ref_ce_loss": 0.04511701688170433, "step": 22340 }, { "epoch": 7.451634422948633, "loss": 0.19007287919521332, "step": 22340 }, { "ce_loss": 0.0217959713190794, "epoch": 7.451634422948633, "step": 22340 }, { "distill_loss": 0.12164486944675446, "epoch": 7.451634422948633, "step": 22340 }, { "epoch": 7.451634422948633, "ref_ce_loss": 0.04645758494734764, "step": 22340 }, { "epoch": 7.451634422948633, "loss": 0.3102092146873474, "step": 22340 }, { "ce_loss": 0.041494760662317276, "epoch": 7.451634422948633, "step": 22340 }, { "distill_loss": 0.11831970512866974, "epoch": 7.451634422948633, "step": 22340 }, { "epoch": 7.451634422948633, "ref_ce_loss": 0.05176829546689987, "step": 22340 }, { "epoch": 7.454969979986657, "loss": 0.2923, "step": 22350 }, { "epoch": 7.454969979986657, "grad_norm": 3.0117533206939697, "step": 22350 }, { "epoch": 7.454969979986657, "learning_rate": 3.6369178257320385e-06, "step": 22350 }, { "epoch": 7.454969979986657, "loss": 0.4076274633407593, "step": 22350 }, { "ce_loss": 0.047179240733385086, "epoch": 7.454969979986657, "step": 22350 }, { "distill_loss": 0.28673622012138367, "epoch": 7.454969979986657, "step": 22350 }, { "epoch": 7.454969979986657, "ref_ce_loss": 0.0505196787416935, "step": 22350 }, { "epoch": 7.454969979986657, "loss": 0.20299524068832397, "step": 22350 }, { "ce_loss": 0.016069624572992325, "epoch": 7.454969979986657, "step": 22350 }, { "distill_loss": 0.12883426249027252, "epoch": 7.454969979986657, "step": 22350 }, { "epoch": 7.454969979986657, "ref_ce_loss": 0.057666610926389694, "step": 22350 }, { "epoch": 7.454969979986657, "loss": 0.198039710521698, "step": 22350 }, { "ce_loss": 0.013000822626054287, "epoch": 7.454969979986657, "step": 22350 }, { "distill_loss": 0.13102981448173523, "epoch": 7.454969979986657, "step": 22350 }, { "epoch": 7.454969979986657, "ref_ce_loss": 0.035978831350803375, "step": 22350 }, { "epoch": 7.454969979986657, "loss": 0.514954149723053, "step": 22350 }, { "ce_loss": 0.025409677997231483, "epoch": 7.454969979986657, "step": 22350 }, { "distill_loss": 0.1583758145570755, "epoch": 7.454969979986657, "step": 22350 }, { "epoch": 7.454969979986657, "ref_ce_loss": 0.07159221172332764, "step": 22350 }, { "epoch": 7.458305537024684, "loss": 0.3204, "step": 22360 }, { "epoch": 7.458305537024684, "grad_norm": 4.352915287017822, "step": 22360 }, { "epoch": 7.458305537024684, "learning_rate": 3.592716541907259e-06, "step": 22360 }, { "epoch": 7.458305537024684, "loss": 0.21900595724582672, "step": 22360 }, { "ce_loss": 0.026947587728500366, "epoch": 7.458305537024684, "step": 22360 }, { "distill_loss": 0.13711225986480713, "epoch": 7.458305537024684, "step": 22360 }, { "epoch": 7.458305537024684, "ref_ce_loss": 0.03863257169723511, "step": 22360 }, { "epoch": 7.458305537024684, "loss": 0.3650367259979248, "step": 22360 }, { "ce_loss": 0.06087297573685646, "epoch": 7.458305537024684, "step": 22360 }, { "distill_loss": 0.24326029419898987, "epoch": 7.458305537024684, "step": 22360 }, { "epoch": 7.458305537024684, "ref_ce_loss": 0.03927703574299812, "step": 22360 }, { "epoch": 7.458305537024684, "loss": 0.19165660440921783, "step": 22360 }, { "ce_loss": 0.02028833143413067, "epoch": 7.458305537024684, "step": 22360 }, { "distill_loss": 0.12279950082302094, "epoch": 7.458305537024684, "step": 22360 }, { "epoch": 7.458305537024684, "ref_ce_loss": 0.048306904733181, "step": 22360 }, { "epoch": 7.458305537024684, "loss": 0.23307588696479797, "step": 22360 }, { "ce_loss": 0.024532662704586983, "epoch": 7.458305537024684, "step": 22360 }, { "distill_loss": 0.13930636644363403, "epoch": 7.458305537024684, "step": 22360 }, { "epoch": 7.458305537024684, "ref_ce_loss": 0.03750590234994888, "step": 22360 }, { "epoch": 7.461641094062708, "loss": 0.3426, "step": 22370 }, { "epoch": 7.461641094062708, "grad_norm": 5.301361083984375, "step": 22370 }, { "epoch": 7.461641094062708, "learning_rate": 3.5487822472145487e-06, "step": 22370 }, { "epoch": 7.461641094062708, "loss": 0.3800077438354492, "step": 22370 }, { "ce_loss": 0.06131187081336975, "epoch": 7.461641094062708, "step": 22370 }, { "distill_loss": 0.17494143545627594, "epoch": 7.461641094062708, "step": 22370 }, { "epoch": 7.461641094062708, "ref_ce_loss": 0.06068550422787666, "step": 22370 }, { "epoch": 7.461641094062708, "loss": 0.19597984850406647, "step": 22370 }, { "ce_loss": 0.03716249763965607, "epoch": 7.461641094062708, "step": 22370 }, { "distill_loss": 0.11552690714597702, "epoch": 7.461641094062708, "step": 22370 }, { "epoch": 7.461641094062708, "ref_ce_loss": 0.043000392615795135, "step": 22370 }, { "epoch": 7.461641094062708, "loss": 0.32776740193367004, "step": 22370 }, { "ce_loss": 0.04577228054404259, "epoch": 7.461641094062708, "step": 22370 }, { "distill_loss": 0.15617133677005768, "epoch": 7.461641094062708, "step": 22370 }, { "epoch": 7.461641094062708, "ref_ce_loss": 0.051974330097436905, "step": 22370 }, { "epoch": 7.461641094062708, "loss": 0.2957746088504791, "step": 22370 }, { "ce_loss": 0.02486003190279007, "epoch": 7.461641094062708, "step": 22370 }, { "distill_loss": 0.19778414070606232, "epoch": 7.461641094062708, "step": 22370 }, { "epoch": 7.461641094062708, "ref_ce_loss": 0.04370433837175369, "step": 22370 }, { "epoch": 7.464976651100734, "loss": 0.3297, "step": 22380 }, { "epoch": 7.464976651100734, "grad_norm": 2.884024143218994, "step": 22380 }, { "epoch": 7.464976651100734, "learning_rate": 3.5051150217727197e-06, "step": 22380 }, { "epoch": 7.464976651100734, "loss": 0.31238073110580444, "step": 22380 }, { "ce_loss": 0.05873372033238411, "epoch": 7.464976651100734, "step": 22380 }, { "distill_loss": 0.1443156749010086, "epoch": 7.464976651100734, "step": 22380 }, { "epoch": 7.464976651100734, "ref_ce_loss": 0.054802797734737396, "step": 22380 }, { "epoch": 7.464976651100734, "loss": 0.648821234703064, "step": 22380 }, { "ce_loss": 0.030163580551743507, "epoch": 7.464976651100734, "step": 22380 }, { "distill_loss": 0.17426523566246033, "epoch": 7.464976651100734, "step": 22380 }, { "epoch": 7.464976651100734, "ref_ce_loss": 0.06576082855463028, "step": 22380 }, { "epoch": 7.464976651100734, "loss": 0.34878626465797424, "step": 22380 }, { "ce_loss": 0.0797041729092598, "epoch": 7.464976651100734, "step": 22380 }, { "distill_loss": 0.192304790019989, "epoch": 7.464976651100734, "step": 22380 }, { "epoch": 7.464976651100734, "ref_ce_loss": 0.03504239022731781, "step": 22380 }, { "epoch": 7.464976651100734, "loss": 0.14502675831317902, "step": 22380 }, { "ce_loss": 0.003223717911168933, "epoch": 7.464976651100734, "step": 22380 }, { "distill_loss": 0.1210360899567604, "epoch": 7.464976651100734, "step": 22380 }, { "epoch": 7.464976651100734, "ref_ce_loss": 0.020702652633190155, "step": 22380 }, { "epoch": 7.468312208138759, "loss": 0.3046, "step": 22390 }, { "epoch": 7.468312208138759, "grad_norm": 2.712825059890747, "step": 22390 }, { "epoch": 7.468312208138759, "learning_rate": 3.4617149452135897e-06, "step": 22390 }, { "epoch": 7.468312208138759, "loss": 0.31714093685150146, "step": 22390 }, { "ce_loss": 0.02619621716439724, "epoch": 7.468312208138759, "step": 22390 }, { "distill_loss": 0.15785712003707886, "epoch": 7.468312208138759, "step": 22390 }, { "epoch": 7.468312208138759, "ref_ce_loss": 0.0551065132021904, "step": 22390 }, { "epoch": 7.468312208138759, "loss": 0.24565093219280243, "step": 22390 }, { "ce_loss": 0.01495307870209217, "epoch": 7.468312208138759, "step": 22390 }, { "distill_loss": 0.08529633283615112, "epoch": 7.468312208138759, "step": 22390 }, { "epoch": 7.468312208138759, "ref_ce_loss": 0.041590429842472076, "step": 22390 }, { "epoch": 7.468312208138759, "loss": 0.4593181312084198, "step": 22390 }, { "ce_loss": 0.02708379551768303, "epoch": 7.468312208138759, "step": 22390 }, { "distill_loss": 0.15476664900779724, "epoch": 7.468312208138759, "step": 22390 }, { "epoch": 7.468312208138759, "ref_ce_loss": 0.06588542461395264, "step": 22390 }, { "epoch": 7.468312208138759, "loss": 0.42947351932525635, "step": 22390 }, { "ce_loss": 0.05233415216207504, "epoch": 7.468312208138759, "step": 22390 }, { "distill_loss": 0.19729653000831604, "epoch": 7.468312208138759, "step": 22390 }, { "epoch": 7.468312208138759, "ref_ce_loss": 0.04568365216255188, "step": 22390 }, { "epoch": 7.471647765176785, "loss": 0.319, "step": 22400 }, { "epoch": 7.471647765176785, "grad_norm": 2.807758092880249, "step": 22400 }, { "epoch": 7.471647765176785, "learning_rate": 3.418582096681766e-06, "step": 22400 }, { "epoch": 7.471647765176785, "loss": 0.2579874098300934, "step": 22400 }, { "ce_loss": 0.01579379104077816, "epoch": 7.471647765176785, "step": 22400 }, { "distill_loss": 0.12108300626277924, "epoch": 7.471647765176785, "step": 22400 }, { "epoch": 7.471647765176785, "ref_ce_loss": 0.04175504297018051, "step": 22400 }, { "epoch": 7.471647765176785, "loss": 0.3060038089752197, "step": 22400 }, { "ce_loss": 0.02407224290072918, "epoch": 7.471647765176785, "step": 22400 }, { "distill_loss": 0.18588589131832123, "epoch": 7.471647765176785, "step": 22400 }, { "epoch": 7.471647765176785, "ref_ce_loss": 0.04403572157025337, "step": 22400 }, { "epoch": 7.471647765176785, "loss": 0.20720121264457703, "step": 22400 }, { "ce_loss": 0.027171071618795395, "epoch": 7.471647765176785, "step": 22400 }, { "distill_loss": 0.10979503393173218, "epoch": 7.471647765176785, "step": 22400 }, { "epoch": 7.471647765176785, "ref_ce_loss": 0.057181816548109055, "step": 22400 }, { "epoch": 7.471647765176785, "loss": 0.24890443682670593, "step": 22400 }, { "ce_loss": 0.0304836668074131, "epoch": 7.471647765176785, "step": 22400 }, { "distill_loss": 0.12231708317995071, "epoch": 7.471647765176785, "step": 22400 }, { "epoch": 7.471647765176785, "ref_ce_loss": 0.04766134172677994, "step": 22400 }, { "epoch": 7.474983322214809, "loss": 0.3347, "step": 22410 }, { "epoch": 7.474983322214809, "grad_norm": 2.27168345451355, "step": 22410 }, { "epoch": 7.474983322214809, "learning_rate": 3.375716554834529e-06, "step": 22410 }, { "epoch": 7.474983322214809, "loss": 0.22012270987033844, "step": 22410 }, { "ce_loss": 0.010555864311754704, "epoch": 7.474983322214809, "step": 22410 }, { "distill_loss": 0.14049920439720154, "epoch": 7.474983322214809, "step": 22410 }, { "epoch": 7.474983322214809, "ref_ce_loss": 0.03784632682800293, "step": 22410 }, { "epoch": 7.474983322214809, "loss": 0.21688060462474823, "step": 22410 }, { "ce_loss": 0.003909058403223753, "epoch": 7.474983322214809, "step": 22410 }, { "distill_loss": 0.1273895502090454, "epoch": 7.474983322214809, "step": 22410 }, { "epoch": 7.474983322214809, "ref_ce_loss": 0.06180055812001228, "step": 22410 }, { "epoch": 7.474983322214809, "loss": 0.6481152772903442, "step": 22410 }, { "ce_loss": 0.07658793032169342, "epoch": 7.474983322214809, "step": 22410 }, { "distill_loss": 0.22413767874240875, "epoch": 7.474983322214809, "step": 22410 }, { "epoch": 7.474983322214809, "ref_ce_loss": 0.0503942035138607, "step": 22410 }, { "epoch": 7.474983322214809, "loss": 0.4268360733985901, "step": 22410 }, { "ce_loss": 0.046731140464544296, "epoch": 7.474983322214809, "step": 22410 }, { "distill_loss": 0.11381864547729492, "epoch": 7.474983322214809, "step": 22410 }, { "epoch": 7.474983322214809, "ref_ce_loss": 0.03432987257838249, "step": 22410 }, { "epoch": 7.478318879252836, "loss": 0.3312, "step": 22420 }, { "epoch": 7.478318879252836, "grad_norm": 3.638110399246216, "step": 22420 }, { "epoch": 7.478318879252836, "learning_rate": 3.3331183978417496e-06, "step": 22420 }, { "epoch": 7.478318879252836, "loss": 0.20778049528598785, "step": 22420 }, { "ce_loss": 0.00603119982406497, "epoch": 7.478318879252836, "step": 22420 }, { "distill_loss": 0.16420957446098328, "epoch": 7.478318879252836, "step": 22420 }, { "epoch": 7.478318879252836, "ref_ce_loss": 0.03743180260062218, "step": 22420 }, { "epoch": 7.478318879252836, "loss": 0.20351877808570862, "step": 22420 }, { "ce_loss": 0.001148868934251368, "epoch": 7.478318879252836, "step": 22420 }, { "distill_loss": 0.11724899709224701, "epoch": 7.478318879252836, "step": 22420 }, { "epoch": 7.478318879252836, "ref_ce_loss": 0.052420634776353836, "step": 22420 }, { "epoch": 7.478318879252836, "loss": 0.2200160175561905, "step": 22420 }, { "ce_loss": 0.043612752109766006, "epoch": 7.478318879252836, "step": 22420 }, { "distill_loss": 0.12745922803878784, "epoch": 7.478318879252836, "step": 22420 }, { "epoch": 7.478318879252836, "ref_ce_loss": 0.03912469744682312, "step": 22420 }, { "epoch": 7.478318879252836, "loss": 0.2272098958492279, "step": 22420 }, { "ce_loss": 0.023629697039723396, "epoch": 7.478318879252836, "step": 22420 }, { "distill_loss": 0.15925733745098114, "epoch": 7.478318879252836, "step": 22420 }, { "epoch": 7.478318879252836, "ref_ce_loss": 0.02677595056593418, "step": 22420 }, { "epoch": 7.48165443629086, "loss": 0.2968, "step": 22430 }, { "epoch": 7.48165443629086, "grad_norm": 2.6959264278411865, "step": 22430 }, { "epoch": 7.48165443629086, "learning_rate": 3.2907877033856387e-06, "step": 22430 }, { "epoch": 7.48165443629086, "loss": 0.32253384590148926, "step": 22430 }, { "ce_loss": 0.06485221534967422, "epoch": 7.48165443629086, "step": 22430 }, { "distill_loss": 0.17065764963626862, "epoch": 7.48165443629086, "step": 22430 }, { "epoch": 7.48165443629086, "ref_ce_loss": 0.042805016040802, "step": 22430 }, { "epoch": 7.48165443629086, "loss": 0.3148137629032135, "step": 22430 }, { "ce_loss": 0.022289402782917023, "epoch": 7.48165443629086, "step": 22430 }, { "distill_loss": 0.09498833119869232, "epoch": 7.48165443629086, "step": 22430 }, { "epoch": 7.48165443629086, "ref_ce_loss": 0.07857701182365417, "step": 22430 }, { "epoch": 7.48165443629086, "loss": 0.19593794643878937, "step": 22430 }, { "ce_loss": 0.0311006810516119, "epoch": 7.48165443629086, "step": 22430 }, { "distill_loss": 0.10990280658006668, "epoch": 7.48165443629086, "step": 22430 }, { "epoch": 7.48165443629086, "ref_ce_loss": 0.03655588626861572, "step": 22430 }, { "epoch": 7.48165443629086, "loss": 0.23988619446754456, "step": 22430 }, { "ce_loss": 0.049732793122529984, "epoch": 7.48165443629086, "step": 22430 }, { "distill_loss": 0.10762669891119003, "epoch": 7.48165443629086, "step": 22430 }, { "epoch": 7.48165443629086, "ref_ce_loss": 0.043731361627578735, "step": 22430 }, { "epoch": 7.484989993328886, "loss": 0.3048, "step": 22440 }, { "epoch": 7.484989993328886, "grad_norm": 3.093358278274536, "step": 22440 }, { "epoch": 7.484989993328886, "learning_rate": 3.2487245486607137e-06, "step": 22440 }, { "epoch": 7.484989993328886, "loss": 0.29418665170669556, "step": 22440 }, { "ce_loss": 0.01880057156085968, "epoch": 7.484989993328886, "step": 22440 }, { "distill_loss": 0.0965670645236969, "epoch": 7.484989993328886, "step": 22440 }, { "epoch": 7.484989993328886, "ref_ce_loss": 0.03197801858186722, "step": 22440 }, { "epoch": 7.484989993328886, "loss": 0.5496841073036194, "step": 22440 }, { "ce_loss": 0.0656195804476738, "epoch": 7.484989993328886, "step": 22440 }, { "distill_loss": 0.3139565587043762, "epoch": 7.484989993328886, "step": 22440 }, { "epoch": 7.484989993328886, "ref_ce_loss": 0.08307035267353058, "step": 22440 }, { "epoch": 7.484989993328886, "loss": 0.32252851128578186, "step": 22440 }, { "ce_loss": 0.03442087024450302, "epoch": 7.484989993328886, "step": 22440 }, { "distill_loss": 0.24391041696071625, "epoch": 7.484989993328886, "step": 22440 }, { "epoch": 7.484989993328886, "ref_ce_loss": 0.04416259750723839, "step": 22440 }, { "epoch": 7.484989993328886, "loss": 0.6654932498931885, "step": 22440 }, { "ce_loss": 0.01959805190563202, "epoch": 7.484989993328886, "step": 22440 }, { "distill_loss": 0.22524727880954742, "epoch": 7.484989993328886, "step": 22440 }, { "epoch": 7.484989993328886, "ref_ce_loss": 0.08550272136926651, "step": 22440 }, { "epoch": 7.488325550366911, "loss": 0.3117, "step": 22450 }, { "epoch": 7.488325550366911, "grad_norm": 3.314997434616089, "step": 22450 }, { "epoch": 7.488325550366911, "learning_rate": 3.206929010373549e-06, "step": 22450 }, { "epoch": 7.488325550366911, "loss": 0.2675893306732178, "step": 22450 }, { "ce_loss": 0.001301237614825368, "epoch": 7.488325550366911, "step": 22450 }, { "distill_loss": 0.1589747667312622, "epoch": 7.488325550366911, "step": 22450 }, { "epoch": 7.488325550366911, "ref_ce_loss": 0.03607611358165741, "step": 22450 }, { "epoch": 7.488325550366911, "loss": 0.2627130150794983, "step": 22450 }, { "ce_loss": 0.03996428847312927, "epoch": 7.488325550366911, "step": 22450 }, { "distill_loss": 0.1684572398662567, "epoch": 7.488325550366911, "step": 22450 }, { "epoch": 7.488325550366911, "ref_ce_loss": 0.03458402305841446, "step": 22450 }, { "epoch": 7.488325550366911, "loss": 0.21059346199035645, "step": 22450 }, { "ce_loss": 0.0070762732066214085, "epoch": 7.488325550366911, "step": 22450 }, { "distill_loss": 0.10879668593406677, "epoch": 7.488325550366911, "step": 22450 }, { "epoch": 7.488325550366911, "ref_ce_loss": 0.05947238579392433, "step": 22450 }, { "epoch": 7.488325550366911, "loss": 0.3560434579849243, "step": 22450 }, { "ce_loss": 0.028698332607746124, "epoch": 7.488325550366911, "step": 22450 }, { "distill_loss": 0.13503122329711914, "epoch": 7.488325550366911, "step": 22450 }, { "epoch": 7.488325550366911, "ref_ce_loss": 0.072860948741436, "step": 22450 }, { "epoch": 7.491661107404937, "loss": 0.3396, "step": 22460 }, { "epoch": 7.491661107404937, "grad_norm": 3.122793674468994, "step": 22460 }, { "epoch": 7.491661107404937, "learning_rate": 3.165401164742709e-06, "step": 22460 }, { "epoch": 7.491661107404937, "loss": 0.6378544569015503, "step": 22460 }, { "ce_loss": 0.030728649348020554, "epoch": 7.491661107404937, "step": 22460 }, { "distill_loss": 0.4074413776397705, "epoch": 7.491661107404937, "step": 22460 }, { "epoch": 7.491661107404937, "ref_ce_loss": 0.05243341997265816, "step": 22460 }, { "epoch": 7.491661107404937, "loss": 0.22806666791439056, "step": 22460 }, { "ce_loss": 0.033266473561525345, "epoch": 7.491661107404937, "step": 22460 }, { "distill_loss": 0.11113715171813965, "epoch": 7.491661107404937, "step": 22460 }, { "epoch": 7.491661107404937, "ref_ce_loss": 0.053928226232528687, "step": 22460 }, { "epoch": 7.491661107404937, "loss": 0.2969411313533783, "step": 22460 }, { "ce_loss": 0.0053944881074130535, "epoch": 7.491661107404937, "step": 22460 }, { "distill_loss": 0.17704196274280548, "epoch": 7.491661107404937, "step": 22460 }, { "epoch": 7.491661107404937, "ref_ce_loss": 0.04825448989868164, "step": 22460 }, { "epoch": 7.491661107404937, "loss": 0.314153790473938, "step": 22460 }, { "ce_loss": 0.031788185238838196, "epoch": 7.491661107404937, "step": 22460 }, { "distill_loss": 0.19077731668949127, "epoch": 7.491661107404937, "step": 22460 }, { "epoch": 7.491661107404937, "ref_ce_loss": 0.0667947381734848, "step": 22460 }, { "epoch": 7.4949966644429615, "loss": 0.3497, "step": 22470 }, { "epoch": 7.4949966644429615, "grad_norm": 3.5940418243408203, "step": 22470 }, { "epoch": 7.4949966644429615, "learning_rate": 3.1241410874986495e-06, "step": 22470 }, { "epoch": 7.4949966644429615, "loss": 0.38101842999458313, "step": 22470 }, { "ce_loss": 0.02718617394566536, "epoch": 7.4949966644429615, "step": 22470 }, { "distill_loss": 0.16715385019779205, "epoch": 7.4949966644429615, "step": 22470 }, { "epoch": 7.4949966644429615, "ref_ce_loss": 0.05261871591210365, "step": 22470 }, { "epoch": 7.4949966644429615, "loss": 0.3260224163532257, "step": 22470 }, { "ce_loss": 0.04991668462753296, "epoch": 7.4949966644429615, "step": 22470 }, { "distill_loss": 0.13207031786441803, "epoch": 7.4949966644429615, "step": 22470 }, { "epoch": 7.4949966644429615, "ref_ce_loss": 0.0459817498922348, "step": 22470 }, { "epoch": 7.4949966644429615, "loss": 0.21622683107852936, "step": 22470 }, { "ce_loss": 0.021841280162334442, "epoch": 7.4949966644429615, "step": 22470 }, { "distill_loss": 0.14691105484962463, "epoch": 7.4949966644429615, "step": 22470 }, { "epoch": 7.4949966644429615, "ref_ce_loss": 0.033241719007492065, "step": 22470 }, { "epoch": 7.4949966644429615, "loss": 0.42072075605392456, "step": 22470 }, { "ce_loss": 0.04634197801351547, "epoch": 7.4949966644429615, "step": 22470 }, { "distill_loss": 0.16500091552734375, "epoch": 7.4949966644429615, "step": 22470 }, { "epoch": 7.4949966644429615, "ref_ce_loss": 0.05034958943724632, "step": 22470 }, { "epoch": 7.498332221480988, "loss": 0.3099, "step": 22480 }, { "epoch": 7.498332221480988, "grad_norm": 2.503765821456909, "step": 22480 }, { "epoch": 7.498332221480988, "learning_rate": 3.0831488538834328e-06, "step": 22480 }, { "epoch": 7.498332221480988, "loss": 0.24750734865665436, "step": 22480 }, { "ce_loss": 0.017951542511582375, "epoch": 7.498332221480988, "step": 22480 }, { "distill_loss": 0.17765724658966064, "epoch": 7.498332221480988, "step": 22480 }, { "epoch": 7.498332221480988, "ref_ce_loss": 0.04299882426857948, "step": 22480 }, { "epoch": 7.498332221480988, "loss": 0.530583918094635, "step": 22480 }, { "ce_loss": 0.05396854504942894, "epoch": 7.498332221480988, "step": 22480 }, { "distill_loss": 0.19391027092933655, "epoch": 7.498332221480988, "step": 22480 }, { "epoch": 7.498332221480988, "ref_ce_loss": 0.03469100221991539, "step": 22480 }, { "epoch": 7.498332221480988, "loss": 0.3143228590488434, "step": 22480 }, { "ce_loss": 0.003916238900274038, "epoch": 7.498332221480988, "step": 22480 }, { "distill_loss": 0.23221957683563232, "epoch": 7.498332221480988, "step": 22480 }, { "epoch": 7.498332221480988, "ref_ce_loss": 0.04830393195152283, "step": 22480 }, { "epoch": 7.498332221480988, "loss": 0.2439468801021576, "step": 22480 }, { "ce_loss": 0.03384243696928024, "epoch": 7.498332221480988, "step": 22480 }, { "distill_loss": 0.12823912501335144, "epoch": 7.498332221480988, "step": 22480 }, { "epoch": 7.498332221480988, "ref_ce_loss": 0.05655447021126747, "step": 22480 }, { "epoch": 7.501667778519012, "loss": 0.3374, "step": 22490 }, { "epoch": 7.501667778519012, "grad_norm": 3.2659122943878174, "step": 22490 }, { "epoch": 7.501667778519012, "learning_rate": 3.0424245386507286e-06, "step": 22490 }, { "epoch": 7.501667778519012, "loss": 0.1496151238679886, "step": 22490 }, { "ce_loss": 0.005388977937400341, "epoch": 7.501667778519012, "step": 22490 }, { "distill_loss": 0.10397054255008698, "epoch": 7.501667778519012, "step": 22490 }, { "epoch": 7.501667778519012, "ref_ce_loss": 0.040133412927389145, "step": 22490 }, { "epoch": 7.501667778519012, "loss": 0.41377997398376465, "step": 22490 }, { "ce_loss": 0.05353647097945213, "epoch": 7.501667778519012, "step": 22490 }, { "distill_loss": 0.21046878397464752, "epoch": 7.501667778519012, "step": 22490 }, { "epoch": 7.501667778519012, "ref_ce_loss": 0.05055622756481171, "step": 22490 }, { "epoch": 7.501667778519012, "loss": 0.298939049243927, "step": 22490 }, { "ce_loss": 0.07888401299715042, "epoch": 7.501667778519012, "step": 22490 }, { "distill_loss": 0.13656878471374512, "epoch": 7.501667778519012, "step": 22490 }, { "epoch": 7.501667778519012, "ref_ce_loss": 0.05380229651927948, "step": 22490 }, { "epoch": 7.501667778519012, "loss": 0.2905265688896179, "step": 22490 }, { "ce_loss": 0.022800395265221596, "epoch": 7.501667778519012, "step": 22490 }, { "distill_loss": 0.1571730673313141, "epoch": 7.501667778519012, "step": 22490 }, { "epoch": 7.501667778519012, "ref_ce_loss": 0.05390867590904236, "step": 22490 }, { "epoch": 7.5050033355570385, "loss": 0.321, "step": 22500 }, { "epoch": 7.5050033355570385, "grad_norm": 3.711552381515503, "step": 22500 }, { "epoch": 7.5050033355570385, "learning_rate": 3.0019682160656642e-06, "step": 22500 }, { "epoch": 7.5050033355570385, "loss": 0.2089131623506546, "step": 22500 }, { "ce_loss": 0.02487696148455143, "epoch": 7.5050033355570385, "step": 22500 }, { "distill_loss": 0.13498246669769287, "epoch": 7.5050033355570385, "step": 22500 }, { "epoch": 7.5050033355570385, "ref_ce_loss": 0.04884064197540283, "step": 22500 }, { "epoch": 7.5050033355570385, "loss": 0.5317951440811157, "step": 22500 }, { "ce_loss": 0.018521640449762344, "epoch": 7.5050033355570385, "step": 22500 }, { "distill_loss": 0.10497358441352844, "epoch": 7.5050033355570385, "step": 22500 }, { "epoch": 7.5050033355570385, "ref_ce_loss": 0.07159274071455002, "step": 22500 }, { "epoch": 7.5050033355570385, "loss": 0.33921340107917786, "step": 22500 }, { "ce_loss": 0.060679879039525986, "epoch": 7.5050033355570385, "step": 22500 }, { "distill_loss": 0.14240986108779907, "epoch": 7.5050033355570385, "step": 22500 }, { "epoch": 7.5050033355570385, "ref_ce_loss": 0.06749631464481354, "step": 22500 }, { "epoch": 7.5050033355570385, "loss": 0.3387434482574463, "step": 22500 }, { "ce_loss": 0.0036824748385697603, "epoch": 7.5050033355570385, "step": 22500 }, { "distill_loss": 0.14505694806575775, "epoch": 7.5050033355570385, "step": 22500 }, { "epoch": 7.5050033355570385, "ref_ce_loss": 0.04898282513022423, "step": 22500 }, { "epoch": 7.508338892595063, "loss": 0.343, "step": 22510 }, { "epoch": 7.508338892595063, "grad_norm": 3.4600985050201416, "step": 22510 }, { "epoch": 7.508338892595063, "learning_rate": 2.9617799599045588e-06, "step": 22510 }, { "epoch": 7.508338892595063, "loss": 0.23449355363845825, "step": 22510 }, { "ce_loss": 0.010881789028644562, "epoch": 7.508338892595063, "step": 22510 }, { "distill_loss": 0.17654874920845032, "epoch": 7.508338892595063, "step": 22510 }, { "epoch": 7.508338892595063, "ref_ce_loss": 0.02779705449938774, "step": 22510 }, { "epoch": 7.508338892595063, "loss": 0.5081146359443665, "step": 22510 }, { "ce_loss": 0.06562300026416779, "epoch": 7.508338892595063, "step": 22510 }, { "distill_loss": 0.2453111857175827, "epoch": 7.508338892595063, "step": 22510 }, { "epoch": 7.508338892595063, "ref_ce_loss": 0.07138422876596451, "step": 22510 }, { "epoch": 7.508338892595063, "loss": 0.17814725637435913, "step": 22510 }, { "ce_loss": 0.004538268316537142, "epoch": 7.508338892595063, "step": 22510 }, { "distill_loss": 0.1134893000125885, "epoch": 7.508338892595063, "step": 22510 }, { "epoch": 7.508338892595063, "ref_ce_loss": 0.03875197842717171, "step": 22510 }, { "epoch": 7.508338892595063, "loss": 0.24402591586112976, "step": 22510 }, { "ce_loss": 0.023568812757730484, "epoch": 7.508338892595063, "step": 22510 }, { "distill_loss": 0.16957291960716248, "epoch": 7.508338892595063, "step": 22510 }, { "epoch": 7.508338892595063, "ref_ce_loss": 0.0506262481212616, "step": 22510 }, { "epoch": 7.511674449633089, "loss": 0.3148, "step": 22520 }, { "epoch": 7.511674449633089, "grad_norm": 5.0570549964904785, "step": 22520 }, { "epoch": 7.511674449633089, "learning_rate": 2.9218598434549876e-06, "step": 22520 }, { "epoch": 7.511674449633089, "loss": 0.30400851368904114, "step": 22520 }, { "ce_loss": 0.02792799100279808, "epoch": 7.511674449633089, "step": 22520 }, { "distill_loss": 0.2112709879875183, "epoch": 7.511674449633089, "step": 22520 }, { "epoch": 7.511674449633089, "ref_ce_loss": 0.053309354931116104, "step": 22520 }, { "epoch": 7.511674449633089, "loss": 0.2785424590110779, "step": 22520 }, { "ce_loss": 0.041298575699329376, "epoch": 7.511674449633089, "step": 22520 }, { "distill_loss": 0.18324530124664307, "epoch": 7.511674449633089, "step": 22520 }, { "epoch": 7.511674449633089, "ref_ce_loss": 0.05387912318110466, "step": 22520 }, { "epoch": 7.511674449633089, "loss": 0.3883103132247925, "step": 22520 }, { "ce_loss": 0.010573752224445343, "epoch": 7.511674449633089, "step": 22520 }, { "distill_loss": 0.317501962184906, "epoch": 7.511674449633089, "step": 22520 }, { "epoch": 7.511674449633089, "ref_ce_loss": 0.036491647362709045, "step": 22520 }, { "epoch": 7.511674449633089, "loss": 0.18496163189411163, "step": 22520 }, { "ce_loss": 0.01726536639034748, "epoch": 7.511674449633089, "step": 22520 }, { "distill_loss": 0.13057807087898254, "epoch": 7.511674449633089, "step": 22520 }, { "epoch": 7.511674449633089, "ref_ce_loss": 0.03697721287608147, "step": 22520 }, { "epoch": 7.515010006671114, "loss": 0.3219, "step": 22530 }, { "epoch": 7.515010006671114, "grad_norm": 4.546923637390137, "step": 22530 }, { "epoch": 7.515010006671114, "learning_rate": 2.882207939515435e-06, "step": 22530 }, { "epoch": 7.515010006671114, "loss": 0.5167214274406433, "step": 22530 }, { "ce_loss": 0.02184062823653221, "epoch": 7.515010006671114, "step": 22530 }, { "distill_loss": 0.2753063440322876, "epoch": 7.515010006671114, "step": 22530 }, { "epoch": 7.515010006671114, "ref_ce_loss": 0.057973772287368774, "step": 22530 }, { "epoch": 7.515010006671114, "loss": 0.2361687868833542, "step": 22530 }, { "ce_loss": 0.043745920062065125, "epoch": 7.515010006671114, "step": 22530 }, { "distill_loss": 0.10686971247196198, "epoch": 7.515010006671114, "step": 22530 }, { "epoch": 7.515010006671114, "ref_ce_loss": 0.05657653138041496, "step": 22530 }, { "epoch": 7.515010006671114, "loss": 0.26156085729599, "step": 22530 }, { "ce_loss": 0.014919279143214226, "epoch": 7.515010006671114, "step": 22530 }, { "distill_loss": 0.17853476107120514, "epoch": 7.515010006671114, "step": 22530 }, { "epoch": 7.515010006671114, "ref_ce_loss": 0.05031314119696617, "step": 22530 }, { "epoch": 7.515010006671114, "loss": 0.22457215189933777, "step": 22530 }, { "ce_loss": 0.05354100838303566, "epoch": 7.515010006671114, "step": 22530 }, { "distill_loss": 0.11665277928113937, "epoch": 7.515010006671114, "step": 22530 }, { "epoch": 7.515010006671114, "ref_ce_loss": 0.05427921563386917, "step": 22530 }, { "epoch": 7.51834556370914, "loss": 0.3382, "step": 22540 }, { "epoch": 7.51834556370914, "grad_norm": 3.0563387870788574, "step": 22540 }, { "epoch": 7.51834556370914, "learning_rate": 2.842824320395376e-06, "step": 22540 }, { "epoch": 7.51834556370914, "loss": 0.3265399932861328, "step": 22540 }, { "ce_loss": 0.036622580140829086, "epoch": 7.51834556370914, "step": 22540 }, { "distill_loss": 0.18803177773952484, "epoch": 7.51834556370914, "step": 22540 }, { "epoch": 7.51834556370914, "ref_ce_loss": 0.05568910762667656, "step": 22540 }, { "epoch": 7.51834556370914, "loss": 0.3589942455291748, "step": 22540 }, { "ce_loss": 0.07598867267370224, "epoch": 7.51834556370914, "step": 22540 }, { "distill_loss": 0.2123461663722992, "epoch": 7.51834556370914, "step": 22540 }, { "epoch": 7.51834556370914, "ref_ce_loss": 0.051019832491874695, "step": 22540 }, { "epoch": 7.51834556370914, "loss": 0.19223347306251526, "step": 22540 }, { "ce_loss": 0.024045389145612717, "epoch": 7.51834556370914, "step": 22540 }, { "distill_loss": 0.12357673048973083, "epoch": 7.51834556370914, "step": 22540 }, { "epoch": 7.51834556370914, "ref_ce_loss": 0.0445307120680809, "step": 22540 }, { "epoch": 7.51834556370914, "loss": 0.242498517036438, "step": 22540 }, { "ce_loss": 0.021375758573412895, "epoch": 7.51834556370914, "step": 22540 }, { "distill_loss": 0.1691853255033493, "epoch": 7.51834556370914, "step": 22540 }, { "epoch": 7.51834556370914, "ref_ce_loss": 0.051571715623140335, "step": 22540 }, { "epoch": 7.521681120747164, "loss": 0.3048, "step": 22550 }, { "epoch": 7.521681120747164, "grad_norm": 3.6586086750030518, "step": 22550 }, { "epoch": 7.521681120747164, "learning_rate": 2.80370905791501e-06, "step": 22550 }, { "epoch": 7.521681120747164, "loss": 0.18601231276988983, "step": 22550 }, { "ce_loss": 0.018930969759821892, "epoch": 7.521681120747164, "step": 22550 }, { "distill_loss": 0.12149959802627563, "epoch": 7.521681120747164, "step": 22550 }, { "epoch": 7.521681120747164, "ref_ce_loss": 0.03169435262680054, "step": 22550 }, { "epoch": 7.521681120747164, "loss": 0.21780389547348022, "step": 22550 }, { "ce_loss": 0.017960339784622192, "epoch": 7.521681120747164, "step": 22550 }, { "distill_loss": 0.10064493864774704, "epoch": 7.521681120747164, "step": 22550 }, { "epoch": 7.521681120747164, "ref_ce_loss": 0.03087751753628254, "step": 22550 }, { "epoch": 7.521681120747164, "loss": 0.26542627811431885, "step": 22550 }, { "ce_loss": 0.04006009176373482, "epoch": 7.521681120747164, "step": 22550 }, { "distill_loss": 0.19296851754188538, "epoch": 7.521681120747164, "step": 22550 }, { "epoch": 7.521681120747164, "ref_ce_loss": 0.032322581857442856, "step": 22550 }, { "epoch": 7.521681120747164, "loss": 0.5728837251663208, "step": 22550 }, { "ce_loss": 0.023503512144088745, "epoch": 7.521681120747164, "step": 22550 }, { "distill_loss": 0.22009573876857758, "epoch": 7.521681120747164, "step": 22550 }, { "epoch": 7.521681120747164, "ref_ce_loss": 0.05492442473769188, "step": 22550 }, { "epoch": 7.525016677785191, "loss": 0.3103, "step": 22560 }, { "epoch": 7.525016677785191, "grad_norm": 5.0174641609191895, "step": 22560 }, { "epoch": 7.525016677785191, "learning_rate": 2.7648622234050955e-06, "step": 22560 }, { "epoch": 7.525016677785191, "loss": 0.27467140555381775, "step": 22560 }, { "ce_loss": 0.02688334323465824, "epoch": 7.525016677785191, "step": 22560 }, { "distill_loss": 0.17189669609069824, "epoch": 7.525016677785191, "step": 22560 }, { "epoch": 7.525016677785191, "ref_ce_loss": 0.050332751125097275, "step": 22560 }, { "epoch": 7.525016677785191, "loss": 0.24028046429157257, "step": 22560 }, { "ce_loss": 0.021113159134984016, "epoch": 7.525016677785191, "step": 22560 }, { "distill_loss": 0.11697718501091003, "epoch": 7.525016677785191, "step": 22560 }, { "epoch": 7.525016677785191, "ref_ce_loss": 0.061597343534231186, "step": 22560 }, { "epoch": 7.525016677785191, "loss": 0.19227828085422516, "step": 22560 }, { "ce_loss": 0.008966523222625256, "epoch": 7.525016677785191, "step": 22560 }, { "distill_loss": 0.122565358877182, "epoch": 7.525016677785191, "step": 22560 }, { "epoch": 7.525016677785191, "ref_ce_loss": 0.038611847907304764, "step": 22560 }, { "epoch": 7.525016677785191, "loss": 0.3993758261203766, "step": 22560 }, { "ce_loss": 0.018132777884602547, "epoch": 7.525016677785191, "step": 22560 }, { "distill_loss": 0.21471086144447327, "epoch": 7.525016677785191, "step": 22560 }, { "epoch": 7.525016677785191, "ref_ce_loss": 0.0625390112400055, "step": 22560 }, { "epoch": 7.528352234823215, "loss": 0.3156, "step": 22570 }, { "epoch": 7.528352234823215, "grad_norm": 3.8104166984558105, "step": 22570 }, { "epoch": 7.528352234823215, "learning_rate": 2.7262838877069982e-06, "step": 22570 }, { "epoch": 7.528352234823215, "loss": 0.36098623275756836, "step": 22570 }, { "ce_loss": 0.04272822290658951, "epoch": 7.528352234823215, "step": 22570 }, { "distill_loss": 0.17549145221710205, "epoch": 7.528352234823215, "step": 22570 }, { "epoch": 7.528352234823215, "ref_ce_loss": 0.05325407162308693, "step": 22570 }, { "epoch": 7.528352234823215, "loss": 0.31836822628974915, "step": 22570 }, { "ce_loss": 0.03900127857923508, "epoch": 7.528352234823215, "step": 22570 }, { "distill_loss": 0.1361355483531952, "epoch": 7.528352234823215, "step": 22570 }, { "epoch": 7.528352234823215, "ref_ce_loss": 0.07174643129110336, "step": 22570 }, { "epoch": 7.528352234823215, "loss": 0.37250638008117676, "step": 22570 }, { "ce_loss": 0.02075023390352726, "epoch": 7.528352234823215, "step": 22570 }, { "distill_loss": 0.19733430445194244, "epoch": 7.528352234823215, "step": 22570 }, { "epoch": 7.528352234823215, "ref_ce_loss": 0.05269167572259903, "step": 22570 }, { "epoch": 7.528352234823215, "loss": 0.4085127115249634, "step": 22570 }, { "ce_loss": 0.08139225840568542, "epoch": 7.528352234823215, "step": 22570 }, { "distill_loss": 0.15659675002098083, "epoch": 7.528352234823215, "step": 22570 }, { "epoch": 7.528352234823215, "ref_ce_loss": 0.0547124482691288, "step": 22570 }, { "epoch": 7.531687791861241, "loss": 0.3288, "step": 22580 }, { "epoch": 7.531687791861241, "grad_norm": 3.1944923400878906, "step": 22580 }, { "epoch": 7.531687791861241, "learning_rate": 2.687974121172326e-06, "step": 22580 }, { "epoch": 7.531687791861241, "loss": 0.4330209791660309, "step": 22580 }, { "ce_loss": 0.05265730619430542, "epoch": 7.531687791861241, "step": 22580 }, { "distill_loss": 0.21802186965942383, "epoch": 7.531687791861241, "step": 22580 }, { "epoch": 7.531687791861241, "ref_ce_loss": 0.05064508691430092, "step": 22580 }, { "epoch": 7.531687791861241, "loss": 0.2763577401638031, "step": 22580 }, { "ce_loss": 0.0059791612438857555, "epoch": 7.531687791861241, "step": 22580 }, { "distill_loss": 0.19465699791908264, "epoch": 7.531687791861241, "step": 22580 }, { "epoch": 7.531687791861241, "ref_ce_loss": 0.04413512349128723, "step": 22580 }, { "epoch": 7.531687791861241, "loss": 0.15393657982349396, "step": 22580 }, { "ce_loss": 0.031021667644381523, "epoch": 7.531687791861241, "step": 22580 }, { "distill_loss": 0.08659686893224716, "epoch": 7.531687791861241, "step": 22580 }, { "epoch": 7.531687791861241, "ref_ce_loss": 0.02658427506685257, "step": 22580 }, { "epoch": 7.531687791861241, "loss": 0.21959280967712402, "step": 22580 }, { "ce_loss": 0.0054153925739228725, "epoch": 7.531687791861241, "step": 22580 }, { "distill_loss": 0.14491917192935944, "epoch": 7.531687791861241, "step": 22580 }, { "epoch": 7.531687791861241, "ref_ce_loss": 0.036969490349292755, "step": 22580 }, { "epoch": 7.535023348899266, "loss": 0.3174, "step": 22590 }, { "epoch": 7.535023348899266, "grad_norm": 3.328404188156128, "step": 22590 }, { "epoch": 7.535023348899266, "learning_rate": 2.649932993663012e-06, "step": 22590 }, { "epoch": 7.535023348899266, "loss": 0.42319804430007935, "step": 22590 }, { "ce_loss": 0.030848875641822815, "epoch": 7.535023348899266, "step": 22590 }, { "distill_loss": 0.3383166790008545, "epoch": 7.535023348899266, "step": 22590 }, { "epoch": 7.535023348899266, "ref_ce_loss": 0.041436757892370224, "step": 22590 }, { "epoch": 7.535023348899266, "loss": 0.5148337483406067, "step": 22590 }, { "ce_loss": 0.012426701374351978, "epoch": 7.535023348899266, "step": 22590 }, { "distill_loss": 0.17431752383708954, "epoch": 7.535023348899266, "step": 22590 }, { "epoch": 7.535023348899266, "ref_ce_loss": 0.10234980285167694, "step": 22590 }, { "epoch": 7.535023348899266, "loss": 0.24014095962047577, "step": 22590 }, { "ce_loss": 0.0075201112776994705, "epoch": 7.535023348899266, "step": 22590 }, { "distill_loss": 0.16021908819675446, "epoch": 7.535023348899266, "step": 22590 }, { "epoch": 7.535023348899266, "ref_ce_loss": 0.044868361204862595, "step": 22590 }, { "epoch": 7.535023348899266, "loss": 0.27141350507736206, "step": 22590 }, { "ce_loss": 0.042211223393678665, "epoch": 7.535023348899266, "step": 22590 }, { "distill_loss": 0.10830758512020111, "epoch": 7.535023348899266, "step": 22590 }, { "epoch": 7.535023348899266, "ref_ce_loss": 0.061452366411685944, "step": 22590 }, { "epoch": 7.538358905937292, "loss": 0.3597, "step": 22600 }, { "epoch": 7.538358905937292, "grad_norm": 2.722214460372925, "step": 22600 }, { "epoch": 7.538358905937292, "learning_rate": 2.6121605745510475e-06, "step": 22600 }, { "epoch": 7.538358905937292, "loss": 0.3262496888637543, "step": 22600 }, { "ce_loss": 0.034692954272031784, "epoch": 7.538358905937292, "step": 22600 }, { "distill_loss": 0.23076552152633667, "epoch": 7.538358905937292, "step": 22600 }, { "epoch": 7.538358905937292, "ref_ce_loss": 0.04774779826402664, "step": 22600 }, { "epoch": 7.538358905937292, "loss": 0.4575178325176239, "step": 22600 }, { "ce_loss": 0.05895520746707916, "epoch": 7.538358905937292, "step": 22600 }, { "distill_loss": 0.2657552659511566, "epoch": 7.538358905937292, "step": 22600 }, { "epoch": 7.538358905937292, "ref_ce_loss": 0.09668312966823578, "step": 22600 }, { "epoch": 7.538358905937292, "loss": 0.3082955479621887, "step": 22600 }, { "ce_loss": 0.013990761712193489, "epoch": 7.538358905937292, "step": 22600 }, { "distill_loss": 0.2193032056093216, "epoch": 7.538358905937292, "step": 22600 }, { "epoch": 7.538358905937292, "ref_ce_loss": 0.04674745351076126, "step": 22600 }, { "epoch": 7.538358905937292, "loss": 0.21660171449184418, "step": 22600 }, { "ce_loss": 0.020478934049606323, "epoch": 7.538358905937292, "step": 22600 }, { "distill_loss": 0.13979895412921906, "epoch": 7.538358905937292, "step": 22600 }, { "epoch": 7.538358905937292, "ref_ce_loss": 0.0560581237077713, "step": 22600 }, { "epoch": 7.541694462975316, "loss": 0.2894, "step": 22610 }, { "epoch": 7.541694462975316, "grad_norm": 1.874458909034729, "step": 22610 }, { "epoch": 7.541694462975316, "learning_rate": 2.574656932718433e-06, "step": 22610 }, { "epoch": 7.541694462975316, "loss": 0.32055795192718506, "step": 22610 }, { "ce_loss": 0.028949877247214317, "epoch": 7.541694462975316, "step": 22610 }, { "distill_loss": 0.1181693822145462, "epoch": 7.541694462975316, "step": 22610 }, { "epoch": 7.541694462975316, "ref_ce_loss": 0.048266347497701645, "step": 22610 }, { "epoch": 7.541694462975316, "loss": 0.26999005675315857, "step": 22610 }, { "ce_loss": 0.0427066795527935, "epoch": 7.541694462975316, "step": 22610 }, { "distill_loss": 0.13798220455646515, "epoch": 7.541694462975316, "step": 22610 }, { "epoch": 7.541694462975316, "ref_ce_loss": 0.041861649602651596, "step": 22610 }, { "epoch": 7.541694462975316, "loss": 0.37607231736183167, "step": 22610 }, { "ce_loss": 0.023039717227220535, "epoch": 7.541694462975316, "step": 22610 }, { "distill_loss": 0.21862857043743134, "epoch": 7.541694462975316, "step": 22610 }, { "epoch": 7.541694462975316, "ref_ce_loss": 0.05255983769893646, "step": 22610 }, { "epoch": 7.541694462975316, "loss": 0.3798355758190155, "step": 22610 }, { "ce_loss": 0.06908748298883438, "epoch": 7.541694462975316, "step": 22610 }, { "distill_loss": 0.27530989050865173, "epoch": 7.541694462975316, "step": 22610 }, { "epoch": 7.541694462975316, "ref_ce_loss": 0.03530760481953621, "step": 22610 }, { "epoch": 7.545030020013343, "loss": 0.3135, "step": 22620 }, { "epoch": 7.545030020013343, "grad_norm": 3.8495988845825195, "step": 22620 }, { "epoch": 7.545030020013343, "learning_rate": 2.5374221365570435e-06, "step": 22620 }, { "epoch": 7.545030020013343, "loss": 0.3512667715549469, "step": 22620 }, { "ce_loss": 0.04440957307815552, "epoch": 7.545030020013343, "step": 22620 }, { "distill_loss": 0.20252563059329987, "epoch": 7.545030020013343, "step": 22620 }, { "epoch": 7.545030020013343, "ref_ce_loss": 0.0638410672545433, "step": 22620 }, { "epoch": 7.545030020013343, "loss": 0.25497967004776, "step": 22620 }, { "ce_loss": 0.025934748351573944, "epoch": 7.545030020013343, "step": 22620 }, { "distill_loss": 0.118730828166008, "epoch": 7.545030020013343, "step": 22620 }, { "epoch": 7.545030020013343, "ref_ce_loss": 0.04510124400258064, "step": 22620 }, { "epoch": 7.545030020013343, "loss": 0.33090710639953613, "step": 22620 }, { "ce_loss": 0.055013976991176605, "epoch": 7.545030020013343, "step": 22620 }, { "distill_loss": 0.22313421964645386, "epoch": 7.545030020013343, "step": 22620 }, { "epoch": 7.545030020013343, "ref_ce_loss": 0.05255810543894768, "step": 22620 }, { "epoch": 7.545030020013343, "loss": 0.3884758949279785, "step": 22620 }, { "ce_loss": 0.015666665509343147, "epoch": 7.545030020013343, "step": 22620 }, { "distill_loss": 0.23140020668506622, "epoch": 7.545030020013343, "step": 22620 }, { "epoch": 7.545030020013343, "ref_ce_loss": 0.061135634779930115, "step": 22620 }, { "epoch": 7.548365577051367, "loss": 0.3352, "step": 22630 }, { "epoch": 7.548365577051367, "grad_norm": 3.073249101638794, "step": 22630 }, { "epoch": 7.548365577051367, "learning_rate": 2.50045625396843e-06, "step": 22630 }, { "epoch": 7.548365577051367, "loss": 0.2203093022108078, "step": 22630 }, { "ce_loss": 0.002241781447082758, "epoch": 7.548365577051367, "step": 22630 }, { "distill_loss": 0.17604470252990723, "epoch": 7.548365577051367, "step": 22630 }, { "epoch": 7.548365577051367, "ref_ce_loss": 0.04158555343747139, "step": 22630 }, { "epoch": 7.548365577051367, "loss": 0.17070531845092773, "step": 22630 }, { "ce_loss": 0.015439015813171864, "epoch": 7.548365577051367, "step": 22630 }, { "distill_loss": 0.10008960217237473, "epoch": 7.548365577051367, "step": 22630 }, { "epoch": 7.548365577051367, "ref_ce_loss": 0.03315176069736481, "step": 22630 }, { "epoch": 7.548365577051367, "loss": 0.21806734800338745, "step": 22630 }, { "ce_loss": 0.044264134019613266, "epoch": 7.548365577051367, "step": 22630 }, { "distill_loss": 0.12509466707706451, "epoch": 7.548365577051367, "step": 22630 }, { "epoch": 7.548365577051367, "ref_ce_loss": 0.01920865662395954, "step": 22630 }, { "epoch": 7.548365577051367, "loss": 0.3088948428630829, "step": 22630 }, { "ce_loss": 0.027766257524490356, "epoch": 7.548365577051367, "step": 22630 }, { "distill_loss": 0.22647210955619812, "epoch": 7.548365577051367, "step": 22630 }, { "epoch": 7.548365577051367, "ref_ce_loss": 0.040508657693862915, "step": 22630 }, { "epoch": 7.551701134089393, "loss": 0.306, "step": 22640 }, { "epoch": 7.551701134089393, "grad_norm": 3.8144543170928955, "step": 22640 }, { "epoch": 7.551701134089393, "learning_rate": 2.4637593523637866e-06, "step": 22640 }, { "epoch": 7.551701134089393, "loss": 0.23312872648239136, "step": 22640 }, { "ce_loss": 0.032705824822187424, "epoch": 7.551701134089393, "step": 22640 }, { "distill_loss": 0.12312594056129456, "epoch": 7.551701134089393, "step": 22640 }, { "epoch": 7.551701134089393, "ref_ce_loss": 0.05955107882618904, "step": 22640 }, { "epoch": 7.551701134089393, "loss": 0.5067400932312012, "step": 22640 }, { "ce_loss": 0.019309131428599358, "epoch": 7.551701134089393, "step": 22640 }, { "distill_loss": 0.3579646050930023, "epoch": 7.551701134089393, "step": 22640 }, { "epoch": 7.551701134089393, "ref_ce_loss": 0.0795922577381134, "step": 22640 }, { "epoch": 7.551701134089393, "loss": 0.2373778223991394, "step": 22640 }, { "ce_loss": 0.03473667427897453, "epoch": 7.551701134089393, "step": 22640 }, { "distill_loss": 0.13048987090587616, "epoch": 7.551701134089393, "step": 22640 }, { "epoch": 7.551701134089393, "ref_ce_loss": 0.05845621973276138, "step": 22640 }, { "epoch": 7.551701134089393, "loss": 0.21917320787906647, "step": 22640 }, { "ce_loss": 0.044745367020368576, "epoch": 7.551701134089393, "step": 22640 }, { "distill_loss": 0.13405704498291016, "epoch": 7.551701134089393, "step": 22640 }, { "epoch": 7.551701134089393, "ref_ce_loss": 0.031917721033096313, "step": 22640 }, { "epoch": 7.555036691127418, "loss": 0.3399, "step": 22650 }, { "epoch": 7.555036691127418, "grad_norm": 2.414365768432617, "step": 22650 }, { "epoch": 7.555036691127418, "learning_rate": 2.4273314986637813e-06, "step": 22650 }, { "epoch": 7.555036691127418, "loss": 0.22703129053115845, "step": 22650 }, { "ce_loss": 0.024940047413110733, "epoch": 7.555036691127418, "step": 22650 }, { "distill_loss": 0.13973286747932434, "epoch": 7.555036691127418, "step": 22650 }, { "epoch": 7.555036691127418, "ref_ce_loss": 0.02829953469336033, "step": 22650 }, { "epoch": 7.555036691127418, "loss": 0.21240630745887756, "step": 22650 }, { "ce_loss": 0.0018051156075671315, "epoch": 7.555036691127418, "step": 22650 }, { "distill_loss": 0.14401955902576447, "epoch": 7.555036691127418, "step": 22650 }, { "epoch": 7.555036691127418, "ref_ce_loss": 0.024986490607261658, "step": 22650 }, { "epoch": 7.555036691127418, "loss": 0.4377570152282715, "step": 22650 }, { "ce_loss": 0.03963511064648628, "epoch": 7.555036691127418, "step": 22650 }, { "distill_loss": 0.20817264914512634, "epoch": 7.555036691127418, "step": 22650 }, { "epoch": 7.555036691127418, "ref_ce_loss": 0.055689506232738495, "step": 22650 }, { "epoch": 7.555036691127418, "loss": 0.28146106004714966, "step": 22650 }, { "ce_loss": 0.03253614529967308, "epoch": 7.555036691127418, "step": 22650 }, { "distill_loss": 0.13565385341644287, "epoch": 7.555036691127418, "step": 22650 }, { "epoch": 7.555036691127418, "ref_ce_loss": 0.04473499208688736, "step": 22650 }, { "epoch": 7.558372248165444, "loss": 0.3169, "step": 22660 }, { "epoch": 7.558372248165444, "grad_norm": 2.6511383056640625, "step": 22660 }, { "epoch": 7.558372248165444, "learning_rate": 2.3911727592984597e-06, "step": 22660 }, { "epoch": 7.558372248165444, "loss": 0.38556116819381714, "step": 22660 }, { "ce_loss": 0.033701617270708084, "epoch": 7.558372248165444, "step": 22660 }, { "distill_loss": 0.2773969769477844, "epoch": 7.558372248165444, "step": 22660 }, { "epoch": 7.558372248165444, "ref_ce_loss": 0.04286494478583336, "step": 22660 }, { "epoch": 7.558372248165444, "loss": 0.2758292853832245, "step": 22660 }, { "ce_loss": 0.0252090934664011, "epoch": 7.558372248165444, "step": 22660 }, { "distill_loss": 0.12194667011499405, "epoch": 7.558372248165444, "step": 22660 }, { "epoch": 7.558372248165444, "ref_ce_loss": 0.04058091342449188, "step": 22660 }, { "epoch": 7.558372248165444, "loss": 0.4743751287460327, "step": 22660 }, { "ce_loss": 0.10802572220563889, "epoch": 7.558372248165444, "step": 22660 }, { "distill_loss": 0.2059323936700821, "epoch": 7.558372248165444, "step": 22660 }, { "epoch": 7.558372248165444, "ref_ce_loss": 0.04329385235905647, "step": 22660 }, { "epoch": 7.558372248165444, "loss": 0.26553261280059814, "step": 22660 }, { "ce_loss": 0.027559636160731316, "epoch": 7.558372248165444, "step": 22660 }, { "distill_loss": 0.11232174932956696, "epoch": 7.558372248165444, "step": 22660 }, { "epoch": 7.558372248165444, "ref_ce_loss": 0.03847008943557739, "step": 22660 }, { "epoch": 7.5617078052034685, "loss": 0.3085, "step": 22670 }, { "epoch": 7.5617078052034685, "grad_norm": 3.0139541625976562, "step": 22670 }, { "epoch": 7.5617078052034685, "learning_rate": 2.355283200207092e-06, "step": 22670 }, { "epoch": 7.5617078052034685, "loss": 0.4569222927093506, "step": 22670 }, { "ce_loss": 0.10298973321914673, "epoch": 7.5617078052034685, "step": 22670 }, { "distill_loss": 0.23029737174510956, "epoch": 7.5617078052034685, "step": 22670 }, { "epoch": 7.5617078052034685, "ref_ce_loss": 0.08486660569906235, "step": 22670 }, { "epoch": 7.5617078052034685, "loss": 0.2001618891954422, "step": 22670 }, { "ce_loss": 0.03799591213464737, "epoch": 7.5617078052034685, "step": 22670 }, { "distill_loss": 0.1290132999420166, "epoch": 7.5617078052034685, "step": 22670 }, { "epoch": 7.5617078052034685, "ref_ce_loss": 0.03295820206403732, "step": 22670 }, { "epoch": 7.5617078052034685, "loss": 0.23542585968971252, "step": 22670 }, { "ce_loss": 0.07174541801214218, "epoch": 7.5617078052034685, "step": 22670 }, { "distill_loss": 0.10788275301456451, "epoch": 7.5617078052034685, "step": 22670 }, { "epoch": 7.5617078052034685, "ref_ce_loss": 0.038297832012176514, "step": 22670 }, { "epoch": 7.5617078052034685, "loss": 0.269972026348114, "step": 22670 }, { "ce_loss": 0.02759367786347866, "epoch": 7.5617078052034685, "step": 22670 }, { "distill_loss": 0.16550669074058533, "epoch": 7.5617078052034685, "step": 22670 }, { "epoch": 7.5617078052034685, "ref_ce_loss": 0.04217568784952164, "step": 22670 }, { "epoch": 7.565043362241495, "loss": 0.3112, "step": 22680 }, { "epoch": 7.565043362241495, "grad_norm": 3.507418155670166, "step": 22680 }, { "epoch": 7.565043362241495, "learning_rate": 2.319662886838075e-06, "step": 22680 }, { "epoch": 7.565043362241495, "loss": 0.18114669620990753, "step": 22680 }, { "ce_loss": 0.031178679317235947, "epoch": 7.565043362241495, "step": 22680 }, { "distill_loss": 0.10116644203662872, "epoch": 7.565043362241495, "step": 22680 }, { "epoch": 7.565043362241495, "ref_ce_loss": 0.04861505329608917, "step": 22680 }, { "epoch": 7.565043362241495, "loss": 0.32580307126045227, "step": 22680 }, { "ce_loss": 0.04682064428925514, "epoch": 7.565043362241495, "step": 22680 }, { "distill_loss": 0.2161271870136261, "epoch": 7.565043362241495, "step": 22680 }, { "epoch": 7.565043362241495, "ref_ce_loss": 0.062445301562547684, "step": 22680 }, { "epoch": 7.565043362241495, "loss": 0.31231689453125, "step": 22680 }, { "ce_loss": 0.012379839085042477, "epoch": 7.565043362241495, "step": 22680 }, { "distill_loss": 0.19624949991703033, "epoch": 7.565043362241495, "step": 22680 }, { "epoch": 7.565043362241495, "ref_ce_loss": 0.054930321872234344, "step": 22680 }, { "epoch": 7.565043362241495, "loss": 0.43421971797943115, "step": 22680 }, { "ce_loss": 0.07423277199268341, "epoch": 7.565043362241495, "step": 22680 }, { "distill_loss": 0.15245267748832703, "epoch": 7.565043362241495, "step": 22680 }, { "epoch": 7.565043362241495, "ref_ce_loss": 0.06348980963230133, "step": 22680 }, { "epoch": 7.568378919279519, "loss": 0.3025, "step": 22690 }, { "epoch": 7.568378919279519, "grad_norm": 2.772374153137207, "step": 22690 }, { "epoch": 7.568378919279519, "learning_rate": 2.2843118841488315e-06, "step": 22690 }, { "epoch": 7.568378919279519, "loss": 0.3740849494934082, "step": 22690 }, { "ce_loss": 0.05600299686193466, "epoch": 7.568378919279519, "step": 22690 }, { "distill_loss": 0.1683557629585266, "epoch": 7.568378919279519, "step": 22690 }, { "epoch": 7.568378919279519, "ref_ce_loss": 0.051150351762771606, "step": 22690 }, { "epoch": 7.568378919279519, "loss": 0.172612264752388, "step": 22690 }, { "ce_loss": 0.013495842926204205, "epoch": 7.568378919279519, "step": 22690 }, { "distill_loss": 0.1024136021733284, "epoch": 7.568378919279519, "step": 22690 }, { "epoch": 7.568378919279519, "ref_ce_loss": 0.03655596449971199, "step": 22690 }, { "epoch": 7.568378919279519, "loss": 0.22008052468299866, "step": 22690 }, { "ce_loss": 0.029409054666757584, "epoch": 7.568378919279519, "step": 22690 }, { "distill_loss": 0.1552516669034958, "epoch": 7.568378919279519, "step": 22690 }, { "epoch": 7.568378919279519, "ref_ce_loss": 0.03526454046368599, "step": 22690 }, { "epoch": 7.568378919279519, "loss": 0.4758220314979553, "step": 22690 }, { "ce_loss": 0.04872387647628784, "epoch": 7.568378919279519, "step": 22690 }, { "distill_loss": 0.2343422919511795, "epoch": 7.568378919279519, "step": 22690 }, { "epoch": 7.568378919279519, "ref_ce_loss": 0.04983096569776535, "step": 22690 }, { "epoch": 7.5717144763175455, "loss": 0.3149, "step": 22700 }, { "epoch": 7.5717144763175455, "grad_norm": 4.173081874847412, "step": 22700 }, { "epoch": 7.5717144763175455, "learning_rate": 2.249230256605611e-06, "step": 22700 }, { "epoch": 7.5717144763175455, "loss": 0.2925843894481659, "step": 22700 }, { "ce_loss": 0.03678572177886963, "epoch": 7.5717144763175455, "step": 22700 }, { "distill_loss": 0.17235167324543, "epoch": 7.5717144763175455, "step": 22700 }, { "epoch": 7.5717144763175455, "ref_ce_loss": 0.06548989564180374, "step": 22700 }, { "epoch": 7.5717144763175455, "loss": 0.2966710925102234, "step": 22700 }, { "ce_loss": 0.0343879796564579, "epoch": 7.5717144763175455, "step": 22700 }, { "distill_loss": 0.20626069605350494, "epoch": 7.5717144763175455, "step": 22700 }, { "epoch": 7.5717144763175455, "ref_ce_loss": 0.03526905179023743, "step": 22700 }, { "epoch": 7.5717144763175455, "loss": 0.27978846430778503, "step": 22700 }, { "ce_loss": 0.05245629698038101, "epoch": 7.5717144763175455, "step": 22700 }, { "distill_loss": 0.18162769079208374, "epoch": 7.5717144763175455, "step": 22700 }, { "epoch": 7.5717144763175455, "ref_ce_loss": 0.04530799016356468, "step": 22700 }, { "epoch": 7.5717144763175455, "loss": 0.18752911686897278, "step": 22700 }, { "ce_loss": 0.008691946044564247, "epoch": 7.5717144763175455, "step": 22700 }, { "distill_loss": 0.10002614557743073, "epoch": 7.5717144763175455, "step": 22700 }, { "epoch": 7.5717144763175455, "ref_ce_loss": 0.03438608720898628, "step": 22700 }, { "epoch": 7.57505003335557, "loss": 0.3044, "step": 22710 }, { "epoch": 7.57505003335557, "grad_norm": 3.8426270484924316, "step": 22710 }, { "epoch": 7.57505003335557, "learning_rate": 2.214418068183471e-06, "step": 22710 }, { "epoch": 7.57505003335557, "loss": 0.29993754625320435, "step": 22710 }, { "ce_loss": 0.002770091639831662, "epoch": 7.57505003335557, "step": 22710 }, { "distill_loss": 0.2069040983915329, "epoch": 7.57505003335557, "step": 22710 }, { "epoch": 7.57505003335557, "ref_ce_loss": 0.0428432896733284, "step": 22710 }, { "epoch": 7.57505003335557, "loss": 0.27584877610206604, "step": 22710 }, { "ce_loss": 0.07741032540798187, "epoch": 7.57505003335557, "step": 22710 }, { "distill_loss": 0.14802144467830658, "epoch": 7.57505003335557, "step": 22710 }, { "epoch": 7.57505003335557, "ref_ce_loss": 0.038278382271528244, "step": 22710 }, { "epoch": 7.57505003335557, "loss": 0.1868162751197815, "step": 22710 }, { "ce_loss": 0.024370383471250534, "epoch": 7.57505003335557, "step": 22710 }, { "distill_loss": 0.10414793342351913, "epoch": 7.57505003335557, "step": 22710 }, { "epoch": 7.57505003335557, "ref_ce_loss": 0.03851045295596123, "step": 22710 }, { "epoch": 7.57505003335557, "loss": 0.24429672956466675, "step": 22710 }, { "ce_loss": 0.009652115404605865, "epoch": 7.57505003335557, "step": 22710 }, { "distill_loss": 0.10027230530977249, "epoch": 7.57505003335557, "step": 22710 }, { "epoch": 7.57505003335557, "ref_ce_loss": 0.04407824948430061, "step": 22710 }, { "epoch": 7.578385590393596, "loss": 0.3243, "step": 22720 }, { "epoch": 7.578385590393596, "grad_norm": 3.664708375930786, "step": 22720 }, { "epoch": 7.578385590393596, "learning_rate": 2.1798753823661308e-06, "step": 22720 }, { "epoch": 7.578385590393596, "loss": 0.3406020700931549, "step": 22720 }, { "ce_loss": 0.06698433309793472, "epoch": 7.578385590393596, "step": 22720 }, { "distill_loss": 0.1397649645805359, "epoch": 7.578385590393596, "step": 22720 }, { "epoch": 7.578385590393596, "ref_ce_loss": 0.04336464777588844, "step": 22720 }, { "epoch": 7.578385590393596, "loss": 0.7421815395355225, "step": 22720 }, { "ce_loss": 0.09118698537349701, "epoch": 7.578385590393596, "step": 22720 }, { "distill_loss": 0.29913344979286194, "epoch": 7.578385590393596, "step": 22720 }, { "epoch": 7.578385590393596, "ref_ce_loss": 0.09334400296211243, "step": 22720 }, { "epoch": 7.578385590393596, "loss": 0.2611655294895172, "step": 22720 }, { "ce_loss": 0.0070807188749313354, "epoch": 7.578385590393596, "step": 22720 }, { "distill_loss": 0.14511138200759888, "epoch": 7.578385590393596, "step": 22720 }, { "epoch": 7.578385590393596, "ref_ce_loss": 0.038073521107435226, "step": 22720 }, { "epoch": 7.578385590393596, "loss": 0.26241040229797363, "step": 22720 }, { "ce_loss": 0.04022922366857529, "epoch": 7.578385590393596, "step": 22720 }, { "distill_loss": 0.1806827187538147, "epoch": 7.578385590393596, "step": 22720 }, { "epoch": 7.578385590393596, "ref_ce_loss": 0.03285058215260506, "step": 22720 }, { "epoch": 7.581721147431621, "loss": 0.337, "step": 22730 }, { "epoch": 7.581721147431621, "grad_norm": 3.567667007446289, "step": 22730 }, { "epoch": 7.581721147431621, "learning_rate": 2.1456022621458347e-06, "step": 22730 }, { "epoch": 7.581721147431621, "loss": 0.3689475655555725, "step": 22730 }, { "ce_loss": 0.10052093863487244, "epoch": 7.581721147431621, "step": 22730 }, { "distill_loss": 0.17419344186782837, "epoch": 7.581721147431621, "step": 22730 }, { "epoch": 7.581721147431621, "ref_ce_loss": 0.038105469197034836, "step": 22730 }, { "epoch": 7.581721147431621, "loss": 0.3528992533683777, "step": 22730 }, { "ce_loss": 0.025013385340571404, "epoch": 7.581721147431621, "step": 22730 }, { "distill_loss": 0.25239109992980957, "epoch": 7.581721147431621, "step": 22730 }, { "epoch": 7.581721147431621, "ref_ce_loss": 0.039866313338279724, "step": 22730 }, { "epoch": 7.581721147431621, "loss": 0.3315299451351166, "step": 22730 }, { "ce_loss": 0.045371163636446, "epoch": 7.581721147431621, "step": 22730 }, { "distill_loss": 0.18333026766777039, "epoch": 7.581721147431621, "step": 22730 }, { "epoch": 7.581721147431621, "ref_ce_loss": 0.05455062538385391, "step": 22730 }, { "epoch": 7.581721147431621, "loss": 0.1749919056892395, "step": 22730 }, { "ce_loss": 0.004356476478278637, "epoch": 7.581721147431621, "step": 22730 }, { "distill_loss": 0.12413419783115387, "epoch": 7.581721147431621, "step": 22730 }, { "epoch": 7.581721147431621, "ref_ce_loss": 0.03222940117120743, "step": 22730 }, { "epoch": 7.585056704469647, "loss": 0.2909, "step": 22740 }, { "epoch": 7.585056704469647, "grad_norm": 3.138673782348633, "step": 22740 }, { "epoch": 7.585056704469647, "learning_rate": 2.1115987700231873e-06, "step": 22740 }, { "epoch": 7.585056704469647, "loss": 0.20833194255828857, "step": 22740 }, { "ce_loss": 0.04004382714629173, "epoch": 7.585056704469647, "step": 22740 }, { "distill_loss": 0.12045399099588394, "epoch": 7.585056704469647, "step": 22740 }, { "epoch": 7.585056704469647, "ref_ce_loss": 0.02083931490778923, "step": 22740 }, { "epoch": 7.585056704469647, "loss": 0.2998196482658386, "step": 22740 }, { "ce_loss": 0.04018447920680046, "epoch": 7.585056704469647, "step": 22740 }, { "distill_loss": 0.15190063416957855, "epoch": 7.585056704469647, "step": 22740 }, { "epoch": 7.585056704469647, "ref_ce_loss": 0.03249194100499153, "step": 22740 }, { "epoch": 7.585056704469647, "loss": 0.1679462492465973, "step": 22740 }, { "ce_loss": 0.007802645675837994, "epoch": 7.585056704469647, "step": 22740 }, { "distill_loss": 0.11544067412614822, "epoch": 7.585056704469647, "step": 22740 }, { "epoch": 7.585056704469647, "ref_ce_loss": 0.032458383589982986, "step": 22740 }, { "epoch": 7.585056704469647, "loss": 0.31334739923477173, "step": 22740 }, { "ce_loss": 0.0546376071870327, "epoch": 7.585056704469647, "step": 22740 }, { "distill_loss": 0.20922823250293732, "epoch": 7.585056704469647, "step": 22740 }, { "epoch": 7.585056704469647, "ref_ce_loss": 0.04924513027071953, "step": 22740 }, { "epoch": 7.588392261507671, "loss": 0.2875, "step": 22750 }, { "epoch": 7.588392261507671, "grad_norm": 2.8379061222076416, "step": 22750 }, { "epoch": 7.588392261507671, "learning_rate": 2.0778649680071867e-06, "step": 22750 }, { "epoch": 7.588392261507671, "loss": 0.5469861030578613, "step": 22750 }, { "ce_loss": 0.0687357634305954, "epoch": 7.588392261507671, "step": 22750 }, { "distill_loss": 0.3410850465297699, "epoch": 7.588392261507671, "step": 22750 }, { "epoch": 7.588392261507671, "ref_ce_loss": 0.08484728634357452, "step": 22750 }, { "epoch": 7.588392261507671, "loss": 0.5010547041893005, "step": 22750 }, { "ce_loss": 0.010975878685712814, "epoch": 7.588392261507671, "step": 22750 }, { "distill_loss": 0.329995721578598, "epoch": 7.588392261507671, "step": 22750 }, { "epoch": 7.588392261507671, "ref_ce_loss": 0.05636501684784889, "step": 22750 }, { "epoch": 7.588392261507671, "loss": 0.25946545600891113, "step": 22750 }, { "ce_loss": 0.02155529521405697, "epoch": 7.588392261507671, "step": 22750 }, { "distill_loss": 0.1841026395559311, "epoch": 7.588392261507671, "step": 22750 }, { "epoch": 7.588392261507671, "ref_ce_loss": 0.05362752079963684, "step": 22750 }, { "epoch": 7.588392261507671, "loss": 0.20447981357574463, "step": 22750 }, { "ce_loss": 0.015658462420105934, "epoch": 7.588392261507671, "step": 22750 }, { "distill_loss": 0.13859112560749054, "epoch": 7.588392261507671, "step": 22750 }, { "epoch": 7.588392261507671, "ref_ce_loss": 0.050109222531318665, "step": 22750 }, { "epoch": 7.591727818545698, "loss": 0.3008, "step": 22760 }, { "epoch": 7.591727818545698, "grad_norm": 2.1122565269470215, "step": 22760 }, { "epoch": 7.591727818545698, "learning_rate": 2.0444009176149414e-06, "step": 22760 }, { "epoch": 7.591727818545698, "loss": 0.2298000007867813, "step": 22760 }, { "ce_loss": 0.018491631373763084, "epoch": 7.591727818545698, "step": 22760 }, { "distill_loss": 0.12337653338909149, "epoch": 7.591727818545698, "step": 22760 }, { "epoch": 7.591727818545698, "ref_ce_loss": 0.03470870107412338, "step": 22760 }, { "epoch": 7.591727818545698, "loss": 0.30644336342811584, "step": 22760 }, { "ce_loss": 0.022262774407863617, "epoch": 7.591727818545698, "step": 22760 }, { "distill_loss": 0.21743980050086975, "epoch": 7.591727818545698, "step": 22760 }, { "epoch": 7.591727818545698, "ref_ce_loss": 0.027649324387311935, "step": 22760 }, { "epoch": 7.591727818545698, "loss": 0.15351411700248718, "step": 22760 }, { "ce_loss": 0.014782496728003025, "epoch": 7.591727818545698, "step": 22760 }, { "distill_loss": 0.09903530776500702, "epoch": 7.591727818545698, "step": 22760 }, { "epoch": 7.591727818545698, "ref_ce_loss": 0.03957769274711609, "step": 22760 }, { "epoch": 7.591727818545698, "loss": 0.3583006262779236, "step": 22760 }, { "ce_loss": 0.07922881096601486, "epoch": 7.591727818545698, "step": 22760 }, { "distill_loss": 0.15731580555438995, "epoch": 7.591727818545698, "step": 22760 }, { "epoch": 7.591727818545698, "ref_ce_loss": 0.05712786689400673, "step": 22760 }, { "epoch": 7.595063375583722, "loss": 0.3052, "step": 22770 }, { "epoch": 7.595063375583722, "grad_norm": 4.922539234161377, "step": 22770 }, { "epoch": 7.595063375583722, "learning_rate": 2.011206679871702e-06, "step": 22770 }, { "epoch": 7.595063375583722, "loss": 0.2932712733745575, "step": 22770 }, { "ce_loss": 0.03609336167573929, "epoch": 7.595063375583722, "step": 22770 }, { "distill_loss": 0.17725861072540283, "epoch": 7.595063375583722, "step": 22770 }, { "epoch": 7.595063375583722, "ref_ce_loss": 0.056249652057886124, "step": 22770 }, { "epoch": 7.595063375583722, "loss": 0.2576265335083008, "step": 22770 }, { "ce_loss": 0.030701031908392906, "epoch": 7.595063375583722, "step": 22770 }, { "distill_loss": 0.16068829596042633, "epoch": 7.595063375583722, "step": 22770 }, { "epoch": 7.595063375583722, "ref_ce_loss": 0.04590294510126114, "step": 22770 }, { "epoch": 7.595063375583722, "loss": 0.5291174054145813, "step": 22770 }, { "ce_loss": 0.012448878027498722, "epoch": 7.595063375583722, "step": 22770 }, { "distill_loss": 0.2315932810306549, "epoch": 7.595063375583722, "step": 22770 }, { "epoch": 7.595063375583722, "ref_ce_loss": 0.052382610738277435, "step": 22770 }, { "epoch": 7.595063375583722, "loss": 0.5043610334396362, "step": 22770 }, { "ce_loss": 0.06939585506916046, "epoch": 7.595063375583722, "step": 22770 }, { "distill_loss": 0.2098284661769867, "epoch": 7.595063375583722, "step": 22770 }, { "epoch": 7.595063375583722, "ref_ce_loss": 0.07914602756500244, "step": 22770 }, { "epoch": 7.598398932621748, "loss": 0.3557, "step": 22780 }, { "epoch": 7.598398932621748, "grad_norm": 5.053133487701416, "step": 22780 }, { "epoch": 7.598398932621748, "learning_rate": 1.9782823153106808e-06, "step": 22780 }, { "epoch": 7.598398932621748, "loss": 0.3108091354370117, "step": 22780 }, { "ce_loss": 0.008028519339859486, "epoch": 7.598398932621748, "step": 22780 }, { "distill_loss": 0.15519407391548157, "epoch": 7.598398932621748, "step": 22780 }, { "epoch": 7.598398932621748, "ref_ce_loss": 0.0441800020635128, "step": 22780 }, { "epoch": 7.598398932621748, "loss": 0.2354048192501068, "step": 22780 }, { "ce_loss": 0.04343513026833534, "epoch": 7.598398932621748, "step": 22780 }, { "distill_loss": 0.1344708949327469, "epoch": 7.598398932621748, "step": 22780 }, { "epoch": 7.598398932621748, "ref_ce_loss": 0.0410119891166687, "step": 22780 }, { "epoch": 7.598398932621748, "loss": 0.3621920049190521, "step": 22780 }, { "ce_loss": 0.08181270211935043, "epoch": 7.598398932621748, "step": 22780 }, { "distill_loss": 0.21230238676071167, "epoch": 7.598398932621748, "step": 22780 }, { "epoch": 7.598398932621748, "ref_ce_loss": 0.056477129459381104, "step": 22780 }, { "epoch": 7.598398932621748, "loss": 0.2663940489292145, "step": 22780 }, { "ce_loss": 0.036799490451812744, "epoch": 7.598398932621748, "step": 22780 }, { "distill_loss": 0.1391897201538086, "epoch": 7.598398932621748, "step": 22780 }, { "epoch": 7.598398932621748, "ref_ce_loss": 0.05978408828377724, "step": 22780 }, { "epoch": 7.601734489659773, "loss": 0.3431, "step": 22790 }, { "epoch": 7.601734489659773, "grad_norm": 5.396562576293945, "step": 22790 }, { "epoch": 7.601734489659773, "learning_rate": 1.9456278839729165e-06, "step": 22790 }, { "epoch": 7.601734489659773, "loss": 0.2972421944141388, "step": 22790 }, { "ce_loss": 0.00912957452237606, "epoch": 7.601734489659773, "step": 22790 }, { "distill_loss": 0.10416600853204727, "epoch": 7.601734489659773, "step": 22790 }, { "epoch": 7.601734489659773, "ref_ce_loss": 0.024486741051077843, "step": 22790 }, { "epoch": 7.601734489659773, "loss": 0.3115427494049072, "step": 22790 }, { "ce_loss": 0.09411104768514633, "epoch": 7.601734489659773, "step": 22790 }, { "distill_loss": 0.13358083367347717, "epoch": 7.601734489659773, "step": 22790 }, { "epoch": 7.601734489659773, "ref_ce_loss": 0.05374515801668167, "step": 22790 }, { "epoch": 7.601734489659773, "loss": 0.25783830881118774, "step": 22790 }, { "ce_loss": 0.0035260599106550217, "epoch": 7.601734489659773, "step": 22790 }, { "distill_loss": 0.16822956502437592, "epoch": 7.601734489659773, "step": 22790 }, { "epoch": 7.601734489659773, "ref_ce_loss": 0.05114159360527992, "step": 22790 }, { "epoch": 7.601734489659773, "loss": 0.17619512975215912, "step": 22790 }, { "ce_loss": 0.0018359140958637, "epoch": 7.601734489659773, "step": 22790 }, { "distill_loss": 0.1140362024307251, "epoch": 7.601734489659773, "step": 22790 }, { "epoch": 7.601734489659773, "ref_ce_loss": 0.021149538457393646, "step": 22790 }, { "epoch": 7.605070046697799, "loss": 0.302, "step": 22800 }, { "epoch": 7.605070046697799, "grad_norm": 2.289646863937378, "step": 22800 }, { "epoch": 7.605070046697799, "learning_rate": 1.913243445407192e-06, "step": 22800 }, { "epoch": 7.605070046697799, "loss": 0.4405611455440521, "step": 22800 }, { "ce_loss": 0.03355856612324715, "epoch": 7.605070046697799, "step": 22800 }, { "distill_loss": 0.11231415718793869, "epoch": 7.605070046697799, "step": 22800 }, { "epoch": 7.605070046697799, "ref_ce_loss": 0.041811395436525345, "step": 22800 }, { "epoch": 7.605070046697799, "loss": 0.30768993496894836, "step": 22800 }, { "ce_loss": 0.02752743847668171, "epoch": 7.605070046697799, "step": 22800 }, { "distill_loss": 0.15471151471138, "epoch": 7.605070046697799, "step": 22800 }, { "epoch": 7.605070046697799, "ref_ce_loss": 0.03334573283791542, "step": 22800 }, { "epoch": 7.605070046697799, "loss": 0.18778252601623535, "step": 22800 }, { "ce_loss": 0.005070790182799101, "epoch": 7.605070046697799, "step": 22800 }, { "distill_loss": 0.10848856717348099, "epoch": 7.605070046697799, "step": 22800 }, { "epoch": 7.605070046697799, "ref_ce_loss": 0.03893861174583435, "step": 22800 }, { "epoch": 7.605070046697799, "loss": 0.4052788019180298, "step": 22800 }, { "ce_loss": 0.05161317065358162, "epoch": 7.605070046697799, "step": 22800 }, { "distill_loss": 0.2581633925437927, "epoch": 7.605070046697799, "step": 22800 }, { "epoch": 7.605070046697799, "ref_ce_loss": 0.06759831309318542, "step": 22800 }, { "epoch": 7.608405603735823, "loss": 0.3476, "step": 22810 }, { "epoch": 7.608405603735823, "grad_norm": 3.0757687091827393, "step": 22810 }, { "epoch": 7.608405603735823, "learning_rate": 1.8811290586699834e-06, "step": 22810 }, { "epoch": 7.608405603735823, "loss": 0.26413461565971375, "step": 22810 }, { "ce_loss": 0.03497738763689995, "epoch": 7.608405603735823, "step": 22810 }, { "distill_loss": 0.1722661554813385, "epoch": 7.608405603735823, "step": 22810 }, { "epoch": 7.608405603735823, "ref_ce_loss": 0.05670714005827904, "step": 22810 }, { "epoch": 7.608405603735823, "loss": 0.3696792721748352, "step": 22810 }, { "ce_loss": 0.03719585761427879, "epoch": 7.608405603735823, "step": 22810 }, { "distill_loss": 0.19359298050403595, "epoch": 7.608405603735823, "step": 22810 }, { "epoch": 7.608405603735823, "ref_ce_loss": 0.05511196702718735, "step": 22810 }, { "epoch": 7.608405603735823, "loss": 0.2747287154197693, "step": 22810 }, { "ce_loss": 0.02445542812347412, "epoch": 7.608405603735823, "step": 22810 }, { "distill_loss": 0.1185799315571785, "epoch": 7.608405603735823, "step": 22810 }, { "epoch": 7.608405603735823, "ref_ce_loss": 0.042290497571229935, "step": 22810 }, { "epoch": 7.608405603735823, "loss": 0.5267252326011658, "step": 22810 }, { "ce_loss": 0.009008359163999557, "epoch": 7.608405603735823, "step": 22810 }, { "distill_loss": 0.27464696764945984, "epoch": 7.608405603735823, "step": 22810 }, { "epoch": 7.608405603735823, "ref_ce_loss": 0.1000165343284607, "step": 22810 }, { "epoch": 7.61174116077385, "loss": 0.3708, "step": 22820 }, { "epoch": 7.61174116077385, "grad_norm": 4.702095031738281, "step": 22820 }, { "epoch": 7.61174116077385, "learning_rate": 1.849284782325211e-06, "step": 22820 }, { "epoch": 7.61174116077385, "loss": 0.5826479196548462, "step": 22820 }, { "ce_loss": 0.055447544902563095, "epoch": 7.61174116077385, "step": 22820 }, { "distill_loss": 0.3116193115711212, "epoch": 7.61174116077385, "step": 22820 }, { "epoch": 7.61174116077385, "ref_ce_loss": 0.08707407116889954, "step": 22820 }, { "epoch": 7.61174116077385, "loss": 0.2422042340040207, "step": 22820 }, { "ce_loss": 0.004890437703579664, "epoch": 7.61174116077385, "step": 22820 }, { "distill_loss": 0.17459377646446228, "epoch": 7.61174116077385, "step": 22820 }, { "epoch": 7.61174116077385, "ref_ce_loss": 0.062421608716249466, "step": 22820 }, { "epoch": 7.61174116077385, "loss": 0.8745191097259521, "step": 22820 }, { "ce_loss": 0.024280430749058723, "epoch": 7.61174116077385, "step": 22820 }, { "distill_loss": 0.18831610679626465, "epoch": 7.61174116077385, "step": 22820 }, { "epoch": 7.61174116077385, "ref_ce_loss": 0.06424184143543243, "step": 22820 }, { "epoch": 7.61174116077385, "loss": 0.27096831798553467, "step": 22820 }, { "ce_loss": 0.02626364678144455, "epoch": 7.61174116077385, "step": 22820 }, { "distill_loss": 0.15062075853347778, "epoch": 7.61174116077385, "step": 22820 }, { "epoch": 7.61174116077385, "ref_ce_loss": 0.03873790428042412, "step": 22820 }, { "epoch": 7.615076717811874, "loss": 0.3557, "step": 22830 }, { "epoch": 7.615076717811874, "grad_norm": 2.347228765487671, "step": 22830 }, { "epoch": 7.615076717811874, "learning_rate": 1.8177106744443392e-06, "step": 22830 }, { "epoch": 7.615076717811874, "loss": 0.4428894519805908, "step": 22830 }, { "ce_loss": 0.011006324551999569, "epoch": 7.615076717811874, "step": 22830 }, { "distill_loss": 0.10822677612304688, "epoch": 7.615076717811874, "step": 22830 }, { "epoch": 7.615076717811874, "ref_ce_loss": 0.04102815315127373, "step": 22830 }, { "epoch": 7.615076717811874, "loss": 0.4443509578704834, "step": 22830 }, { "ce_loss": 0.012050008401274681, "epoch": 7.615076717811874, "step": 22830 }, { "distill_loss": 0.31021803617477417, "epoch": 7.615076717811874, "step": 22830 }, { "epoch": 7.615076717811874, "ref_ce_loss": 0.0780838206410408, "step": 22830 }, { "epoch": 7.615076717811874, "loss": 0.3268548846244812, "step": 22830 }, { "ce_loss": 0.0733487457036972, "epoch": 7.615076717811874, "step": 22830 }, { "distill_loss": 0.14904393255710602, "epoch": 7.615076717811874, "step": 22830 }, { "epoch": 7.615076717811874, "ref_ce_loss": 0.06276282668113708, "step": 22830 }, { "epoch": 7.615076717811874, "loss": 0.3149307370185852, "step": 22830 }, { "ce_loss": 0.012128917500376701, "epoch": 7.615076717811874, "step": 22830 }, { "distill_loss": 0.17375248670578003, "epoch": 7.615076717811874, "step": 22830 }, { "epoch": 7.615076717811874, "ref_ce_loss": 0.03347927704453468, "step": 22830 }, { "epoch": 7.6184122748499, "loss": 0.3563, "step": 22840 }, { "epoch": 7.6184122748499, "grad_norm": 7.628005504608154, "step": 22840 }, { "epoch": 7.6184122748499, "learning_rate": 1.7864067926060432e-06, "step": 22840 }, { "epoch": 7.6184122748499, "loss": 0.45842376351356506, "step": 22840 }, { "ce_loss": 0.07494988292455673, "epoch": 7.6184122748499, "step": 22840 }, { "distill_loss": 0.21095065772533417, "epoch": 7.6184122748499, "step": 22840 }, { "epoch": 7.6184122748499, "ref_ce_loss": 0.0487661212682724, "step": 22840 }, { "epoch": 7.6184122748499, "loss": 0.3503536581993103, "step": 22840 }, { "ce_loss": 0.029765352606773376, "epoch": 7.6184122748499, "step": 22840 }, { "distill_loss": 0.19903971254825592, "epoch": 7.6184122748499, "step": 22840 }, { "epoch": 7.6184122748499, "ref_ce_loss": 0.05890416353940964, "step": 22840 }, { "epoch": 7.6184122748499, "loss": 0.2923668324947357, "step": 22840 }, { "ce_loss": 0.0437467023730278, "epoch": 7.6184122748499, "step": 22840 }, { "distill_loss": 0.1662764549255371, "epoch": 7.6184122748499, "step": 22840 }, { "epoch": 7.6184122748499, "ref_ce_loss": 0.08174384385347366, "step": 22840 }, { "epoch": 7.6184122748499, "loss": 0.21276073157787323, "step": 22840 }, { "ce_loss": 0.014563250355422497, "epoch": 7.6184122748499, "step": 22840 }, { "distill_loss": 0.12940552830696106, "epoch": 7.6184122748499, "step": 22840 }, { "epoch": 7.6184122748499, "ref_ce_loss": 0.04358154535293579, "step": 22840 }, { "epoch": 7.621747831887925, "loss": 0.32, "step": 22850 }, { "epoch": 7.621747831887925, "grad_norm": 3.632291316986084, "step": 22850 }, { "epoch": 7.621747831887925, "learning_rate": 1.7553731938962756e-06, "step": 22850 }, { "epoch": 7.621747831887925, "loss": 0.37080612778663635, "step": 22850 }, { "ce_loss": 0.03212735056877136, "epoch": 7.621747831887925, "step": 22850 }, { "distill_loss": 0.2639005780220032, "epoch": 7.621747831887925, "step": 22850 }, { "epoch": 7.621747831887925, "ref_ce_loss": 0.050537895411252975, "step": 22850 }, { "epoch": 7.621747831887925, "loss": 0.30988186597824097, "step": 22850 }, { "ce_loss": 0.07211822271347046, "epoch": 7.621747831887925, "step": 22850 }, { "distill_loss": 0.15322273969650269, "epoch": 7.621747831887925, "step": 22850 }, { "epoch": 7.621747831887925, "ref_ce_loss": 0.05520307272672653, "step": 22850 }, { "epoch": 7.621747831887925, "loss": 0.18619802594184875, "step": 22850 }, { "ce_loss": 0.0006898845313116908, "epoch": 7.621747831887925, "step": 22850 }, { "distill_loss": 0.10551872849464417, "epoch": 7.621747831887925, "step": 22850 }, { "epoch": 7.621747831887925, "ref_ce_loss": 0.02266588620841503, "step": 22850 }, { "epoch": 7.621747831887925, "loss": 0.16827353835105896, "step": 22850 }, { "ce_loss": 0.029503915458917618, "epoch": 7.621747831887925, "step": 22850 }, { "distill_loss": 0.09243550896644592, "epoch": 7.621747831887925, "step": 22850 }, { "epoch": 7.621747831887925, "ref_ce_loss": 0.03407926857471466, "step": 22850 }, { "epoch": 7.625083388925951, "loss": 0.2909, "step": 22860 }, { "epoch": 7.625083388925951, "grad_norm": 2.7291767597198486, "step": 22860 }, { "epoch": 7.625083388925951, "learning_rate": 1.7246099349080665e-06, "step": 22860 }, { "epoch": 7.625083388925951, "loss": 0.213385671377182, "step": 22860 }, { "ce_loss": 0.028410280123353004, "epoch": 7.625083388925951, "step": 22860 }, { "distill_loss": 0.12494742125272751, "epoch": 7.625083388925951, "step": 22860 }, { "epoch": 7.625083388925951, "ref_ce_loss": 0.04404313489794731, "step": 22860 }, { "epoch": 7.625083388925951, "loss": 0.28631460666656494, "step": 22860 }, { "ce_loss": 0.014288785867393017, "epoch": 7.625083388925951, "step": 22860 }, { "distill_loss": 0.17968155443668365, "epoch": 7.625083388925951, "step": 22860 }, { "epoch": 7.625083388925951, "ref_ce_loss": 0.0649716928601265, "step": 22860 }, { "epoch": 7.625083388925951, "loss": 0.2869929075241089, "step": 22860 }, { "ce_loss": 0.041470956057310104, "epoch": 7.625083388925951, "step": 22860 }, { "distill_loss": 0.19409871101379395, "epoch": 7.625083388925951, "step": 22860 }, { "epoch": 7.625083388925951, "ref_ce_loss": 0.037282612174749374, "step": 22860 }, { "epoch": 7.625083388925951, "loss": 0.23375526070594788, "step": 22860 }, { "ce_loss": 0.048656996339559555, "epoch": 7.625083388925951, "step": 22860 }, { "distill_loss": 0.115402951836586, "epoch": 7.625083388925951, "step": 22860 }, { "epoch": 7.625083388925951, "ref_ce_loss": 0.047733623534440994, "step": 22860 }, { "epoch": 7.6284189459639755, "loss": 0.3182, "step": 22870 }, { "epoch": 7.6284189459639755, "grad_norm": 1.8855068683624268, "step": 22870 }, { "epoch": 7.6284189459639755, "learning_rate": 1.6941170717414577e-06, "step": 22870 }, { "epoch": 7.6284189459639755, "loss": 0.23261196911334991, "step": 22870 }, { "ce_loss": 0.02123933471739292, "epoch": 7.6284189459639755, "step": 22870 }, { "distill_loss": 0.10758798569440842, "epoch": 7.6284189459639755, "step": 22870 }, { "epoch": 7.6284189459639755, "ref_ce_loss": 0.049759477376937866, "step": 22870 }, { "epoch": 7.6284189459639755, "loss": 0.2860763370990753, "step": 22870 }, { "ce_loss": 0.06312866508960724, "epoch": 7.6284189459639755, "step": 22870 }, { "distill_loss": 0.11870720982551575, "epoch": 7.6284189459639755, "step": 22870 }, { "epoch": 7.6284189459639755, "ref_ce_loss": 0.041579004377126694, "step": 22870 }, { "epoch": 7.6284189459639755, "loss": 0.5179301500320435, "step": 22870 }, { "ce_loss": 0.03702806681394577, "epoch": 7.6284189459639755, "step": 22870 }, { "distill_loss": 0.2259015291929245, "epoch": 7.6284189459639755, "step": 22870 }, { "epoch": 7.6284189459639755, "ref_ce_loss": 0.0677097737789154, "step": 22870 }, { "epoch": 7.6284189459639755, "loss": 0.20624791085720062, "step": 22870 }, { "ce_loss": 0.01307401992380619, "epoch": 7.6284189459639755, "step": 22870 }, { "distill_loss": 0.13857774436473846, "epoch": 7.6284189459639755, "step": 22870 }, { "epoch": 7.6284189459639755, "ref_ce_loss": 0.054351504892110825, "step": 22870 }, { "epoch": 7.631754503002002, "loss": 0.3066, "step": 22880 }, { "epoch": 7.631754503002002, "grad_norm": 3.6401045322418213, "step": 22880 }, { "epoch": 7.631754503002002, "learning_rate": 1.6638946600034175e-06, "step": 22880 }, { "epoch": 7.631754503002002, "loss": 0.22979536652565002, "step": 22880 }, { "ce_loss": 0.02197353169322014, "epoch": 7.631754503002002, "step": 22880 }, { "distill_loss": 0.1502780318260193, "epoch": 7.631754503002002, "step": 22880 }, { "epoch": 7.631754503002002, "ref_ce_loss": 0.05745101720094681, "step": 22880 }, { "epoch": 7.631754503002002, "loss": 0.4088671803474426, "step": 22880 }, { "ce_loss": 0.06212860345840454, "epoch": 7.631754503002002, "step": 22880 }, { "distill_loss": 0.12440603971481323, "epoch": 7.631754503002002, "step": 22880 }, { "epoch": 7.631754503002002, "ref_ce_loss": 0.06890290975570679, "step": 22880 }, { "epoch": 7.631754503002002, "loss": 0.3185475170612335, "step": 22880 }, { "ce_loss": 0.045057594776153564, "epoch": 7.631754503002002, "step": 22880 }, { "distill_loss": 0.2054925560951233, "epoch": 7.631754503002002, "step": 22880 }, { "epoch": 7.631754503002002, "ref_ce_loss": 0.024748776108026505, "step": 22880 }, { "epoch": 7.631754503002002, "loss": 0.27203235030174255, "step": 22880 }, { "ce_loss": 0.015324220061302185, "epoch": 7.631754503002002, "step": 22880 }, { "distill_loss": 0.14852091670036316, "epoch": 7.631754503002002, "step": 22880 }, { "epoch": 7.631754503002002, "ref_ce_loss": 0.03557129576802254, "step": 22880 }, { "epoch": 7.635090060040026, "loss": 0.3026, "step": 22890 }, { "epoch": 7.635090060040026, "grad_norm": 4.086019515991211, "step": 22890 }, { "epoch": 7.635090060040026, "learning_rate": 1.6339427548076934e-06, "step": 22890 }, { "epoch": 7.635090060040026, "loss": 0.20336809754371643, "step": 22890 }, { "ce_loss": 0.015538395382463932, "epoch": 7.635090060040026, "step": 22890 }, { "distill_loss": 0.11135879158973694, "epoch": 7.635090060040026, "step": 22890 }, { "epoch": 7.635090060040026, "ref_ce_loss": 0.04804733768105507, "step": 22890 }, { "epoch": 7.635090060040026, "loss": 0.24145036935806274, "step": 22890 }, { "ce_loss": 0.033301178365945816, "epoch": 7.635090060040026, "step": 22890 }, { "distill_loss": 0.12386928498744965, "epoch": 7.635090060040026, "step": 22890 }, { "epoch": 7.635090060040026, "ref_ce_loss": 0.04221475124359131, "step": 22890 }, { "epoch": 7.635090060040026, "loss": 0.197758749127388, "step": 22890 }, { "ce_loss": 0.0030058466363698244, "epoch": 7.635090060040026, "step": 22890 }, { "distill_loss": 0.11973407119512558, "epoch": 7.635090060040026, "step": 22890 }, { "epoch": 7.635090060040026, "ref_ce_loss": 0.05511556565761566, "step": 22890 }, { "epoch": 7.635090060040026, "loss": 0.24757030606269836, "step": 22890 }, { "ce_loss": 0.04953973740339279, "epoch": 7.635090060040026, "step": 22890 }, { "distill_loss": 0.13940845429897308, "epoch": 7.635090060040026, "step": 22890 }, { "epoch": 7.635090060040026, "ref_ce_loss": 0.037517912685871124, "step": 22890 }, { "epoch": 7.6384256170780525, "loss": 0.3258, "step": 22900 }, { "epoch": 7.6384256170780525, "grad_norm": 5.1843180656433105, "step": 22900 }, { "epoch": 7.6384256170780525, "learning_rate": 1.6042614107747597e-06, "step": 22900 }, { "epoch": 7.6384256170780525, "loss": 0.2708713114261627, "step": 22900 }, { "ce_loss": 0.026853064075112343, "epoch": 7.6384256170780525, "step": 22900 }, { "distill_loss": 0.1342555582523346, "epoch": 7.6384256170780525, "step": 22900 }, { "epoch": 7.6384256170780525, "ref_ce_loss": 0.04602363333106041, "step": 22900 }, { "epoch": 7.6384256170780525, "loss": 0.2061038613319397, "step": 22900 }, { "ce_loss": 0.021708890795707703, "epoch": 7.6384256170780525, "step": 22900 }, { "distill_loss": 0.10948806256055832, "epoch": 7.6384256170780525, "step": 22900 }, { "epoch": 7.6384256170780525, "ref_ce_loss": 0.0385892391204834, "step": 22900 }, { "epoch": 7.6384256170780525, "loss": 0.20218154788017273, "step": 22900 }, { "ce_loss": 0.03984569385647774, "epoch": 7.6384256170780525, "step": 22900 }, { "distill_loss": 0.1096130758523941, "epoch": 7.6384256170780525, "step": 22900 }, { "epoch": 7.6384256170780525, "ref_ce_loss": 0.03738179802894592, "step": 22900 }, { "epoch": 7.6384256170780525, "loss": 0.4746226966381073, "step": 22900 }, { "ce_loss": 0.04614961892366409, "epoch": 7.6384256170780525, "step": 22900 }, { "distill_loss": 0.25767990946769714, "epoch": 7.6384256170780525, "step": 22900 }, { "epoch": 7.6384256170780525, "ref_ce_loss": 0.059538114815950394, "step": 22900 }, { "epoch": 7.641761174116077, "loss": 0.3217, "step": 22910 }, { "epoch": 7.641761174116077, "grad_norm": 3.924365997314453, "step": 22910 }, { "epoch": 7.641761174116077, "learning_rate": 1.5748506820316697e-06, "step": 22910 }, { "epoch": 7.641761174116077, "loss": 0.5141680836677551, "step": 22910 }, { "ce_loss": 0.08010391145944595, "epoch": 7.641761174116077, "step": 22910 }, { "distill_loss": 0.21418240666389465, "epoch": 7.641761174116077, "step": 22910 }, { "epoch": 7.641761174116077, "ref_ce_loss": 0.05140373855829239, "step": 22910 }, { "epoch": 7.641761174116077, "loss": 0.2225308120250702, "step": 22910 }, { "ce_loss": 0.02417745813727379, "epoch": 7.641761174116077, "step": 22910 }, { "distill_loss": 0.11761415004730225, "epoch": 7.641761174116077, "step": 22910 }, { "epoch": 7.641761174116077, "ref_ce_loss": 0.05385226756334305, "step": 22910 }, { "epoch": 7.641761174116077, "loss": 0.2561490833759308, "step": 22910 }, { "ce_loss": 0.037857986986637115, "epoch": 7.641761174116077, "step": 22910 }, { "distill_loss": 0.1793629229068756, "epoch": 7.641761174116077, "step": 22910 }, { "epoch": 7.641761174116077, "ref_ce_loss": 0.03873812034726143, "step": 22910 }, { "epoch": 7.641761174116077, "loss": 0.2256598174571991, "step": 22910 }, { "ce_loss": 0.03303292766213417, "epoch": 7.641761174116077, "step": 22910 }, { "distill_loss": 0.11392819881439209, "epoch": 7.641761174116077, "step": 22910 }, { "epoch": 7.641761174116077, "ref_ce_loss": 0.05216430127620697, "step": 22910 }, { "epoch": 7.645096731154103, "loss": 0.3363, "step": 22920 }, { "epoch": 7.645096731154103, "grad_norm": 5.04368257522583, "step": 22920 }, { "epoch": 7.645096731154103, "learning_rate": 1.5457106222120042e-06, "step": 22920 }, { "epoch": 7.645096731154103, "loss": 0.45411041378974915, "step": 22920 }, { "ce_loss": 0.033568039536476135, "epoch": 7.645096731154103, "step": 22920 }, { "distill_loss": 0.2195194661617279, "epoch": 7.645096731154103, "step": 22920 }, { "epoch": 7.645096731154103, "ref_ce_loss": 0.038000307977199554, "step": 22920 }, { "epoch": 7.645096731154103, "loss": 0.30707457661628723, "step": 22920 }, { "ce_loss": 0.027316883206367493, "epoch": 7.645096731154103, "step": 22920 }, { "distill_loss": 0.2306128442287445, "epoch": 7.645096731154103, "step": 22920 }, { "epoch": 7.645096731154103, "ref_ce_loss": 0.04899785295128822, "step": 22920 }, { "epoch": 7.645096731154103, "loss": 0.26368802785873413, "step": 22920 }, { "ce_loss": 0.04824545979499817, "epoch": 7.645096731154103, "step": 22920 }, { "distill_loss": 0.1530880630016327, "epoch": 7.645096731154103, "step": 22920 }, { "epoch": 7.645096731154103, "ref_ce_loss": 0.050180789083242416, "step": 22920 }, { "epoch": 7.645096731154103, "loss": 0.25279903411865234, "step": 22920 }, { "ce_loss": 0.030409902334213257, "epoch": 7.645096731154103, "step": 22920 }, { "distill_loss": 0.13697512447834015, "epoch": 7.645096731154103, "step": 22920 }, { "epoch": 7.645096731154103, "ref_ce_loss": 0.05594718083739281, "step": 22920 }, { "epoch": 7.648432288192128, "loss": 0.3521, "step": 22930 }, { "epoch": 7.648432288192128, "grad_norm": 2.7559688091278076, "step": 22930 }, { "epoch": 7.648432288192128, "learning_rate": 1.5168412844557055e-06, "step": 22930 }, { "epoch": 7.648432288192128, "loss": 0.399747759103775, "step": 22930 }, { "ce_loss": 0.016245001927018166, "epoch": 7.648432288192128, "step": 22930 }, { "distill_loss": 0.12318704277276993, "epoch": 7.648432288192128, "step": 22930 }, { "epoch": 7.648432288192128, "ref_ce_loss": 0.08122338354587555, "step": 22930 }, { "epoch": 7.648432288192128, "loss": 0.22639451920986176, "step": 22930 }, { "ce_loss": 0.0014325794763863087, "epoch": 7.648432288192128, "step": 22930 }, { "distill_loss": 0.12101196497678757, "epoch": 7.648432288192128, "step": 22930 }, { "epoch": 7.648432288192128, "ref_ce_loss": 0.059808388352394104, "step": 22930 }, { "epoch": 7.648432288192128, "loss": 0.34030061960220337, "step": 22930 }, { "ce_loss": 0.04827611520886421, "epoch": 7.648432288192128, "step": 22930 }, { "distill_loss": 0.09970467537641525, "epoch": 7.648432288192128, "step": 22930 }, { "epoch": 7.648432288192128, "ref_ce_loss": 0.030907966196537018, "step": 22930 }, { "epoch": 7.648432288192128, "loss": 0.1849261373281479, "step": 22930 }, { "ce_loss": 0.018460562452673912, "epoch": 7.648432288192128, "step": 22930 }, { "distill_loss": 0.10423801094293594, "epoch": 7.648432288192128, "step": 22930 }, { "epoch": 7.648432288192128, "ref_ce_loss": 0.03969254717230797, "step": 22930 }, { "epoch": 7.651767845230154, "loss": 0.2977, "step": 22940 }, { "epoch": 7.651767845230154, "grad_norm": 3.0878891944885254, "step": 22940 }, { "epoch": 7.651767845230154, "learning_rate": 1.4882427214090776e-06, "step": 22940 }, { "epoch": 7.651767845230154, "loss": 0.4689764380455017, "step": 22940 }, { "ce_loss": 0.006221160292625427, "epoch": 7.651767845230154, "step": 22940 }, { "distill_loss": 0.3133019208908081, "epoch": 7.651767845230154, "step": 22940 }, { "epoch": 7.651767845230154, "ref_ce_loss": 0.06789840757846832, "step": 22940 }, { "epoch": 7.651767845230154, "loss": 0.18552368879318237, "step": 22940 }, { "ce_loss": 0.015950776636600494, "epoch": 7.651767845230154, "step": 22940 }, { "distill_loss": 0.11873391270637512, "epoch": 7.651767845230154, "step": 22940 }, { "epoch": 7.651767845230154, "ref_ce_loss": 0.05071733891963959, "step": 22940 }, { "epoch": 7.651767845230154, "loss": 0.31022709608078003, "step": 22940 }, { "ce_loss": 0.04275962710380554, "epoch": 7.651767845230154, "step": 22940 }, { "distill_loss": 0.1239824965596199, "epoch": 7.651767845230154, "step": 22940 }, { "epoch": 7.651767845230154, "ref_ce_loss": 0.029431330040097237, "step": 22940 }, { "epoch": 7.651767845230154, "loss": 0.151072695851326, "step": 22940 }, { "ce_loss": 0.027431553229689598, "epoch": 7.651767845230154, "step": 22940 }, { "distill_loss": 0.08233068883419037, "epoch": 7.651767845230154, "step": 22940 }, { "epoch": 7.651767845230154, "ref_ce_loss": 0.030642185360193253, "step": 22940 }, { "epoch": 7.655103402268178, "loss": 0.2989, "step": 22950 }, { "epoch": 7.655103402268178, "grad_norm": 4.026222229003906, "step": 22950 }, { "epoch": 7.655103402268178, "learning_rate": 1.4599149852246361e-06, "step": 22950 }, { "epoch": 7.655103402268178, "loss": 0.5482908487319946, "step": 22950 }, { "ce_loss": 0.012102135457098484, "epoch": 7.655103402268178, "step": 22950 }, { "distill_loss": 0.13015960156917572, "epoch": 7.655103402268178, "step": 22950 }, { "epoch": 7.655103402268178, "ref_ce_loss": 0.050248291343450546, "step": 22950 }, { "epoch": 7.655103402268178, "loss": 0.3930932581424713, "step": 22950 }, { "ce_loss": 0.032274968922138214, "epoch": 7.655103402268178, "step": 22950 }, { "distill_loss": 0.19311977922916412, "epoch": 7.655103402268178, "step": 22950 }, { "epoch": 7.655103402268178, "ref_ce_loss": 0.06739521026611328, "step": 22950 }, { "epoch": 7.655103402268178, "loss": 0.5229855179786682, "step": 22950 }, { "ce_loss": 0.14910782873630524, "epoch": 7.655103402268178, "step": 22950 }, { "distill_loss": 0.2856809198856354, "epoch": 7.655103402268178, "step": 22950 }, { "epoch": 7.655103402268178, "ref_ce_loss": 0.08798830956220627, "step": 22950 }, { "epoch": 7.655103402268178, "loss": 0.4588366448879242, "step": 22950 }, { "ce_loss": 0.04331749305129051, "epoch": 7.655103402268178, "step": 22950 }, { "distill_loss": 0.3251328468322754, "epoch": 7.655103402268178, "step": 22950 }, { "epoch": 7.655103402268178, "ref_ce_loss": 0.050014346837997437, "step": 22950 }, { "epoch": 7.6584389593062046, "loss": 0.3257, "step": 22960 }, { "epoch": 7.6584389593062046, "grad_norm": 2.500479221343994, "step": 22960 }, { "epoch": 7.6584389593062046, "learning_rate": 1.4318581275609754e-06, "step": 22960 }, { "epoch": 7.6584389593062046, "loss": 0.27828431129455566, "step": 22960 }, { "ce_loss": 0.019125934690237045, "epoch": 7.6584389593062046, "step": 22960 }, { "distill_loss": 0.16130469739437103, "epoch": 7.6584389593062046, "step": 22960 }, { "epoch": 7.6584389593062046, "ref_ce_loss": 0.0303462203592062, "step": 22960 }, { "epoch": 7.6584389593062046, "loss": 0.27863505482673645, "step": 22960 }, { "ce_loss": 0.02259085513651371, "epoch": 7.6584389593062046, "step": 22960 }, { "distill_loss": 0.17387576401233673, "epoch": 7.6584389593062046, "step": 22960 }, { "epoch": 7.6584389593062046, "ref_ce_loss": 0.05217726156115532, "step": 22960 }, { "epoch": 7.6584389593062046, "loss": 0.21874089539051056, "step": 22960 }, { "ce_loss": 0.014435866847634315, "epoch": 7.6584389593062046, "step": 22960 }, { "distill_loss": 0.13992641866207123, "epoch": 7.6584389593062046, "step": 22960 }, { "epoch": 7.6584389593062046, "ref_ce_loss": 0.03417731821537018, "step": 22960 }, { "epoch": 7.6584389593062046, "loss": 0.30610159039497375, "step": 22960 }, { "ce_loss": 0.029911501333117485, "epoch": 7.6584389593062046, "step": 22960 }, { "distill_loss": 0.13687187433242798, "epoch": 7.6584389593062046, "step": 22960 }, { "epoch": 7.6584389593062046, "ref_ce_loss": 0.07079162448644638, "step": 22960 }, { "epoch": 7.661774516344229, "loss": 0.3206, "step": 22970 }, { "epoch": 7.661774516344229, "grad_norm": 3.2441189289093018, "step": 22970 }, { "epoch": 7.661774516344229, "learning_rate": 1.4040721995827342e-06, "step": 22970 }, { "epoch": 7.661774516344229, "loss": 0.30744755268096924, "step": 22970 }, { "ce_loss": 0.01652863807976246, "epoch": 7.661774516344229, "step": 22970 }, { "distill_loss": 0.09780027717351913, "epoch": 7.661774516344229, "step": 22970 }, { "epoch": 7.661774516344229, "ref_ce_loss": 0.04824933409690857, "step": 22970 }, { "epoch": 7.661774516344229, "loss": 0.2842020094394684, "step": 22970 }, { "ce_loss": 0.026345144957304, "epoch": 7.661774516344229, "step": 22970 }, { "distill_loss": 0.20223809778690338, "epoch": 7.661774516344229, "step": 22970 }, { "epoch": 7.661774516344229, "ref_ce_loss": 0.05555181950330734, "step": 22970 }, { "epoch": 7.661774516344229, "loss": 0.49395424127578735, "step": 22970 }, { "ce_loss": 0.0228904839605093, "epoch": 7.661774516344229, "step": 22970 }, { "distill_loss": 0.2043241262435913, "epoch": 7.661774516344229, "step": 22970 }, { "epoch": 7.661774516344229, "ref_ce_loss": 0.04845643788576126, "step": 22970 }, { "epoch": 7.661774516344229, "loss": 0.2669130265712738, "step": 22970 }, { "ce_loss": 0.05929362401366234, "epoch": 7.661774516344229, "step": 22970 }, { "distill_loss": 0.1629556119441986, "epoch": 7.661774516344229, "step": 22970 }, { "epoch": 7.661774516344229, "ref_ce_loss": 0.04456076771020889, "step": 22970 }, { "epoch": 7.665110073382255, "loss": 0.3014, "step": 22980 }, { "epoch": 7.665110073382255, "grad_norm": 3.482917070388794, "step": 22980 }, { "epoch": 7.665110073382255, "learning_rate": 1.3765572519604806e-06, "step": 22980 }, { "epoch": 7.665110073382255, "loss": 0.3842131793498993, "step": 22980 }, { "ce_loss": 0.02830090932548046, "epoch": 7.665110073382255, "step": 22980 }, { "distill_loss": 0.20186612010002136, "epoch": 7.665110073382255, "step": 22980 }, { "epoch": 7.665110073382255, "ref_ce_loss": 0.037983302026987076, "step": 22980 }, { "epoch": 7.665110073382255, "loss": 0.34967049956321716, "step": 22980 }, { "ce_loss": 0.042643673717975616, "epoch": 7.665110073382255, "step": 22980 }, { "distill_loss": 0.1744934767484665, "epoch": 7.665110073382255, "step": 22980 }, { "epoch": 7.665110073382255, "ref_ce_loss": 0.05821855738759041, "step": 22980 }, { "epoch": 7.665110073382255, "loss": 0.19610543549060822, "step": 22980 }, { "ce_loss": 0.029282161965966225, "epoch": 7.665110073382255, "step": 22980 }, { "distill_loss": 0.12504170835018158, "epoch": 7.665110073382255, "step": 22980 }, { "epoch": 7.665110073382255, "ref_ce_loss": 0.04139018431305885, "step": 22980 }, { "epoch": 7.665110073382255, "loss": 0.345589816570282, "step": 22980 }, { "ce_loss": 0.03254815936088562, "epoch": 7.665110073382255, "step": 22980 }, { "distill_loss": 0.2645869851112366, "epoch": 7.665110073382255, "step": 22980 }, { "epoch": 7.665110073382255, "ref_ce_loss": 0.04817293584346771, "step": 22980 }, { "epoch": 7.66844563042028, "loss": 0.3194, "step": 22990 }, { "epoch": 7.66844563042028, "grad_norm": 2.9004547595977783, "step": 22990 }, { "epoch": 7.66844563042028, "learning_rate": 1.3493133348706442e-06, "step": 22990 }, { "epoch": 7.66844563042028, "loss": 0.2697019875049591, "step": 22990 }, { "ce_loss": 0.05502264201641083, "epoch": 7.66844563042028, "step": 22990 }, { "distill_loss": 0.15867778658866882, "epoch": 7.66844563042028, "step": 22990 }, { "epoch": 7.66844563042028, "ref_ce_loss": 0.05582212656736374, "step": 22990 }, { "epoch": 7.66844563042028, "loss": 0.3151005804538727, "step": 22990 }, { "ce_loss": 0.06821805983781815, "epoch": 7.66844563042028, "step": 22990 }, { "distill_loss": 0.16759486496448517, "epoch": 7.66844563042028, "step": 22990 }, { "epoch": 7.66844563042028, "ref_ce_loss": 0.059118907898664474, "step": 22990 }, { "epoch": 7.66844563042028, "loss": 0.6699935793876648, "step": 22990 }, { "ce_loss": 0.020239880308508873, "epoch": 7.66844563042028, "step": 22990 }, { "distill_loss": 0.1337202489376068, "epoch": 7.66844563042028, "step": 22990 }, { "epoch": 7.66844563042028, "ref_ce_loss": 0.04875611513853073, "step": 22990 }, { "epoch": 7.66844563042028, "loss": 0.43389075994491577, "step": 22990 }, { "ce_loss": 0.013298127800226212, "epoch": 7.66844563042028, "step": 22990 }, { "distill_loss": 0.16998136043548584, "epoch": 7.66844563042028, "step": 22990 }, { "epoch": 7.66844563042028, "ref_ce_loss": 0.04834935814142227, "step": 22990 }, { "epoch": 7.671781187458306, "loss": 0.3221, "step": 23000 }, { "epoch": 7.671781187458306, "grad_norm": 5.315511226654053, "step": 23000 }, { "epoch": 7.671781187458306, "learning_rate": 1.3223404979953834e-06, "step": 23000 }, { "epoch": 7.671781187458306, "loss": 0.30739280581474304, "step": 23000 }, { "ce_loss": 0.01956716738641262, "epoch": 7.671781187458306, "step": 23000 }, { "distill_loss": 0.21019117534160614, "epoch": 7.671781187458306, "step": 23000 }, { "epoch": 7.671781187458306, "ref_ce_loss": 0.05563579127192497, "step": 23000 }, { "epoch": 7.671781187458306, "loss": 0.2983548939228058, "step": 23000 }, { "ce_loss": 0.025492293760180473, "epoch": 7.671781187458306, "step": 23000 }, { "distill_loss": 0.20712508261203766, "epoch": 7.671781187458306, "step": 23000 }, { "epoch": 7.671781187458306, "ref_ce_loss": 0.044836029410362244, "step": 23000 }, { "epoch": 7.671781187458306, "loss": 0.16253502666950226, "step": 23000 }, { "ce_loss": 0.015675950795412064, "epoch": 7.671781187458306, "step": 23000 }, { "distill_loss": 0.0947803407907486, "epoch": 7.671781187458306, "step": 23000 }, { "epoch": 7.671781187458306, "ref_ce_loss": 0.0519745759665966, "step": 23000 }, { "epoch": 7.671781187458306, "loss": 0.792258620262146, "step": 23000 }, { "ce_loss": 0.02057049050927162, "epoch": 7.671781187458306, "step": 23000 }, { "distill_loss": 0.26838886737823486, "epoch": 7.671781187458306, "step": 23000 }, { "epoch": 7.671781187458306, "ref_ce_loss": 0.04232442378997803, "step": 23000 }, { "epoch": 7.67511674449633, "loss": 0.3352, "step": 23010 }, { "epoch": 7.67511674449633, "grad_norm": 6.571993827819824, "step": 23010 }, { "epoch": 7.67511674449633, "learning_rate": 1.2956387905225018e-06, "step": 23010 }, { "epoch": 7.67511674449633, "loss": 0.2995764911174774, "step": 23010 }, { "ce_loss": 0.04184538498520851, "epoch": 7.67511674449633, "step": 23010 }, { "distill_loss": 0.13831311464309692, "epoch": 7.67511674449633, "step": 23010 }, { "epoch": 7.67511674449633, "ref_ce_loss": 0.05882956087589264, "step": 23010 }, { "epoch": 7.67511674449633, "loss": 0.5072600245475769, "step": 23010 }, { "ce_loss": 0.03169850632548332, "epoch": 7.67511674449633, "step": 23010 }, { "distill_loss": 0.13568931818008423, "epoch": 7.67511674449633, "step": 23010 }, { "epoch": 7.67511674449633, "ref_ce_loss": 0.032809965312480927, "step": 23010 }, { "epoch": 7.67511674449633, "loss": 0.4805125296115875, "step": 23010 }, { "ce_loss": 0.05861205235123634, "epoch": 7.67511674449633, "step": 23010 }, { "distill_loss": 0.17325712740421295, "epoch": 7.67511674449633, "step": 23010 }, { "epoch": 7.67511674449633, "ref_ce_loss": 0.09007645398378372, "step": 23010 }, { "epoch": 7.67511674449633, "loss": 0.21181714534759521, "step": 23010 }, { "ce_loss": 0.02114477939903736, "epoch": 7.67511674449633, "step": 23010 }, { "distill_loss": 0.1146874874830246, "epoch": 7.67511674449633, "step": 23010 }, { "epoch": 7.67511674449633, "ref_ce_loss": 0.04349588602781296, "step": 23010 }, { "epoch": 7.678452301534357, "loss": 0.3521, "step": 23020 }, { "epoch": 7.678452301534357, "grad_norm": 3.2868006229400635, "step": 23020 }, { "epoch": 7.678452301534357, "learning_rate": 1.2692082611453825e-06, "step": 23020 }, { "epoch": 7.678452301534357, "loss": 0.45636117458343506, "step": 23020 }, { "ce_loss": 0.047584645450115204, "epoch": 7.678452301534357, "step": 23020 }, { "distill_loss": 0.3129931092262268, "epoch": 7.678452301534357, "step": 23020 }, { "epoch": 7.678452301534357, "ref_ce_loss": 0.05031001567840576, "step": 23020 }, { "epoch": 7.678452301534357, "loss": 0.43598026037216187, "step": 23020 }, { "ce_loss": 0.019500087946653366, "epoch": 7.678452301534357, "step": 23020 }, { "distill_loss": 0.2599097192287445, "epoch": 7.678452301534357, "step": 23020 }, { "epoch": 7.678452301534357, "ref_ce_loss": 0.0634281113743782, "step": 23020 }, { "epoch": 7.678452301534357, "loss": 0.32026126980781555, "step": 23020 }, { "ce_loss": 0.025082221254706383, "epoch": 7.678452301534357, "step": 23020 }, { "distill_loss": 0.18352770805358887, "epoch": 7.678452301534357, "step": 23020 }, { "epoch": 7.678452301534357, "ref_ce_loss": 0.06303086876869202, "step": 23020 }, { "epoch": 7.678452301534357, "loss": 0.3685997426509857, "step": 23020 }, { "ce_loss": 0.03256651759147644, "epoch": 7.678452301534357, "step": 23020 }, { "distill_loss": 0.19047406315803528, "epoch": 7.678452301534357, "step": 23020 }, { "epoch": 7.678452301534357, "ref_ce_loss": 0.04560491442680359, "step": 23020 }, { "epoch": 7.681787858572381, "loss": 0.3419, "step": 23030 }, { "epoch": 7.681787858572381, "grad_norm": 3.9646894931793213, "step": 23030 }, { "epoch": 7.681787858572381, "learning_rate": 1.2430489580628699e-06, "step": 23030 }, { "epoch": 7.681787858572381, "loss": 0.3957512378692627, "step": 23030 }, { "ce_loss": 0.07101224362850189, "epoch": 7.681787858572381, "step": 23030 }, { "distill_loss": 0.1624712198972702, "epoch": 7.681787858572381, "step": 23030 }, { "epoch": 7.681787858572381, "ref_ce_loss": 0.07757756114006042, "step": 23030 }, { "epoch": 7.681787858572381, "loss": 0.24718983471393585, "step": 23030 }, { "ce_loss": 0.02744249440729618, "epoch": 7.681787858572381, "step": 23030 }, { "distill_loss": 0.13793756067752838, "epoch": 7.681787858572381, "step": 23030 }, { "epoch": 7.681787858572381, "ref_ce_loss": 0.04307285323739052, "step": 23030 }, { "epoch": 7.681787858572381, "loss": 0.5288066864013672, "step": 23030 }, { "ce_loss": 0.002461702097207308, "epoch": 7.681787858572381, "step": 23030 }, { "distill_loss": 0.14066945016384125, "epoch": 7.681787858572381, "step": 23030 }, { "epoch": 7.681787858572381, "ref_ce_loss": 0.037495292723178864, "step": 23030 }, { "epoch": 7.681787858572381, "loss": 0.6094015836715698, "step": 23030 }, { "ce_loss": 0.047975365072488785, "epoch": 7.681787858572381, "step": 23030 }, { "distill_loss": 0.20297566056251526, "epoch": 7.681787858572381, "step": 23030 }, { "epoch": 7.681787858572381, "ref_ce_loss": 0.07441578805446625, "step": 23030 }, { "epoch": 7.685123415610407, "loss": 0.3273, "step": 23040 }, { "epoch": 7.685123415610407, "grad_norm": 2.976410150527954, "step": 23040 }, { "epoch": 7.685123415610407, "learning_rate": 1.2171609289792384e-06, "step": 23040 }, { "epoch": 7.685123415610407, "loss": 0.3225436806678772, "step": 23040 }, { "ce_loss": 0.009163063950836658, "epoch": 7.685123415610407, "step": 23040 }, { "distill_loss": 0.16070663928985596, "epoch": 7.685123415610407, "step": 23040 }, { "epoch": 7.685123415610407, "ref_ce_loss": 0.053454380482435226, "step": 23040 }, { "epoch": 7.685123415610407, "loss": 0.5536080598831177, "step": 23040 }, { "ce_loss": 0.05467384308576584, "epoch": 7.685123415610407, "step": 23040 }, { "distill_loss": 0.2630139887332916, "epoch": 7.685123415610407, "step": 23040 }, { "epoch": 7.685123415610407, "ref_ce_loss": 0.07442136108875275, "step": 23040 }, { "epoch": 7.685123415610407, "loss": 0.24188895523548126, "step": 23040 }, { "ce_loss": 0.0048294877633452415, "epoch": 7.685123415610407, "step": 23040 }, { "distill_loss": 0.11708530783653259, "epoch": 7.685123415610407, "step": 23040 }, { "epoch": 7.685123415610407, "ref_ce_loss": 0.05692865327000618, "step": 23040 }, { "epoch": 7.685123415610407, "loss": 0.30179455876350403, "step": 23040 }, { "ce_loss": 0.05799272656440735, "epoch": 7.685123415610407, "step": 23040 }, { "distill_loss": 0.1916697472333908, "epoch": 7.685123415610407, "step": 23040 }, { "epoch": 7.685123415610407, "ref_ce_loss": 0.051968324929475784, "step": 23040 }, { "epoch": 7.688458972648432, "loss": 0.3115, "step": 23050 }, { "epoch": 7.688458972648432, "grad_norm": 4.0610222816467285, "step": 23050 }, { "epoch": 7.688458972648432, "learning_rate": 1.1915442211040404e-06, "step": 23050 }, { "epoch": 7.688458972648432, "loss": 0.3361358642578125, "step": 23050 }, { "ce_loss": 0.0818992331624031, "epoch": 7.688458972648432, "step": 23050 }, { "distill_loss": 0.20039232075214386, "epoch": 7.688458972648432, "step": 23050 }, { "epoch": 7.688458972648432, "ref_ce_loss": 0.05358118191361427, "step": 23050 }, { "epoch": 7.688458972648432, "loss": 0.28487151861190796, "step": 23050 }, { "ce_loss": 0.009340011514723301, "epoch": 7.688458972648432, "step": 23050 }, { "distill_loss": 0.15453654527664185, "epoch": 7.688458972648432, "step": 23050 }, { "epoch": 7.688458972648432, "ref_ce_loss": 0.051413096487522125, "step": 23050 }, { "epoch": 7.688458972648432, "loss": 0.34228986501693726, "step": 23050 }, { "ce_loss": 0.035694170743227005, "epoch": 7.688458972648432, "step": 23050 }, { "distill_loss": 0.15729156136512756, "epoch": 7.688458972648432, "step": 23050 }, { "epoch": 7.688458972648432, "ref_ce_loss": 0.04107082262635231, "step": 23050 }, { "epoch": 7.688458972648432, "loss": 0.3071814775466919, "step": 23050 }, { "ce_loss": 0.035663481801748276, "epoch": 7.688458972648432, "step": 23050 }, { "distill_loss": 0.2025829255580902, "epoch": 7.688458972648432, "step": 23050 }, { "epoch": 7.688458972648432, "ref_ce_loss": 0.03947122022509575, "step": 23050 }, { "epoch": 7.691794529686458, "loss": 0.3508, "step": 23060 }, { "epoch": 7.691794529686458, "grad_norm": 5.8116679191589355, "step": 23060 }, { "epoch": 7.691794529686458, "learning_rate": 1.166198881152025e-06, "step": 23060 }, { "epoch": 7.691794529686458, "loss": 0.2547825276851654, "step": 23060 }, { "ce_loss": 0.010203652083873749, "epoch": 7.691794529686458, "step": 23060 }, { "distill_loss": 0.14617383480072021, "epoch": 7.691794529686458, "step": 23060 }, { "epoch": 7.691794529686458, "ref_ce_loss": 0.056542348116636276, "step": 23060 }, { "epoch": 7.691794529686458, "loss": 0.34109601378440857, "step": 23060 }, { "ce_loss": 0.014679406769573689, "epoch": 7.691794529686458, "step": 23060 }, { "distill_loss": 0.24016183614730835, "epoch": 7.691794529686458, "step": 23060 }, { "epoch": 7.691794529686458, "ref_ce_loss": 0.04968947917222977, "step": 23060 }, { "epoch": 7.691794529686458, "loss": 0.20898187160491943, "step": 23060 }, { "ce_loss": 0.03771166130900383, "epoch": 7.691794529686458, "step": 23060 }, { "distill_loss": 0.1140361949801445, "epoch": 7.691794529686458, "step": 23060 }, { "epoch": 7.691794529686458, "ref_ce_loss": 0.03934255987405777, "step": 23060 }, { "epoch": 7.691794529686458, "loss": 0.2710397243499756, "step": 23060 }, { "ce_loss": 0.005295777693390846, "epoch": 7.691794529686458, "step": 23060 }, { "distill_loss": 0.1349426507949829, "epoch": 7.691794529686458, "step": 23060 }, { "epoch": 7.691794529686458, "ref_ce_loss": 0.04271606355905533, "step": 23060 }, { "epoch": 7.6951300867244825, "loss": 0.3044, "step": 23070 }, { "epoch": 7.6951300867244825, "grad_norm": 3.171736478805542, "step": 23070 }, { "epoch": 7.6951300867244825, "learning_rate": 1.14112495534312e-06, "step": 23070 }, { "epoch": 7.6951300867244825, "loss": 0.2496335506439209, "step": 23070 }, { "ce_loss": 0.04608842357993126, "epoch": 7.6951300867244825, "step": 23070 }, { "distill_loss": 0.13245657086372375, "epoch": 7.6951300867244825, "step": 23070 }, { "epoch": 7.6951300867244825, "ref_ce_loss": 0.049435753375291824, "step": 23070 }, { "epoch": 7.6951300867244825, "loss": 0.33521968126296997, "step": 23070 }, { "ce_loss": 0.03709598258137703, "epoch": 7.6951300867244825, "step": 23070 }, { "distill_loss": 0.22455614805221558, "epoch": 7.6951300867244825, "step": 23070 }, { "epoch": 7.6951300867244825, "ref_ce_loss": 0.032426513731479645, "step": 23070 }, { "epoch": 7.6951300867244825, "loss": 0.5236812829971313, "step": 23070 }, { "ce_loss": 0.04154251515865326, "epoch": 7.6951300867244825, "step": 23070 }, { "distill_loss": 0.20564574003219604, "epoch": 7.6951300867244825, "step": 23070 }, { "epoch": 7.6951300867244825, "ref_ce_loss": 0.0866141989827156, "step": 23070 }, { "epoch": 7.6951300867244825, "loss": 0.2064809799194336, "step": 23070 }, { "ce_loss": 0.011886204592883587, "epoch": 7.6951300867244825, "step": 23070 }, { "distill_loss": 0.11368348449468613, "epoch": 7.6951300867244825, "step": 23070 }, { "epoch": 7.6951300867244825, "ref_ce_loss": 0.05952262133359909, "step": 23070 }, { "epoch": 7.698465643762509, "loss": 0.3111, "step": 23080 }, { "epoch": 7.698465643762509, "grad_norm": 4.591440200805664, "step": 23080 }, { "epoch": 7.698465643762509, "learning_rate": 1.116322489402266e-06, "step": 23080 }, { "epoch": 7.698465643762509, "loss": 0.22700554132461548, "step": 23080 }, { "ce_loss": 0.029819414019584656, "epoch": 7.698465643762509, "step": 23080 }, { "distill_loss": 0.121553435921669, "epoch": 7.698465643762509, "step": 23080 }, { "epoch": 7.698465643762509, "ref_ce_loss": 0.056140560656785965, "step": 23080 }, { "epoch": 7.698465643762509, "loss": 0.36817777156829834, "step": 23080 }, { "ce_loss": 0.03735628351569176, "epoch": 7.698465643762509, "step": 23080 }, { "distill_loss": 0.19975467026233673, "epoch": 7.698465643762509, "step": 23080 }, { "epoch": 7.698465643762509, "ref_ce_loss": 0.06319694221019745, "step": 23080 }, { "epoch": 7.698465643762509, "loss": 0.21822097897529602, "step": 23080 }, { "ce_loss": 0.02268623188138008, "epoch": 7.698465643762509, "step": 23080 }, { "distill_loss": 0.13283170759677887, "epoch": 7.698465643762509, "step": 23080 }, { "epoch": 7.698465643762509, "ref_ce_loss": 0.06267096102237701, "step": 23080 }, { "epoch": 7.698465643762509, "loss": 0.26581940054893494, "step": 23080 }, { "ce_loss": 0.029165010899305344, "epoch": 7.698465643762509, "step": 23080 }, { "distill_loss": 0.155491441488266, "epoch": 7.698465643762509, "step": 23080 }, { "epoch": 7.698465643762509, "ref_ce_loss": 0.049425188452005386, "step": 23080 }, { "epoch": 7.701801200800533, "loss": 0.3056, "step": 23090 }, { "epoch": 7.701801200800533, "grad_norm": 2.542473554611206, "step": 23090 }, { "epoch": 7.701801200800533, "learning_rate": 1.091791528559366e-06, "step": 23090 }, { "epoch": 7.701801200800533, "loss": 0.19253060221672058, "step": 23090 }, { "ce_loss": 0.0050561269745230675, "epoch": 7.701801200800533, "step": 23090 }, { "distill_loss": 0.14840315282344818, "epoch": 7.701801200800533, "step": 23090 }, { "epoch": 7.701801200800533, "ref_ce_loss": 0.027028419077396393, "step": 23090 }, { "epoch": 7.701801200800533, "loss": 0.3827259838581085, "step": 23090 }, { "ce_loss": 0.02695002406835556, "epoch": 7.701801200800533, "step": 23090 }, { "distill_loss": 0.16917379200458527, "epoch": 7.701801200800533, "step": 23090 }, { "epoch": 7.701801200800533, "ref_ce_loss": 0.06104246899485588, "step": 23090 }, { "epoch": 7.701801200800533, "loss": 0.9057022333145142, "step": 23090 }, { "ce_loss": 0.030367007479071617, "epoch": 7.701801200800533, "step": 23090 }, { "distill_loss": 0.1319291591644287, "epoch": 7.701801200800533, "step": 23090 }, { "epoch": 7.701801200800533, "ref_ce_loss": 0.03348112851381302, "step": 23090 }, { "epoch": 7.701801200800533, "loss": 0.5137628316879272, "step": 23090 }, { "ce_loss": 0.060257963836193085, "epoch": 7.701801200800533, "step": 23090 }, { "distill_loss": 0.31021299958229065, "epoch": 7.701801200800533, "step": 23090 }, { "epoch": 7.701801200800533, "ref_ce_loss": 0.07672536373138428, "step": 23090 }, { "epoch": 7.7051367578385594, "loss": 0.3217, "step": 23100 }, { "epoch": 7.7051367578385594, "grad_norm": 2.8845362663269043, "step": 23100 }, { "epoch": 7.7051367578385594, "learning_rate": 1.0675321175492025e-06, "step": 23100 }, { "epoch": 7.7051367578385594, "loss": 0.20346535742282867, "step": 23100 }, { "ce_loss": 0.007688038982450962, "epoch": 7.7051367578385594, "step": 23100 }, { "distill_loss": 0.15419723093509674, "epoch": 7.7051367578385594, "step": 23100 }, { "epoch": 7.7051367578385594, "ref_ce_loss": 0.04151067137718201, "step": 23100 }, { "epoch": 7.7051367578385594, "loss": 0.16337968409061432, "step": 23100 }, { "ce_loss": 0.002642042702063918, "epoch": 7.7051367578385594, "step": 23100 }, { "distill_loss": 0.11293807625770569, "epoch": 7.7051367578385594, "step": 23100 }, { "epoch": 7.7051367578385594, "ref_ce_loss": 0.0326145775616169, "step": 23100 }, { "epoch": 7.7051367578385594, "loss": 0.4473174512386322, "step": 23100 }, { "ce_loss": 0.038927558809518814, "epoch": 7.7051367578385594, "step": 23100 }, { "distill_loss": 0.12315823137760162, "epoch": 7.7051367578385594, "step": 23100 }, { "epoch": 7.7051367578385594, "ref_ce_loss": 0.04285747930407524, "step": 23100 }, { "epoch": 7.7051367578385594, "loss": 0.3779004216194153, "step": 23100 }, { "ce_loss": 0.054333869367837906, "epoch": 7.7051367578385594, "step": 23100 }, { "distill_loss": 0.1537066102027893, "epoch": 7.7051367578385594, "step": 23100 }, { "epoch": 7.7051367578385594, "ref_ce_loss": 0.04151177778840065, "step": 23100 }, { "epoch": 7.708472314876584, "loss": 0.3035, "step": 23110 }, { "epoch": 7.708472314876584, "grad_norm": 3.3525681495666504, "step": 23110 }, { "epoch": 7.708472314876584, "learning_rate": 1.0435443006114208e-06, "step": 23110 }, { "epoch": 7.708472314876584, "loss": 0.44415926933288574, "step": 23110 }, { "ce_loss": 0.02013283036649227, "epoch": 7.708472314876584, "step": 23110 }, { "distill_loss": 0.21743977069854736, "epoch": 7.708472314876584, "step": 23110 }, { "epoch": 7.708472314876584, "ref_ce_loss": 0.07050774246454239, "step": 23110 }, { "epoch": 7.708472314876584, "loss": 0.32459118962287903, "step": 23110 }, { "ce_loss": 0.031134208664298058, "epoch": 7.708472314876584, "step": 23110 }, { "distill_loss": 0.13974878191947937, "epoch": 7.708472314876584, "step": 23110 }, { "epoch": 7.708472314876584, "ref_ce_loss": 0.05988594517111778, "step": 23110 }, { "epoch": 7.708472314876584, "loss": 0.3135908246040344, "step": 23110 }, { "ce_loss": 0.028349503874778748, "epoch": 7.708472314876584, "step": 23110 }, { "distill_loss": 0.20856799185276031, "epoch": 7.708472314876584, "step": 23110 }, { "epoch": 7.708472314876584, "ref_ce_loss": 0.05278262495994568, "step": 23110 }, { "epoch": 7.708472314876584, "loss": 0.18645304441452026, "step": 23110 }, { "ce_loss": 0.012953277677297592, "epoch": 7.708472314876584, "step": 23110 }, { "distill_loss": 0.1352105736732483, "epoch": 7.708472314876584, "step": 23110 }, { "epoch": 7.708472314876584, "ref_ce_loss": 0.03790950030088425, "step": 23110 }, { "epoch": 7.71180787191461, "loss": 0.3338, "step": 23120 }, { "epoch": 7.71180787191461, "grad_norm": 3.6223158836364746, "step": 23120 }, { "epoch": 7.71180787191461, "learning_rate": 1.019828121490296e-06, "step": 23120 }, { "epoch": 7.71180787191461, "loss": 0.22437624633312225, "step": 23120 }, { "ce_loss": 0.05144285783171654, "epoch": 7.71180787191461, "step": 23120 }, { "distill_loss": 0.13139377534389496, "epoch": 7.71180787191461, "step": 23120 }, { "epoch": 7.71180787191461, "ref_ce_loss": 0.04147119075059891, "step": 23120 }, { "epoch": 7.71180787191461, "loss": 0.4318474531173706, "step": 23120 }, { "ce_loss": 0.09833890199661255, "epoch": 7.71180787191461, "step": 23120 }, { "distill_loss": 0.28609535098075867, "epoch": 7.71180787191461, "step": 23120 }, { "epoch": 7.71180787191461, "ref_ce_loss": 0.04700387641787529, "step": 23120 }, { "epoch": 7.71180787191461, "loss": 0.159266397356987, "step": 23120 }, { "ce_loss": 0.006263875402510166, "epoch": 7.71180787191461, "step": 23120 }, { "distill_loss": 0.09080880135297775, "epoch": 7.71180787191461, "step": 23120 }, { "epoch": 7.71180787191461, "ref_ce_loss": 0.04199573025107384, "step": 23120 }, { "epoch": 7.71180787191461, "loss": 0.2531209886074066, "step": 23120 }, { "ce_loss": 0.0459630973637104, "epoch": 7.71180787191461, "step": 23120 }, { "distill_loss": 0.09766028076410294, "epoch": 7.71180787191461, "step": 23120 }, { "epoch": 7.71180787191461, "ref_ce_loss": 0.04684332385659218, "step": 23120 }, { "epoch": 7.715143428952635, "loss": 0.3347, "step": 23130 }, { "epoch": 7.715143428952635, "grad_norm": 3.626303195953369, "step": 23130 }, { "epoch": 7.715143428952635, "learning_rate": 9.963836234347988e-07, "step": 23130 }, { "epoch": 7.715143428952635, "loss": 0.31000179052352905, "step": 23130 }, { "ce_loss": 0.009761415421962738, "epoch": 7.715143428952635, "step": 23130 }, { "distill_loss": 0.11593847721815109, "epoch": 7.715143428952635, "step": 23130 }, { "epoch": 7.715143428952635, "ref_ce_loss": 0.060613133013248444, "step": 23130 }, { "epoch": 7.715143428952635, "loss": 0.23310451209545135, "step": 23130 }, { "ce_loss": 0.018389590084552765, "epoch": 7.715143428952635, "step": 23130 }, { "distill_loss": 0.17988187074661255, "epoch": 7.715143428952635, "step": 23130 }, { "epoch": 7.715143428952635, "ref_ce_loss": 0.02529945969581604, "step": 23130 }, { "epoch": 7.715143428952635, "loss": 0.18636879324913025, "step": 23130 }, { "ce_loss": 0.019744787365198135, "epoch": 7.715143428952635, "step": 23130 }, { "distill_loss": 0.10091696679592133, "epoch": 7.715143428952635, "step": 23130 }, { "epoch": 7.715143428952635, "ref_ce_loss": 0.04688052833080292, "step": 23130 }, { "epoch": 7.715143428952635, "loss": 0.44403672218322754, "step": 23130 }, { "ce_loss": 0.018276330083608627, "epoch": 7.715143428952635, "step": 23130 }, { "distill_loss": 0.12965573370456696, "epoch": 7.715143428952635, "step": 23130 }, { "epoch": 7.715143428952635, "ref_ce_loss": 0.02683100290596485, "step": 23130 }, { "epoch": 7.718478985990661, "loss": 0.3175, "step": 23140 }, { "epoch": 7.718478985990661, "grad_norm": 3.112769842147827, "step": 23140 }, { "epoch": 7.718478985990661, "learning_rate": 9.73210849198447e-07, "step": 23140 }, { "epoch": 7.718478985990661, "loss": 0.1915590912103653, "step": 23140 }, { "ce_loss": 0.02874593995511532, "epoch": 7.718478985990661, "step": 23140 }, { "distill_loss": 0.11782795190811157, "epoch": 7.718478985990661, "step": 23140 }, { "epoch": 7.718478985990661, "ref_ce_loss": 0.04489807412028313, "step": 23140 }, { "epoch": 7.718478985990661, "loss": 0.39852508902549744, "step": 23140 }, { "ce_loss": 0.045446548610925674, "epoch": 7.718478985990661, "step": 23140 }, { "distill_loss": 0.2548648416996002, "epoch": 7.718478985990661, "step": 23140 }, { "epoch": 7.718478985990661, "ref_ce_loss": 0.07269423454999924, "step": 23140 }, { "epoch": 7.718478985990661, "loss": 0.30995240807533264, "step": 23140 }, { "ce_loss": 0.06380195170640945, "epoch": 7.718478985990661, "step": 23140 }, { "distill_loss": 0.13276419043540955, "epoch": 7.718478985990661, "step": 23140 }, { "epoch": 7.718478985990661, "ref_ce_loss": 0.06999576836824417, "step": 23140 }, { "epoch": 7.718478985990661, "loss": 0.14055275917053223, "step": 23140 }, { "ce_loss": 0.022481508553028107, "epoch": 7.718478985990661, "step": 23140 }, { "distill_loss": 0.09223896265029907, "epoch": 7.718478985990661, "step": 23140 }, { "epoch": 7.718478985990661, "ref_ce_loss": 0.02576570026576519, "step": 23140 }, { "epoch": 7.721814543028685, "loss": 0.3163, "step": 23150 }, { "epoch": 7.721814543028685, "grad_norm": 4.643634796142578, "step": 23150 }, { "epoch": 7.721814543028685, "learning_rate": 9.503098410392207e-07, "step": 23150 }, { "epoch": 7.721814543028685, "loss": 0.31578657031059265, "step": 23150 }, { "ce_loss": 0.014320285059511662, "epoch": 7.721814543028685, "step": 23150 }, { "distill_loss": 0.18919619917869568, "epoch": 7.721814543028685, "step": 23150 }, { "epoch": 7.721814543028685, "ref_ce_loss": 0.0538095161318779, "step": 23150 }, { "epoch": 7.721814543028685, "loss": 0.3112615942955017, "step": 23150 }, { "ce_loss": 0.04894857853651047, "epoch": 7.721814543028685, "step": 23150 }, { "distill_loss": 0.1551634967327118, "epoch": 7.721814543028685, "step": 23150 }, { "epoch": 7.721814543028685, "ref_ce_loss": 0.028474921360611916, "step": 23150 }, { "epoch": 7.721814543028685, "loss": 0.31689026951789856, "step": 23150 }, { "ce_loss": 0.011591101996600628, "epoch": 7.721814543028685, "step": 23150 }, { "distill_loss": 0.1778896301984787, "epoch": 7.721814543028685, "step": 23150 }, { "epoch": 7.721814543028685, "ref_ce_loss": 0.0530037060379982, "step": 23150 }, { "epoch": 7.721814543028685, "loss": 0.21250712871551514, "step": 23150 }, { "ce_loss": 0.01693120412528515, "epoch": 7.721814543028685, "step": 23150 }, { "distill_loss": 0.1388123333454132, "epoch": 7.721814543028685, "step": 23150 }, { "epoch": 7.721814543028685, "ref_ce_loss": 0.05666331201791763, "step": 23150 }, { "epoch": 7.7251501000667115, "loss": 0.3276, "step": 23160 }, { "epoch": 7.7251501000667115, "grad_norm": 2.5653464794158936, "step": 23160 }, { "epoch": 7.7251501000667115, "learning_rate": 9.27680640719547e-07, "step": 23160 }, { "epoch": 7.7251501000667115, "loss": 0.20201954245567322, "step": 23160 }, { "ce_loss": 0.01078097615391016, "epoch": 7.7251501000667115, "step": 23160 }, { "distill_loss": 0.09477313607931137, "epoch": 7.7251501000667115, "step": 23160 }, { "epoch": 7.7251501000667115, "ref_ce_loss": 0.03961151838302612, "step": 23160 }, { "epoch": 7.7251501000667115, "loss": 0.3385741114616394, "step": 23160 }, { "ce_loss": 0.020411452278494835, "epoch": 7.7251501000667115, "step": 23160 }, { "distill_loss": 0.13669410347938538, "epoch": 7.7251501000667115, "step": 23160 }, { "epoch": 7.7251501000667115, "ref_ce_loss": 0.06803715229034424, "step": 23160 }, { "epoch": 7.7251501000667115, "loss": 0.4764685034751892, "step": 23160 }, { "ce_loss": 0.028775783255696297, "epoch": 7.7251501000667115, "step": 23160 }, { "distill_loss": 0.214456707239151, "epoch": 7.7251501000667115, "step": 23160 }, { "epoch": 7.7251501000667115, "ref_ce_loss": 0.07601587474346161, "step": 23160 }, { "epoch": 7.7251501000667115, "loss": 0.4041188955307007, "step": 23160 }, { "ce_loss": 0.09439730644226074, "epoch": 7.7251501000667115, "step": 23160 }, { "distill_loss": 0.1673288494348526, "epoch": 7.7251501000667115, "step": 23160 }, { "epoch": 7.7251501000667115, "ref_ce_loss": 0.06416967511177063, "step": 23160 }, { "epoch": 7.728485657104736, "loss": 0.3091, "step": 23170 }, { "epoch": 7.728485657104736, "grad_norm": 2.889651298522949, "step": 23170 }, { "epoch": 7.728485657104736, "learning_rate": 9.053232895061657e-07, "step": 23170 }, { "epoch": 7.728485657104736, "loss": 0.5144741535186768, "step": 23170 }, { "ce_loss": 0.020566776394844055, "epoch": 7.728485657104736, "step": 23170 }, { "distill_loss": 0.3035084009170532, "epoch": 7.728485657104736, "step": 23170 }, { "epoch": 7.728485657104736, "ref_ce_loss": 0.062379274517297745, "step": 23170 }, { "epoch": 7.728485657104736, "loss": 0.23927778005599976, "step": 23170 }, { "ce_loss": 0.012099779210984707, "epoch": 7.728485657104736, "step": 23170 }, { "distill_loss": 0.13171517848968506, "epoch": 7.728485657104736, "step": 23170 }, { "epoch": 7.728485657104736, "ref_ce_loss": 0.06338847428560257, "step": 23170 }, { "epoch": 7.728485657104736, "loss": 0.20670422911643982, "step": 23170 }, { "ce_loss": 0.01207758579403162, "epoch": 7.728485657104736, "step": 23170 }, { "distill_loss": 0.12016580253839493, "epoch": 7.728485657104736, "step": 23170 }, { "epoch": 7.728485657104736, "ref_ce_loss": 0.031029202044010162, "step": 23170 }, { "epoch": 7.728485657104736, "loss": 0.4285022020339966, "step": 23170 }, { "ce_loss": 0.06939056515693665, "epoch": 7.728485657104736, "step": 23170 }, { "distill_loss": 0.2485809475183487, "epoch": 7.728485657104736, "step": 23170 }, { "epoch": 7.728485657104736, "ref_ce_loss": 0.048880282789468765, "step": 23170 }, { "epoch": 7.731821214142762, "loss": 0.3049, "step": 23180 }, { "epoch": 7.731821214142762, "grad_norm": 3.228677749633789, "step": 23180 }, { "epoch": 7.731821214142762, "learning_rate": 8.832378281700303e-07, "step": 23180 }, { "epoch": 7.731821214142762, "loss": 0.17703618109226227, "step": 23180 }, { "ce_loss": 0.013402396813035011, "epoch": 7.731821214142762, "step": 23180 }, { "distill_loss": 0.10235219448804855, "epoch": 7.731821214142762, "step": 23180 }, { "epoch": 7.731821214142762, "ref_ce_loss": 0.06116965785622597, "step": 23180 }, { "epoch": 7.731821214142762, "loss": 0.31809985637664795, "step": 23180 }, { "ce_loss": 0.023722652345895767, "epoch": 7.731821214142762, "step": 23180 }, { "distill_loss": 0.11942629516124725, "epoch": 7.731821214142762, "step": 23180 }, { "epoch": 7.731821214142762, "ref_ce_loss": 0.049432169646024704, "step": 23180 }, { "epoch": 7.731821214142762, "loss": 0.23215220868587494, "step": 23180 }, { "ce_loss": 0.05213412642478943, "epoch": 7.731821214142762, "step": 23180 }, { "distill_loss": 0.1340891718864441, "epoch": 7.731821214142762, "step": 23180 }, { "epoch": 7.731821214142762, "ref_ce_loss": 0.03527505323290825, "step": 23180 }, { "epoch": 7.731821214142762, "loss": 0.2313213050365448, "step": 23180 }, { "ce_loss": 0.03938008099794388, "epoch": 7.731821214142762, "step": 23180 }, { "distill_loss": 0.15864311158657074, "epoch": 7.731821214142762, "step": 23180 }, { "epoch": 7.731821214142762, "ref_ce_loss": 0.0247786957770586, "step": 23180 }, { "epoch": 7.735156771180787, "loss": 0.3335, "step": 23190 }, { "epoch": 7.735156771180787, "grad_norm": 3.1714279651641846, "step": 23190 }, { "epoch": 7.735156771180787, "learning_rate": 8.614242969863572e-07, "step": 23190 }, { "epoch": 7.735156771180787, "loss": 0.23594240844249725, "step": 23190 }, { "ce_loss": 0.02036680467426777, "epoch": 7.735156771180787, "step": 23190 }, { "distill_loss": 0.15960662066936493, "epoch": 7.735156771180787, "step": 23190 }, { "epoch": 7.735156771180787, "ref_ce_loss": 0.05590438097715378, "step": 23190 }, { "epoch": 7.735156771180787, "loss": 0.3538329601287842, "step": 23190 }, { "ce_loss": 0.08600302040576935, "epoch": 7.735156771180787, "step": 23190 }, { "distill_loss": 0.19357657432556152, "epoch": 7.735156771180787, "step": 23190 }, { "epoch": 7.735156771180787, "ref_ce_loss": 0.06092427670955658, "step": 23190 }, { "epoch": 7.735156771180787, "loss": 0.4086431860923767, "step": 23190 }, { "ce_loss": 0.036485668271780014, "epoch": 7.735156771180787, "step": 23190 }, { "distill_loss": 0.13172754645347595, "epoch": 7.735156771180787, "step": 23190 }, { "epoch": 7.735156771180787, "ref_ce_loss": 0.05472996085882187, "step": 23190 }, { "epoch": 7.735156771180787, "loss": 0.29737913608551025, "step": 23190 }, { "ce_loss": 0.025702279061079025, "epoch": 7.735156771180787, "step": 23190 }, { "distill_loss": 0.1611969769001007, "epoch": 7.735156771180787, "step": 23190 }, { "epoch": 7.735156771180787, "ref_ce_loss": 0.0548030324280262, "step": 23190 }, { "epoch": 7.738492328218813, "loss": 0.3057, "step": 23200 }, { "epoch": 7.738492328218813, "grad_norm": 3.2966296672821045, "step": 23200 }, { "epoch": 7.738492328218813, "learning_rate": 8.398827357343929e-07, "step": 23200 }, { "epoch": 7.738492328218813, "loss": 0.3330060839653015, "step": 23200 }, { "ce_loss": 0.042179033160209656, "epoch": 7.738492328218813, "step": 23200 }, { "distill_loss": 0.13909977674484253, "epoch": 7.738492328218813, "step": 23200 }, { "epoch": 7.738492328218813, "ref_ce_loss": 0.07637790590524673, "step": 23200 }, { "epoch": 7.738492328218813, "loss": 0.3866680860519409, "step": 23200 }, { "ce_loss": 0.0406869538128376, "epoch": 7.738492328218813, "step": 23200 }, { "distill_loss": 0.14271864295005798, "epoch": 7.738492328218813, "step": 23200 }, { "epoch": 7.738492328218813, "ref_ce_loss": 0.045196447521448135, "step": 23200 }, { "epoch": 7.738492328218813, "loss": 0.40931081771850586, "step": 23200 }, { "ce_loss": 0.0653044730424881, "epoch": 7.738492328218813, "step": 23200 }, { "distill_loss": 0.10990387946367264, "epoch": 7.738492328218813, "step": 23200 }, { "epoch": 7.738492328218813, "ref_ce_loss": 0.06856023520231247, "step": 23200 }, { "epoch": 7.738492328218813, "loss": 0.25872522592544556, "step": 23200 }, { "ce_loss": 0.025372039526700974, "epoch": 7.738492328218813, "step": 23200 }, { "distill_loss": 0.1737333983182907, "epoch": 7.738492328218813, "step": 23200 }, { "epoch": 7.738492328218813, "ref_ce_loss": 0.0390053316950798, "step": 23200 }, { "epoch": 7.741827885256837, "loss": 0.3241, "step": 23210 }, { "epoch": 7.741827885256837, "grad_norm": 2.839111566543579, "step": 23210 }, { "epoch": 7.741827885256837, "learning_rate": 8.186131836974474e-07, "step": 23210 }, { "epoch": 7.741827885256837, "loss": 0.2935517132282257, "step": 23210 }, { "ce_loss": 0.06324876844882965, "epoch": 7.741827885256837, "step": 23210 }, { "distill_loss": 0.14696261286735535, "epoch": 7.741827885256837, "step": 23210 }, { "epoch": 7.741827885256837, "ref_ce_loss": 0.0566866435110569, "step": 23210 }, { "epoch": 7.741827885256837, "loss": 0.5052548050880432, "step": 23210 }, { "ce_loss": 0.012156311422586441, "epoch": 7.741827885256837, "step": 23210 }, { "distill_loss": 0.23311343789100647, "epoch": 7.741827885256837, "step": 23210 }, { "epoch": 7.741827885256837, "ref_ce_loss": 0.06416646391153336, "step": 23210 }, { "epoch": 7.741827885256837, "loss": 0.18498748540878296, "step": 23210 }, { "ce_loss": 0.0010739491553977132, "epoch": 7.741827885256837, "step": 23210 }, { "distill_loss": 0.09309439361095428, "epoch": 7.741827885256837, "step": 23210 }, { "epoch": 7.741827885256837, "ref_ce_loss": 0.029653212055563927, "step": 23210 }, { "epoch": 7.741827885256837, "loss": 0.24336495995521545, "step": 23210 }, { "ce_loss": 0.05524908006191254, "epoch": 7.741827885256837, "step": 23210 }, { "distill_loss": 0.13438133895397186, "epoch": 7.741827885256837, "step": 23210 }, { "epoch": 7.741827885256837, "ref_ce_loss": 0.039194490760564804, "step": 23210 }, { "epoch": 7.745163442294864, "loss": 0.295, "step": 23220 }, { "epoch": 7.745163442294864, "grad_norm": 2.449200391769409, "step": 23220 }, { "epoch": 7.745163442294864, "learning_rate": 7.976156796627942e-07, "step": 23220 }, { "epoch": 7.745163442294864, "loss": 0.18167901039123535, "step": 23220 }, { "ce_loss": 0.031659264117479324, "epoch": 7.745163442294864, "step": 23220 }, { "distill_loss": 0.12303122133016586, "epoch": 7.745163442294864, "step": 23220 }, { "epoch": 7.745163442294864, "ref_ce_loss": 0.02684379555284977, "step": 23220 }, { "epoch": 7.745163442294864, "loss": 0.19133666157722473, "step": 23220 }, { "ce_loss": 0.040329594165086746, "epoch": 7.745163442294864, "step": 23220 }, { "distill_loss": 0.1013164073228836, "epoch": 7.745163442294864, "step": 23220 }, { "epoch": 7.745163442294864, "ref_ce_loss": 0.04963481053709984, "step": 23220 }, { "epoch": 7.745163442294864, "loss": 0.37134993076324463, "step": 23220 }, { "ce_loss": 0.01046949066221714, "epoch": 7.745163442294864, "step": 23220 }, { "distill_loss": 0.2826530933380127, "epoch": 7.745163442294864, "step": 23220 }, { "epoch": 7.745163442294864, "ref_ce_loss": 0.05680123716592789, "step": 23220 }, { "epoch": 7.745163442294864, "loss": 0.35854285955429077, "step": 23220 }, { "ce_loss": 0.050170235335826874, "epoch": 7.745163442294864, "step": 23220 }, { "distill_loss": 0.12987568974494934, "epoch": 7.745163442294864, "step": 23220 }, { "epoch": 7.745163442294864, "ref_ce_loss": 0.02719688042998314, "step": 23220 }, { "epoch": 7.748498999332888, "loss": 0.3107, "step": 23230 }, { "epoch": 7.748498999332888, "grad_norm": 6.194815635681152, "step": 23230 }, { "epoch": 7.748498999332888, "learning_rate": 7.7689026192162e-07, "step": 23230 }, { "epoch": 7.748498999332888, "loss": 0.28531792759895325, "step": 23230 }, { "ce_loss": 0.018100492656230927, "epoch": 7.748498999332888, "step": 23230 }, { "distill_loss": 0.18027476966381073, "epoch": 7.748498999332888, "step": 23230 }, { "epoch": 7.748498999332888, "ref_ce_loss": 0.04737915098667145, "step": 23230 }, { "epoch": 7.748498999332888, "loss": 0.4881407618522644, "step": 23230 }, { "ce_loss": 0.0619981475174427, "epoch": 7.748498999332888, "step": 23230 }, { "distill_loss": 0.1511438935995102, "epoch": 7.748498999332888, "step": 23230 }, { "epoch": 7.748498999332888, "ref_ce_loss": 0.04644491523504257, "step": 23230 }, { "epoch": 7.748498999332888, "loss": 0.33561399579048157, "step": 23230 }, { "ce_loss": 0.011638162657618523, "epoch": 7.748498999332888, "step": 23230 }, { "distill_loss": 0.24499163031578064, "epoch": 7.748498999332888, "step": 23230 }, { "epoch": 7.748498999332888, "ref_ce_loss": 0.07888083159923553, "step": 23230 }, { "epoch": 7.748498999332888, "loss": 0.4013439416885376, "step": 23230 }, { "ce_loss": 0.07862678915262222, "epoch": 7.748498999332888, "step": 23230 }, { "distill_loss": 0.2322043627500534, "epoch": 7.748498999332888, "step": 23230 }, { "epoch": 7.748498999332888, "ref_ce_loss": 0.06721118837594986, "step": 23230 }, { "epoch": 7.751834556370914, "loss": 0.347, "step": 23240 }, { "epoch": 7.751834556370914, "grad_norm": 2.814399003982544, "step": 23240 }, { "epoch": 7.751834556370914, "learning_rate": 7.564369682688754e-07, "step": 23240 }, { "epoch": 7.751834556370914, "loss": 0.19102168083190918, "step": 23240 }, { "ce_loss": 0.0067435563541948795, "epoch": 7.751834556370914, "step": 23240 }, { "distill_loss": 0.12526308000087738, "epoch": 7.751834556370914, "step": 23240 }, { "epoch": 7.751834556370914, "ref_ce_loss": 0.058875057846307755, "step": 23240 }, { "epoch": 7.751834556370914, "loss": 0.2656038701534271, "step": 23240 }, { "ce_loss": 0.014099403284490108, "epoch": 7.751834556370914, "step": 23240 }, { "distill_loss": 0.14388953149318695, "epoch": 7.751834556370914, "step": 23240 }, { "epoch": 7.751834556370914, "ref_ce_loss": 0.04784628748893738, "step": 23240 }, { "epoch": 7.751834556370914, "loss": 0.718388557434082, "step": 23240 }, { "ce_loss": 0.01962624490261078, "epoch": 7.751834556370914, "step": 23240 }, { "distill_loss": 0.26039832830429077, "epoch": 7.751834556370914, "step": 23240 }, { "epoch": 7.751834556370914, "ref_ce_loss": 0.05587448552250862, "step": 23240 }, { "epoch": 7.751834556370914, "loss": 0.28665295243263245, "step": 23240 }, { "ce_loss": 0.031991392374038696, "epoch": 7.751834556370914, "step": 23240 }, { "distill_loss": 0.1633574366569519, "epoch": 7.751834556370914, "step": 23240 }, { "epoch": 7.751834556370914, "ref_ce_loss": 0.042982008308172226, "step": 23240 }, { "epoch": 7.755170113408939, "loss": 0.3545, "step": 23250 }, { "epoch": 7.755170113408939, "grad_norm": 3.5545544624328613, "step": 23250 }, { "epoch": 7.755170113408939, "learning_rate": 7.362558360033411e-07, "step": 23250 }, { "epoch": 7.755170113408939, "loss": 0.22462019324302673, "step": 23250 }, { "ce_loss": 0.030002467334270477, "epoch": 7.755170113408939, "step": 23250 }, { "distill_loss": 0.1342082917690277, "epoch": 7.755170113408939, "step": 23250 }, { "epoch": 7.755170113408939, "ref_ce_loss": 0.037484072148799896, "step": 23250 }, { "epoch": 7.755170113408939, "loss": 0.2946101427078247, "step": 23250 }, { "ce_loss": 0.016634244471788406, "epoch": 7.755170113408939, "step": 23250 }, { "distill_loss": 0.21034982800483704, "epoch": 7.755170113408939, "step": 23250 }, { "epoch": 7.755170113408939, "ref_ce_loss": 0.06742505729198456, "step": 23250 }, { "epoch": 7.755170113408939, "loss": 0.2736393213272095, "step": 23250 }, { "ce_loss": 0.014051617123186588, "epoch": 7.755170113408939, "step": 23250 }, { "distill_loss": 0.17900004982948303, "epoch": 7.755170113408939, "step": 23250 }, { "epoch": 7.755170113408939, "ref_ce_loss": 0.03179965540766716, "step": 23250 }, { "epoch": 7.755170113408939, "loss": 0.3302898108959198, "step": 23250 }, { "ce_loss": 0.02950301021337509, "epoch": 7.755170113408939, "step": 23250 }, { "distill_loss": 0.1390431523323059, "epoch": 7.755170113408939, "step": 23250 }, { "epoch": 7.755170113408939, "ref_ce_loss": 0.03176158294081688, "step": 23250 }, { "epoch": 7.758505670446965, "loss": 0.3264, "step": 23260 }, { "epoch": 7.758505670446965, "grad_norm": 3.3747599124908447, "step": 23260 }, { "epoch": 7.758505670446965, "learning_rate": 7.163469019274115e-07, "step": 23260 }, { "epoch": 7.758505670446965, "loss": 0.28049081563949585, "step": 23260 }, { "ce_loss": 0.050241705030202866, "epoch": 7.758505670446965, "step": 23260 }, { "distill_loss": 0.1570914089679718, "epoch": 7.758505670446965, "step": 23260 }, { "epoch": 7.758505670446965, "ref_ce_loss": 0.041322261095047, "step": 23260 }, { "epoch": 7.758505670446965, "loss": 0.46387675404548645, "step": 23260 }, { "ce_loss": 0.04370967298746109, "epoch": 7.758505670446965, "step": 23260 }, { "distill_loss": 0.2979971468448639, "epoch": 7.758505670446965, "step": 23260 }, { "epoch": 7.758505670446965, "ref_ce_loss": 0.06697692722082138, "step": 23260 }, { "epoch": 7.758505670446965, "loss": 0.32402563095092773, "step": 23260 }, { "ce_loss": 0.023801935836672783, "epoch": 7.758505670446965, "step": 23260 }, { "distill_loss": 0.15474510192871094, "epoch": 7.758505670446965, "step": 23260 }, { "epoch": 7.758505670446965, "ref_ce_loss": 0.05275455862283707, "step": 23260 }, { "epoch": 7.758505670446965, "loss": 0.40194663405418396, "step": 23260 }, { "ce_loss": 0.02307719551026821, "epoch": 7.758505670446965, "step": 23260 }, { "distill_loss": 0.09973050653934479, "epoch": 7.758505670446965, "step": 23260 }, { "epoch": 7.758505670446965, "ref_ce_loss": 0.052247051149606705, "step": 23260 }, { "epoch": 7.7618412274849895, "loss": 0.3557, "step": 23270 }, { "epoch": 7.7618412274849895, "grad_norm": 3.8513638973236084, "step": 23270 }, { "epoch": 7.7618412274849895, "learning_rate": 6.967102023471283e-07, "step": 23270 }, { "epoch": 7.7618412274849895, "loss": 0.12965351343154907, "step": 23270 }, { "ce_loss": 0.0026622305158525705, "epoch": 7.7618412274849895, "step": 23270 }, { "distill_loss": 0.08537213504314423, "epoch": 7.7618412274849895, "step": 23270 }, { "epoch": 7.7618412274849895, "ref_ce_loss": 0.0415009967982769, "step": 23270 }, { "epoch": 7.7618412274849895, "loss": 0.1693611890077591, "step": 23270 }, { "ce_loss": 0.008908931165933609, "epoch": 7.7618412274849895, "step": 23270 }, { "distill_loss": 0.10009366273880005, "epoch": 7.7618412274849895, "step": 23270 }, { "epoch": 7.7618412274849895, "ref_ce_loss": 0.045529551804065704, "step": 23270 }, { "epoch": 7.7618412274849895, "loss": 0.3749549686908722, "step": 23270 }, { "ce_loss": 0.08394438028335571, "epoch": 7.7618412274849895, "step": 23270 }, { "distill_loss": 0.11877831071615219, "epoch": 7.7618412274849895, "step": 23270 }, { "epoch": 7.7618412274849895, "ref_ce_loss": 0.08435434848070145, "step": 23270 }, { "epoch": 7.7618412274849895, "loss": 0.25630539655685425, "step": 23270 }, { "ce_loss": 0.032179322093725204, "epoch": 7.7618412274849895, "step": 23270 }, { "distill_loss": 0.14230948686599731, "epoch": 7.7618412274849895, "step": 23270 }, { "epoch": 7.7618412274849895, "ref_ce_loss": 0.05302827060222626, "step": 23270 }, { "epoch": 7.765176784523016, "loss": 0.3329, "step": 23280 }, { "epoch": 7.765176784523016, "grad_norm": 4.037125110626221, "step": 23280 }, { "epoch": 7.765176784523016, "learning_rate": 6.773457730720966e-07, "step": 23280 }, { "epoch": 7.765176784523016, "loss": 0.24391396343708038, "step": 23280 }, { "ce_loss": 0.007923566736280918, "epoch": 7.765176784523016, "step": 23280 }, { "distill_loss": 0.14070750772953033, "epoch": 7.765176784523016, "step": 23280 }, { "epoch": 7.765176784523016, "ref_ce_loss": 0.06303049623966217, "step": 23280 }, { "epoch": 7.765176784523016, "loss": 0.3306331932544708, "step": 23280 }, { "ce_loss": 0.10012779384851456, "epoch": 7.765176784523016, "step": 23280 }, { "distill_loss": 0.13132646679878235, "epoch": 7.765176784523016, "step": 23280 }, { "epoch": 7.765176784523016, "ref_ce_loss": 0.06317123770713806, "step": 23280 }, { "epoch": 7.765176784523016, "loss": 0.30709633231163025, "step": 23280 }, { "ce_loss": 0.01032828539609909, "epoch": 7.765176784523016, "step": 23280 }, { "distill_loss": 0.20121553540229797, "epoch": 7.765176784523016, "step": 23280 }, { "epoch": 7.765176784523016, "ref_ce_loss": 0.050020549446344376, "step": 23280 }, { "epoch": 7.765176784523016, "loss": 0.2119283825159073, "step": 23280 }, { "ce_loss": 0.010647183284163475, "epoch": 7.765176784523016, "step": 23280 }, { "distill_loss": 0.15966114401817322, "epoch": 7.765176784523016, "step": 23280 }, { "epoch": 7.765176784523016, "ref_ce_loss": 0.041549064218997955, "step": 23280 }, { "epoch": 7.76851234156104, "loss": 0.338, "step": 23290 }, { "epoch": 7.76851234156104, "grad_norm": 4.431809902191162, "step": 23290 }, { "epoch": 7.76851234156104, "learning_rate": 6.582536494154022e-07, "step": 23290 }, { "epoch": 7.76851234156104, "loss": 0.2826223373413086, "step": 23290 }, { "ce_loss": 0.018869668245315552, "epoch": 7.76851234156104, "step": 23290 }, { "distill_loss": 0.21956700086593628, "epoch": 7.76851234156104, "step": 23290 }, { "epoch": 7.76851234156104, "ref_ce_loss": 0.04408290982246399, "step": 23290 }, { "epoch": 7.76851234156104, "loss": 0.22245214879512787, "step": 23290 }, { "ce_loss": 0.02188754640519619, "epoch": 7.76851234156104, "step": 23290 }, { "distill_loss": 0.12243182957172394, "epoch": 7.76851234156104, "step": 23290 }, { "epoch": 7.76851234156104, "ref_ce_loss": 0.045550353825092316, "step": 23290 }, { "epoch": 7.76851234156104, "loss": 0.24546176195144653, "step": 23290 }, { "ce_loss": 0.04683222249150276, "epoch": 7.76851234156104, "step": 23290 }, { "distill_loss": 0.13283726572990417, "epoch": 7.76851234156104, "step": 23290 }, { "epoch": 7.76851234156104, "ref_ce_loss": 0.04206204414367676, "step": 23290 }, { "epoch": 7.76851234156104, "loss": 0.16294394433498383, "step": 23290 }, { "ce_loss": 0.008387603797018528, "epoch": 7.76851234156104, "step": 23290 }, { "distill_loss": 0.08378133177757263, "epoch": 7.76851234156104, "step": 23290 }, { "epoch": 7.76851234156104, "ref_ce_loss": 0.033749647438526154, "step": 23290 }, { "epoch": 7.771847898599066, "loss": 0.3135, "step": 23300 }, { "epoch": 7.771847898599066, "grad_norm": 3.9904065132141113, "step": 23300 }, { "epoch": 7.771847898599066, "learning_rate": 6.39433866193545e-07, "step": 23300 }, { "epoch": 7.771847898599066, "loss": 0.6241667866706848, "step": 23300 }, { "ce_loss": 0.024339929223060608, "epoch": 7.771847898599066, "step": 23300 }, { "distill_loss": 0.19311808049678802, "epoch": 7.771847898599066, "step": 23300 }, { "epoch": 7.771847898599066, "ref_ce_loss": 0.05117562785744667, "step": 23300 }, { "epoch": 7.771847898599066, "loss": 0.2056247889995575, "step": 23300 }, { "ce_loss": 0.014999239705502987, "epoch": 7.771847898599066, "step": 23300 }, { "distill_loss": 0.13496920466423035, "epoch": 7.771847898599066, "step": 23300 }, { "epoch": 7.771847898599066, "ref_ce_loss": 0.05555931106209755, "step": 23300 }, { "epoch": 7.771847898599066, "loss": 0.4284008741378784, "step": 23300 }, { "ce_loss": 0.03497346118092537, "epoch": 7.771847898599066, "step": 23300 }, { "distill_loss": 0.23885288834571838, "epoch": 7.771847898599066, "step": 23300 }, { "epoch": 7.771847898599066, "ref_ce_loss": 0.050635844469070435, "step": 23300 }, { "epoch": 7.771847898599066, "loss": 0.34450194239616394, "step": 23300 }, { "ce_loss": 0.056762758642435074, "epoch": 7.771847898599066, "step": 23300 }, { "distill_loss": 0.17033667862415314, "epoch": 7.771847898599066, "step": 23300 }, { "epoch": 7.771847898599066, "ref_ce_loss": 0.0646563395857811, "step": 23300 }, { "epoch": 7.775183455637091, "loss": 0.3266, "step": 23310 }, { "epoch": 7.775183455637091, "grad_norm": 3.700901746749878, "step": 23310 }, { "epoch": 7.775183455637091, "learning_rate": 6.208864577263717e-07, "step": 23310 }, { "epoch": 7.775183455637091, "loss": 0.31795617938041687, "step": 23310 }, { "ce_loss": 0.023845195770263672, "epoch": 7.775183455637091, "step": 23310 }, { "distill_loss": 0.1281346082687378, "epoch": 7.775183455637091, "step": 23310 }, { "epoch": 7.775183455637091, "ref_ce_loss": 0.04168390855193138, "step": 23310 }, { "epoch": 7.775183455637091, "loss": 0.2685645818710327, "step": 23310 }, { "ce_loss": 0.030331619083881378, "epoch": 7.775183455637091, "step": 23310 }, { "distill_loss": 0.1365409791469574, "epoch": 7.775183455637091, "step": 23310 }, { "epoch": 7.775183455637091, "ref_ce_loss": 0.0594908744096756, "step": 23310 }, { "epoch": 7.775183455637091, "loss": 0.24335750937461853, "step": 23310 }, { "ce_loss": 0.05650734901428223, "epoch": 7.775183455637091, "step": 23310 }, { "distill_loss": 0.1242005005478859, "epoch": 7.775183455637091, "step": 23310 }, { "epoch": 7.775183455637091, "ref_ce_loss": 0.04149289056658745, "step": 23310 }, { "epoch": 7.775183455637091, "loss": 0.3003118634223938, "step": 23310 }, { "ce_loss": 0.020960014313459396, "epoch": 7.775183455637091, "step": 23310 }, { "distill_loss": 0.21566730737686157, "epoch": 7.775183455637091, "step": 23310 }, { "epoch": 7.775183455637091, "ref_ce_loss": 0.047933224588632584, "step": 23310 }, { "epoch": 7.778519012675117, "loss": 0.3369, "step": 23320 }, { "epoch": 7.778519012675117, "grad_norm": 2.937605857849121, "step": 23320 }, { "epoch": 7.778519012675117, "learning_rate": 6.026114578370434e-07, "step": 23320 }, { "epoch": 7.778519012675117, "loss": 0.25859567523002625, "step": 23320 }, { "ce_loss": 0.028528448194265366, "epoch": 7.778519012675117, "step": 23320 }, { "distill_loss": 0.1434447169303894, "epoch": 7.778519012675117, "step": 23320 }, { "epoch": 7.778519012675117, "ref_ce_loss": 0.045929595828056335, "step": 23320 }, { "epoch": 7.778519012675117, "loss": 0.6326279044151306, "step": 23320 }, { "ce_loss": 0.017608165740966797, "epoch": 7.778519012675117, "step": 23320 }, { "distill_loss": 0.13123691082000732, "epoch": 7.778519012675117, "step": 23320 }, { "epoch": 7.778519012675117, "ref_ce_loss": 0.04305972158908844, "step": 23320 }, { "epoch": 7.778519012675117, "loss": 0.21977195143699646, "step": 23320 }, { "ce_loss": 0.031461816281080246, "epoch": 7.778519012675117, "step": 23320 }, { "distill_loss": 0.14010189473628998, "epoch": 7.778519012675117, "step": 23320 }, { "epoch": 7.778519012675117, "ref_ce_loss": 0.03188446909189224, "step": 23320 }, { "epoch": 7.778519012675117, "loss": 0.25341397523880005, "step": 23320 }, { "ce_loss": 0.02488807961344719, "epoch": 7.778519012675117, "step": 23320 }, { "distill_loss": 0.15948396921157837, "epoch": 7.778519012675117, "step": 23320 }, { "epoch": 7.778519012675117, "ref_ce_loss": 0.06859219819307327, "step": 23320 }, { "epoch": 7.781854569713142, "loss": 0.2975, "step": 23330 }, { "epoch": 7.781854569713142, "grad_norm": 2.587712049484253, "step": 23330 }, { "epoch": 7.781854569713142, "learning_rate": 5.846088998519683e-07, "step": 23330 }, { "epoch": 7.781854569713142, "loss": 0.3860955238342285, "step": 23330 }, { "ce_loss": 0.009484180249273777, "epoch": 7.781854569713142, "step": 23330 }, { "distill_loss": 0.13444848358631134, "epoch": 7.781854569713142, "step": 23330 }, { "epoch": 7.781854569713142, "ref_ce_loss": 0.03321141377091408, "step": 23330 }, { "epoch": 7.781854569713142, "loss": 0.4119266867637634, "step": 23330 }, { "ce_loss": 0.0233280248939991, "epoch": 7.781854569713142, "step": 23330 }, { "distill_loss": 0.2424059808254242, "epoch": 7.781854569713142, "step": 23330 }, { "epoch": 7.781854569713142, "ref_ce_loss": 0.06320960074663162, "step": 23330 }, { "epoch": 7.781854569713142, "loss": 0.2660433351993561, "step": 23330 }, { "ce_loss": 0.06393882632255554, "epoch": 7.781854569713142, "step": 23330 }, { "distill_loss": 0.14668019115924835, "epoch": 7.781854569713142, "step": 23330 }, { "epoch": 7.781854569713142, "ref_ce_loss": 0.055365853011608124, "step": 23330 }, { "epoch": 7.781854569713142, "loss": 0.27549660205841064, "step": 23330 }, { "ce_loss": 0.0275224968791008, "epoch": 7.781854569713142, "step": 23330 }, { "distill_loss": 0.18359704315662384, "epoch": 7.781854569713142, "step": 23330 }, { "epoch": 7.781854569713142, "ref_ce_loss": 0.02890479750931263, "step": 23330 }, { "epoch": 7.785190126751168, "loss": 0.2935, "step": 23340 }, { "epoch": 7.785190126751168, "grad_norm": 3.114147663116455, "step": 23340 }, { "epoch": 7.785190126751168, "learning_rate": 5.668788166006854e-07, "step": 23340 }, { "epoch": 7.785190126751168, "loss": 0.2866002917289734, "step": 23340 }, { "ce_loss": 0.008112505078315735, "epoch": 7.785190126751168, "step": 23340 }, { "distill_loss": 0.16037411987781525, "epoch": 7.785190126751168, "step": 23340 }, { "epoch": 7.785190126751168, "ref_ce_loss": 0.04638290032744408, "step": 23340 }, { "epoch": 7.785190126751168, "loss": 0.2657839059829712, "step": 23340 }, { "ce_loss": 0.0471295528113842, "epoch": 7.785190126751168, "step": 23340 }, { "distill_loss": 0.13989612460136414, "epoch": 7.785190126751168, "step": 23340 }, { "epoch": 7.785190126751168, "ref_ce_loss": 0.045277271419763565, "step": 23340 }, { "epoch": 7.785190126751168, "loss": 0.3349273204803467, "step": 23340 }, { "ce_loss": 0.037495408207178116, "epoch": 7.785190126751168, "step": 23340 }, { "distill_loss": 0.20219463109970093, "epoch": 7.785190126751168, "step": 23340 }, { "epoch": 7.785190126751168, "ref_ce_loss": 0.07002187520265579, "step": 23340 }, { "epoch": 7.785190126751168, "loss": 0.46111318469047546, "step": 23340 }, { "ce_loss": 0.05854769051074982, "epoch": 7.785190126751168, "step": 23340 }, { "distill_loss": 0.28044867515563965, "epoch": 7.785190126751168, "step": 23340 }, { "epoch": 7.785190126751168, "ref_ce_loss": 0.07100299000740051, "step": 23340 }, { "epoch": 7.788525683789192, "loss": 0.3275, "step": 23350 }, { "epoch": 7.788525683789192, "grad_norm": 3.6980140209198, "step": 23350 }, { "epoch": 7.788525683789192, "learning_rate": 5.494212404158982e-07, "step": 23350 }, { "epoch": 7.788525683789192, "loss": 0.25768497586250305, "step": 23350 }, { "ce_loss": 0.011918849311769009, "epoch": 7.788525683789192, "step": 23350 }, { "distill_loss": 0.1634317934513092, "epoch": 7.788525683789192, "step": 23350 }, { "epoch": 7.788525683789192, "ref_ce_loss": 0.043155863881111145, "step": 23350 }, { "epoch": 7.788525683789192, "loss": 0.216793954372406, "step": 23350 }, { "ce_loss": 0.02071814425289631, "epoch": 7.788525683789192, "step": 23350 }, { "distill_loss": 0.14459185302257538, "epoch": 7.788525683789192, "step": 23350 }, { "epoch": 7.788525683789192, "ref_ce_loss": 0.05123360455036163, "step": 23350 }, { "epoch": 7.788525683789192, "loss": 0.1484888792037964, "step": 23350 }, { "ce_loss": 0.010345513932406902, "epoch": 7.788525683789192, "step": 23350 }, { "distill_loss": 0.08661268651485443, "epoch": 7.788525683789192, "step": 23350 }, { "epoch": 7.788525683789192, "ref_ce_loss": 0.03675112500786781, "step": 23350 }, { "epoch": 7.788525683789192, "loss": 0.205137237906456, "step": 23350 }, { "ce_loss": 0.04296492040157318, "epoch": 7.788525683789192, "step": 23350 }, { "distill_loss": 0.11191177368164062, "epoch": 7.788525683789192, "step": 23350 }, { "epoch": 7.788525683789192, "ref_ce_loss": 0.05003888159990311, "step": 23350 }, { "epoch": 7.7918612408272185, "loss": 0.3069, "step": 23360 }, { "epoch": 7.7918612408272185, "grad_norm": 2.725006580352783, "step": 23360 }, { "epoch": 7.7918612408272185, "learning_rate": 5.322362031333238e-07, "step": 23360 }, { "epoch": 7.7918612408272185, "loss": 0.4058573842048645, "step": 23360 }, { "ce_loss": 0.03540191799402237, "epoch": 7.7918612408272185, "step": 23360 }, { "distill_loss": 0.11767835170030594, "epoch": 7.7918612408272185, "step": 23360 }, { "epoch": 7.7918612408272185, "ref_ce_loss": 0.029293276369571686, "step": 23360 }, { "epoch": 7.7918612408272185, "loss": 0.4584265351295471, "step": 23360 }, { "ce_loss": 0.0269757267087698, "epoch": 7.7918612408272185, "step": 23360 }, { "distill_loss": 0.2049024999141693, "epoch": 7.7918612408272185, "step": 23360 }, { "epoch": 7.7918612408272185, "ref_ce_loss": 0.048592083156108856, "step": 23360 }, { "epoch": 7.7918612408272185, "loss": 0.3664950728416443, "step": 23360 }, { "ce_loss": 0.0457327701151371, "epoch": 7.7918612408272185, "step": 23360 }, { "distill_loss": 0.2076897770166397, "epoch": 7.7918612408272185, "step": 23360 }, { "epoch": 7.7918612408272185, "ref_ce_loss": 0.0358634777367115, "step": 23360 }, { "epoch": 7.7918612408272185, "loss": 0.1988409459590912, "step": 23360 }, { "ce_loss": 0.013353724963963032, "epoch": 7.7918612408272185, "step": 23360 }, { "distill_loss": 0.14962157607078552, "epoch": 7.7918612408272185, "step": 23360 }, { "epoch": 7.7918612408272185, "ref_ce_loss": 0.035612523555755615, "step": 23360 }, { "epoch": 7.795196797865243, "loss": 0.3187, "step": 23370 }, { "epoch": 7.795196797865243, "grad_norm": 3.3891184329986572, "step": 23370 }, { "epoch": 7.795196797865243, "learning_rate": 5.153237360916773e-07, "step": 23370 }, { "epoch": 7.795196797865243, "loss": 0.5488206744194031, "step": 23370 }, { "ce_loss": 0.10254282504320145, "epoch": 7.795196797865243, "step": 23370 }, { "distill_loss": 0.378812313079834, "epoch": 7.795196797865243, "step": 23370 }, { "epoch": 7.795196797865243, "ref_ce_loss": 0.04811196029186249, "step": 23370 }, { "epoch": 7.795196797865243, "loss": 0.4255595803260803, "step": 23370 }, { "ce_loss": 0.03799142688512802, "epoch": 7.795196797865243, "step": 23370 }, { "distill_loss": 0.2826904058456421, "epoch": 7.795196797865243, "step": 23370 }, { "epoch": 7.795196797865243, "ref_ce_loss": 0.04478185623884201, "step": 23370 }, { "epoch": 7.795196797865243, "loss": 0.2226969301700592, "step": 23370 }, { "ce_loss": 0.021655146032571793, "epoch": 7.795196797865243, "step": 23370 }, { "distill_loss": 0.1437462568283081, "epoch": 7.795196797865243, "step": 23370 }, { "epoch": 7.795196797865243, "ref_ce_loss": 0.046798039227724075, "step": 23370 }, { "epoch": 7.795196797865243, "loss": 0.48034214973449707, "step": 23370 }, { "ce_loss": 0.046554792672395706, "epoch": 7.795196797865243, "step": 23370 }, { "distill_loss": 0.3787533640861511, "epoch": 7.795196797865243, "step": 23370 }, { "epoch": 7.795196797865243, "ref_ce_loss": 0.05498090758919716, "step": 23370 }, { "epoch": 7.798532354903269, "loss": 0.3151, "step": 23380 }, { "epoch": 7.798532354903269, "grad_norm": 4.284021854400635, "step": 23380 }, { "epoch": 7.798532354903269, "learning_rate": 4.986838701326545e-07, "step": 23380 }, { "epoch": 7.798532354903269, "loss": 0.3707292079925537, "step": 23380 }, { "ce_loss": 0.09322760999202728, "epoch": 7.798532354903269, "step": 23380 }, { "distill_loss": 0.21029648184776306, "epoch": 7.798532354903269, "step": 23380 }, { "epoch": 7.798532354903269, "ref_ce_loss": 0.05152899771928787, "step": 23380 }, { "epoch": 7.798532354903269, "loss": 0.3132014870643616, "step": 23380 }, { "ce_loss": 0.04309116676449776, "epoch": 7.798532354903269, "step": 23380 }, { "distill_loss": 0.14849551022052765, "epoch": 7.798532354903269, "step": 23380 }, { "epoch": 7.798532354903269, "ref_ce_loss": 0.05384783446788788, "step": 23380 }, { "epoch": 7.798532354903269, "loss": 0.2031479924917221, "step": 23380 }, { "ce_loss": 0.010098463855683804, "epoch": 7.798532354903269, "step": 23380 }, { "distill_loss": 0.1490442454814911, "epoch": 7.798532354903269, "step": 23380 }, { "epoch": 7.798532354903269, "ref_ce_loss": 0.029072962701320648, "step": 23380 }, { "epoch": 7.798532354903269, "loss": 0.23375585675239563, "step": 23380 }, { "ce_loss": 0.013286544941365719, "epoch": 7.798532354903269, "step": 23380 }, { "distill_loss": 0.10644297301769257, "epoch": 7.798532354903269, "step": 23380 }, { "epoch": 7.798532354903269, "ref_ce_loss": 0.02351474016904831, "step": 23380 }, { "epoch": 7.801867911941294, "loss": 0.3135, "step": 23390 }, { "epoch": 7.801867911941294, "grad_norm": 3.123403310775757, "step": 23390 }, { "epoch": 7.801867911941294, "learning_rate": 4.82316635600799e-07, "step": 23390 }, { "epoch": 7.801867911941294, "loss": 0.2821028530597687, "step": 23390 }, { "ce_loss": 0.019099919125437737, "epoch": 7.801867911941294, "step": 23390 }, { "distill_loss": 0.17505885660648346, "epoch": 7.801867911941294, "step": 23390 }, { "epoch": 7.801867911941294, "ref_ce_loss": 0.06228472664952278, "step": 23390 }, { "epoch": 7.801867911941294, "loss": 0.18802164494991302, "step": 23390 }, { "ce_loss": 0.009978541173040867, "epoch": 7.801867911941294, "step": 23390 }, { "distill_loss": 0.11921600997447968, "epoch": 7.801867911941294, "step": 23390 }, { "epoch": 7.801867911941294, "ref_ce_loss": 0.03513723239302635, "step": 23390 }, { "epoch": 7.801867911941294, "loss": 0.45474499464035034, "step": 23390 }, { "ce_loss": 0.07498273998498917, "epoch": 7.801867911941294, "step": 23390 }, { "distill_loss": 0.19355358183383942, "epoch": 7.801867911941294, "step": 23390 }, { "epoch": 7.801867911941294, "ref_ce_loss": 0.07677411288022995, "step": 23390 }, { "epoch": 7.801867911941294, "loss": 0.270125150680542, "step": 23390 }, { "ce_loss": 0.06720106303691864, "epoch": 7.801867911941294, "step": 23390 }, { "distill_loss": 0.13926897943019867, "epoch": 7.801867911941294, "step": 23390 }, { "epoch": 7.801867911941294, "ref_ce_loss": 0.038092099130153656, "step": 23390 }, { "epoch": 7.80520346897932, "loss": 0.3264, "step": 23400 }, { "epoch": 7.80520346897932, "grad_norm": 2.53342604637146, "step": 23400 }, { "epoch": 7.80520346897932, "learning_rate": 4.662220623434854e-07, "step": 23400 }, { "epoch": 7.80520346897932, "loss": 0.3736017942428589, "step": 23400 }, { "ce_loss": 0.02059594914317131, "epoch": 7.80520346897932, "step": 23400 }, { "distill_loss": 0.14803819358348846, "epoch": 7.80520346897932, "step": 23400 }, { "epoch": 7.80520346897932, "ref_ce_loss": 0.06236787885427475, "step": 23400 }, { "epoch": 7.80520346897932, "loss": 0.34300127625465393, "step": 23400 }, { "ce_loss": 0.07793295383453369, "epoch": 7.80520346897932, "step": 23400 }, { "distill_loss": 0.13648638129234314, "epoch": 7.80520346897932, "step": 23400 }, { "epoch": 7.80520346897932, "ref_ce_loss": 0.04996887966990471, "step": 23400 }, { "epoch": 7.80520346897932, "loss": 0.27772849798202515, "step": 23400 }, { "ce_loss": 0.0037580865900963545, "epoch": 7.80520346897932, "step": 23400 }, { "distill_loss": 0.2001986801624298, "epoch": 7.80520346897932, "step": 23400 }, { "epoch": 7.80520346897932, "ref_ce_loss": 0.043960537761449814, "step": 23400 }, { "epoch": 7.80520346897932, "loss": 0.2673993706703186, "step": 23400 }, { "ce_loss": 0.052981454879045486, "epoch": 7.80520346897932, "step": 23400 }, { "distill_loss": 0.12502837181091309, "epoch": 7.80520346897932, "step": 23400 }, { "epoch": 7.80520346897932, "ref_ce_loss": 0.060486406087875366, "step": 23400 }, { "epoch": 7.808539026017344, "loss": 0.3236, "step": 23410 }, { "epoch": 7.808539026017344, "grad_norm": 3.594543695449829, "step": 23410 }, { "epoch": 7.808539026017344, "learning_rate": 4.504001797108692e-07, "step": 23410 }, { "epoch": 7.808539026017344, "loss": 0.21197645366191864, "step": 23410 }, { "ce_loss": 0.033052023500204086, "epoch": 7.808539026017344, "step": 23410 }, { "distill_loss": 0.12372135370969772, "epoch": 7.808539026017344, "step": 23410 }, { "epoch": 7.808539026017344, "ref_ce_loss": 0.03794866055250168, "step": 23410 }, { "epoch": 7.808539026017344, "loss": 0.22799083590507507, "step": 23410 }, { "ce_loss": 0.012985233217477798, "epoch": 7.808539026017344, "step": 23410 }, { "distill_loss": 0.18273982405662537, "epoch": 7.808539026017344, "step": 23410 }, { "epoch": 7.808539026017344, "ref_ce_loss": 0.0319737084209919, "step": 23410 }, { "epoch": 7.808539026017344, "loss": 0.32432064414024353, "step": 23410 }, { "ce_loss": 0.03222603350877762, "epoch": 7.808539026017344, "step": 23410 }, { "distill_loss": 0.2243024706840515, "epoch": 7.808539026017344, "step": 23410 }, { "epoch": 7.808539026017344, "ref_ce_loss": 0.03382931277155876, "step": 23410 }, { "epoch": 7.808539026017344, "loss": 0.19178549945354462, "step": 23410 }, { "ce_loss": 0.029627012088894844, "epoch": 7.808539026017344, "step": 23410 }, { "distill_loss": 0.12371587753295898, "epoch": 7.808539026017344, "step": 23410 }, { "epoch": 7.808539026017344, "ref_ce_loss": 0.038338132202625275, "step": 23410 }, { "epoch": 7.811874583055371, "loss": 0.324, "step": 23420 }, { "epoch": 7.811874583055371, "grad_norm": 3.236091375350952, "step": 23420 }, { "epoch": 7.811874583055371, "learning_rate": 4.3485101655582057e-07, "step": 23420 }, { "epoch": 7.811874583055371, "loss": 0.3116261065006256, "step": 23420 }, { "ce_loss": 0.039746448397636414, "epoch": 7.811874583055371, "step": 23420 }, { "distill_loss": 0.12467524409294128, "epoch": 7.811874583055371, "step": 23420 }, { "epoch": 7.811874583055371, "ref_ce_loss": 0.054104890674352646, "step": 23420 }, { "epoch": 7.811874583055371, "loss": 0.346844881772995, "step": 23420 }, { "ce_loss": 0.05025966838002205, "epoch": 7.811874583055371, "step": 23420 }, { "distill_loss": 0.19769015908241272, "epoch": 7.811874583055371, "step": 23420 }, { "epoch": 7.811874583055371, "ref_ce_loss": 0.06749259680509567, "step": 23420 }, { "epoch": 7.811874583055371, "loss": 0.22081111371517181, "step": 23420 }, { "ce_loss": 0.04156069457530975, "epoch": 7.811874583055371, "step": 23420 }, { "distill_loss": 0.13512654602527618, "epoch": 7.811874583055371, "step": 23420 }, { "epoch": 7.811874583055371, "ref_ce_loss": 0.04406435787677765, "step": 23420 }, { "epoch": 7.811874583055371, "loss": 0.24682305753231049, "step": 23420 }, { "ce_loss": 0.019891489297151566, "epoch": 7.811874583055371, "step": 23420 }, { "distill_loss": 0.1656222641468048, "epoch": 7.811874583055371, "step": 23420 }, { "epoch": 7.811874583055371, "ref_ce_loss": 0.061124760657548904, "step": 23420 }, { "epoch": 7.815210140093396, "loss": 0.2939, "step": 23430 }, { "epoch": 7.815210140093396, "grad_norm": 4.592631816864014, "step": 23430 }, { "epoch": 7.815210140093396, "learning_rate": 4.1957460123389074e-07, "step": 23430 }, { "epoch": 7.815210140093396, "loss": 0.23414938151836395, "step": 23430 }, { "ce_loss": 0.02321458049118519, "epoch": 7.815210140093396, "step": 23430 }, { "distill_loss": 0.12797735631465912, "epoch": 7.815210140093396, "step": 23430 }, { "epoch": 7.815210140093396, "ref_ce_loss": 0.05279207602143288, "step": 23430 }, { "epoch": 7.815210140093396, "loss": 0.24856454133987427, "step": 23430 }, { "ce_loss": 0.012455428019165993, "epoch": 7.815210140093396, "step": 23430 }, { "distill_loss": 0.1560458242893219, "epoch": 7.815210140093396, "step": 23430 }, { "epoch": 7.815210140093396, "ref_ce_loss": 0.06088119000196457, "step": 23430 }, { "epoch": 7.815210140093396, "loss": 0.39245137572288513, "step": 23430 }, { "ce_loss": 0.0226961188018322, "epoch": 7.815210140093396, "step": 23430 }, { "distill_loss": 0.21423345804214478, "epoch": 7.815210140093396, "step": 23430 }, { "epoch": 7.815210140093396, "ref_ce_loss": 0.055108457803726196, "step": 23430 }, { "epoch": 7.815210140093396, "loss": 0.3869752883911133, "step": 23430 }, { "ce_loss": 0.027073608711361885, "epoch": 7.815210140093396, "step": 23430 }, { "distill_loss": 0.18560923635959625, "epoch": 7.815210140093396, "step": 23430 }, { "epoch": 7.815210140093396, "ref_ce_loss": 0.060218311846256256, "step": 23430 }, { "epoch": 7.818545697131421, "loss": 0.334, "step": 23440 }, { "epoch": 7.818545697131421, "grad_norm": 4.625476837158203, "step": 23440 }, { "epoch": 7.818545697131421, "learning_rate": 4.045709616032122e-07, "step": 23440 }, { "epoch": 7.818545697131421, "loss": 0.40051406621932983, "step": 23440 }, { "ce_loss": 0.024206679314374924, "epoch": 7.818545697131421, "step": 23440 }, { "distill_loss": 0.2373490333557129, "epoch": 7.818545697131421, "step": 23440 }, { "epoch": 7.818545697131421, "ref_ce_loss": 0.049871303141117096, "step": 23440 }, { "epoch": 7.818545697131421, "loss": 0.48238322138786316, "step": 23440 }, { "ce_loss": 0.03466752916574478, "epoch": 7.818545697131421, "step": 23440 }, { "distill_loss": 0.32631319761276245, "epoch": 7.818545697131421, "step": 23440 }, { "epoch": 7.818545697131421, "ref_ce_loss": 0.050187211483716965, "step": 23440 }, { "epoch": 7.818545697131421, "loss": 0.2827662527561188, "step": 23440 }, { "ce_loss": 0.01909097470343113, "epoch": 7.818545697131421, "step": 23440 }, { "distill_loss": 0.21451322734355927, "epoch": 7.818545697131421, "step": 23440 }, { "epoch": 7.818545697131421, "ref_ce_loss": 0.04905122518539429, "step": 23440 }, { "epoch": 7.818545697131421, "loss": 0.17934754490852356, "step": 23440 }, { "ce_loss": 0.0021979936864227057, "epoch": 7.818545697131421, "step": 23440 }, { "distill_loss": 0.10636477172374725, "epoch": 7.818545697131421, "step": 23440 }, { "epoch": 7.818545697131421, "ref_ce_loss": 0.01800953969359398, "step": 23440 }, { "epoch": 7.821881254169447, "loss": 0.3487, "step": 23450 }, { "epoch": 7.821881254169447, "grad_norm": 7.102529525756836, "step": 23450 }, { "epoch": 7.821881254169447, "learning_rate": 3.89840125024532e-07, "step": 23450 }, { "epoch": 7.821881254169447, "loss": 0.45607298612594604, "step": 23450 }, { "ce_loss": 0.06953687965869904, "epoch": 7.821881254169447, "step": 23450 }, { "distill_loss": 0.13492602109909058, "epoch": 7.821881254169447, "step": 23450 }, { "epoch": 7.821881254169447, "ref_ce_loss": 0.07326891273260117, "step": 23450 }, { "epoch": 7.821881254169447, "loss": 0.2469940036535263, "step": 23450 }, { "ce_loss": 0.042449042201042175, "epoch": 7.821881254169447, "step": 23450 }, { "distill_loss": 0.13725703954696655, "epoch": 7.821881254169447, "step": 23450 }, { "epoch": 7.821881254169447, "ref_ce_loss": 0.04501740261912346, "step": 23450 }, { "epoch": 7.821881254169447, "loss": 0.3980846107006073, "step": 23450 }, { "ce_loss": 0.0027305660769343376, "epoch": 7.821881254169447, "step": 23450 }, { "distill_loss": 0.3378257751464844, "epoch": 7.821881254169447, "step": 23450 }, { "epoch": 7.821881254169447, "ref_ce_loss": 0.04068752005696297, "step": 23450 }, { "epoch": 7.821881254169447, "loss": 0.26008281111717224, "step": 23450 }, { "ce_loss": 0.06028151512145996, "epoch": 7.821881254169447, "step": 23450 }, { "distill_loss": 0.1407707929611206, "epoch": 7.821881254169447, "step": 23450 }, { "epoch": 7.821881254169447, "ref_ce_loss": 0.04119338467717171, "step": 23450 }, { "epoch": 7.825216811207472, "loss": 0.3276, "step": 23460 }, { "epoch": 7.825216811207472, "grad_norm": 2.3489158153533936, "step": 23460 }, { "epoch": 7.825216811207472, "learning_rate": 3.753821183610617e-07, "step": 23460 }, { "epoch": 7.825216811207472, "loss": 0.5334453582763672, "step": 23460 }, { "ce_loss": 0.021312382072210312, "epoch": 7.825216811207472, "step": 23460 }, { "distill_loss": 0.19238534569740295, "epoch": 7.825216811207472, "step": 23460 }, { "epoch": 7.825216811207472, "ref_ce_loss": 0.05404605343937874, "step": 23460 }, { "epoch": 7.825216811207472, "loss": 0.33135032653808594, "step": 23460 }, { "ce_loss": 0.07059341669082642, "epoch": 7.825216811207472, "step": 23460 }, { "distill_loss": 0.15060116350650787, "epoch": 7.825216811207472, "step": 23460 }, { "epoch": 7.825216811207472, "ref_ce_loss": 0.06151656061410904, "step": 23460 }, { "epoch": 7.825216811207472, "loss": 0.24996304512023926, "step": 23460 }, { "ce_loss": 0.027928778901696205, "epoch": 7.825216811207472, "step": 23460 }, { "distill_loss": 0.16909432411193848, "epoch": 7.825216811207472, "step": 23460 }, { "epoch": 7.825216811207472, "ref_ce_loss": 0.039568815380334854, "step": 23460 }, { "epoch": 7.825216811207472, "loss": 0.21376681327819824, "step": 23460 }, { "ce_loss": 0.019738253206014633, "epoch": 7.825216811207472, "step": 23460 }, { "distill_loss": 0.14254921674728394, "epoch": 7.825216811207472, "step": 23460 }, { "epoch": 7.825216811207472, "ref_ce_loss": 0.037193119525909424, "step": 23460 }, { "epoch": 7.828552368245497, "loss": 0.3357, "step": 23470 }, { "epoch": 7.828552368245497, "grad_norm": 9.519171714782715, "step": 23470 }, { "epoch": 7.828552368245497, "learning_rate": 3.611969679785109e-07, "step": 23470 }, { "epoch": 7.828552368245497, "loss": 0.20791323482990265, "step": 23470 }, { "ce_loss": 0.023825544863939285, "epoch": 7.828552368245497, "step": 23470 }, { "distill_loss": 0.15400870144367218, "epoch": 7.828552368245497, "step": 23470 }, { "epoch": 7.828552368245497, "ref_ce_loss": 0.02994854561984539, "step": 23470 }, { "epoch": 7.828552368245497, "loss": 0.4624814987182617, "step": 23470 }, { "ce_loss": 0.06506424397230148, "epoch": 7.828552368245497, "step": 23470 }, { "distill_loss": 0.21073627471923828, "epoch": 7.828552368245497, "step": 23470 }, { "epoch": 7.828552368245497, "ref_ce_loss": 0.07934938371181488, "step": 23470 }, { "epoch": 7.828552368245497, "loss": 0.2593517005443573, "step": 23470 }, { "ce_loss": 0.002018554601818323, "epoch": 7.828552368245497, "step": 23470 }, { "distill_loss": 0.2089160680770874, "epoch": 7.828552368245497, "step": 23470 }, { "epoch": 7.828552368245497, "ref_ce_loss": 0.03180965408682823, "step": 23470 }, { "epoch": 7.828552368245497, "loss": 0.4130901098251343, "step": 23470 }, { "ce_loss": 0.053077515214681625, "epoch": 7.828552368245497, "step": 23470 }, { "distill_loss": 0.19720834493637085, "epoch": 7.828552368245497, "step": 23470 }, { "epoch": 7.828552368245497, "ref_ce_loss": 0.08093966543674469, "step": 23470 }, { "epoch": 7.831887925283523, "loss": 0.3327, "step": 23480 }, { "epoch": 7.831887925283523, "grad_norm": 4.104872703552246, "step": 23480 }, { "epoch": 7.831887925283523, "learning_rate": 3.4728469974500404e-07, "step": 23480 }, { "epoch": 7.831887925283523, "loss": 0.23577287793159485, "step": 23480 }, { "ce_loss": 0.017805000767111778, "epoch": 7.831887925283523, "step": 23480 }, { "distill_loss": 0.10591477900743484, "epoch": 7.831887925283523, "step": 23480 }, { "epoch": 7.831887925283523, "ref_ce_loss": 0.05476228520274162, "step": 23480 }, { "epoch": 7.831887925283523, "loss": 0.20567350089550018, "step": 23480 }, { "ce_loss": 0.011999239213764668, "epoch": 7.831887925283523, "step": 23480 }, { "distill_loss": 0.12367543578147888, "epoch": 7.831887925283523, "step": 23480 }, { "epoch": 7.831887925283523, "ref_ce_loss": 0.03651357442140579, "step": 23480 }, { "epoch": 7.831887925283523, "loss": 0.2672771215438843, "step": 23480 }, { "ce_loss": 0.03912220522761345, "epoch": 7.831887925283523, "step": 23480 }, { "distill_loss": 0.1666669398546219, "epoch": 7.831887925283523, "step": 23480 }, { "epoch": 7.831887925283523, "ref_ce_loss": 0.04421749338507652, "step": 23480 }, { "epoch": 7.831887925283523, "loss": 0.357584148645401, "step": 23480 }, { "ce_loss": 0.009105579927563667, "epoch": 7.831887925283523, "step": 23480 }, { "distill_loss": 0.16976270079612732, "epoch": 7.831887925283523, "step": 23480 }, { "epoch": 7.831887925283523, "ref_ce_loss": 0.050727520138025284, "step": 23480 }, { "epoch": 7.835223482321548, "loss": 0.2992, "step": 23490 }, { "epoch": 7.835223482321548, "grad_norm": 3.172041177749634, "step": 23490 }, { "epoch": 7.835223482321548, "learning_rate": 3.3364533903101343e-07, "step": 23490 }, { "epoch": 7.835223482321548, "loss": 0.3321012854576111, "step": 23490 }, { "ce_loss": 0.06076185032725334, "epoch": 7.835223482321548, "step": 23490 }, { "distill_loss": 0.16718757152557373, "epoch": 7.835223482321548, "step": 23490 }, { "epoch": 7.835223482321548, "ref_ce_loss": 0.06900273263454437, "step": 23490 }, { "epoch": 7.835223482321548, "loss": 0.16614539921283722, "step": 23490 }, { "ce_loss": 0.011339708231389523, "epoch": 7.835223482321548, "step": 23490 }, { "distill_loss": 0.10029933601617813, "epoch": 7.835223482321548, "step": 23490 }, { "epoch": 7.835223482321548, "ref_ce_loss": 0.021356748417019844, "step": 23490 }, { "epoch": 7.835223482321548, "loss": 0.27904677391052246, "step": 23490 }, { "ce_loss": 0.01795981265604496, "epoch": 7.835223482321548, "step": 23490 }, { "distill_loss": 0.14555040001869202, "epoch": 7.835223482321548, "step": 23490 }, { "epoch": 7.835223482321548, "ref_ce_loss": 0.0673777163028717, "step": 23490 }, { "epoch": 7.835223482321548, "loss": 0.4013983905315399, "step": 23490 }, { "ce_loss": 0.03549182042479515, "epoch": 7.835223482321548, "step": 23490 }, { "distill_loss": 0.24230507016181946, "epoch": 7.835223482321548, "step": 23490 }, { "epoch": 7.835223482321548, "ref_ce_loss": 0.06218017637729645, "step": 23490 }, { "epoch": 7.838559039359573, "loss": 0.3185, "step": 23500 }, { "epoch": 7.838559039359573, "grad_norm": 3.262314558029175, "step": 23500 }, { "epoch": 7.838559039359573, "learning_rate": 3.202789107093762e-07, "step": 23500 }, { "epoch": 7.838559039359573, "loss": 0.46285945177078247, "step": 23500 }, { "ce_loss": 0.016897911205887794, "epoch": 7.838559039359573, "step": 23500 }, { "distill_loss": 0.15950268507003784, "epoch": 7.838559039359573, "step": 23500 }, { "epoch": 7.838559039359573, "ref_ce_loss": 0.06392750889062881, "step": 23500 }, { "epoch": 7.838559039359573, "loss": 0.2956462502479553, "step": 23500 }, { "ce_loss": 0.007706071715801954, "epoch": 7.838559039359573, "step": 23500 }, { "distill_loss": 0.17924486100673676, "epoch": 7.838559039359573, "step": 23500 }, { "epoch": 7.838559039359573, "ref_ce_loss": 0.04117276147007942, "step": 23500 }, { "epoch": 7.838559039359573, "loss": 0.2654488980770111, "step": 23500 }, { "ce_loss": 0.04544892907142639, "epoch": 7.838559039359573, "step": 23500 }, { "distill_loss": 0.15256768465042114, "epoch": 7.838559039359573, "step": 23500 }, { "epoch": 7.838559039359573, "ref_ce_loss": 0.043563321232795715, "step": 23500 }, { "epoch": 7.838559039359573, "loss": 0.3258354365825653, "step": 23500 }, { "ce_loss": 0.028348958119750023, "epoch": 7.838559039359573, "step": 23500 }, { "distill_loss": 0.16680380702018738, "epoch": 7.838559039359573, "step": 23500 }, { "epoch": 7.838559039359573, "ref_ce_loss": 0.041485149413347244, "step": 23500 }, { "epoch": 7.841894596397599, "loss": 0.3601, "step": 23510 }, { "epoch": 7.841894596397599, "grad_norm": 2.688413619995117, "step": 23510 }, { "epoch": 7.841894596397599, "learning_rate": 3.0718543915517756e-07, "step": 23510 }, { "epoch": 7.841894596397599, "loss": 0.21694333851337433, "step": 23510 }, { "ce_loss": 0.005275980569422245, "epoch": 7.841894596397599, "step": 23510 }, { "distill_loss": 0.1437319666147232, "epoch": 7.841894596397599, "step": 23510 }, { "epoch": 7.841894596397599, "ref_ce_loss": 0.04064946621656418, "step": 23510 }, { "epoch": 7.841894596397599, "loss": 0.14688892662525177, "step": 23510 }, { "ce_loss": 0.005248190835118294, "epoch": 7.841894596397599, "step": 23510 }, { "distill_loss": 0.12220533192157745, "epoch": 7.841894596397599, "step": 23510 }, { "epoch": 7.841894596397599, "ref_ce_loss": 0.019273869693279266, "step": 23510 }, { "epoch": 7.841894596397599, "loss": 0.28784316778182983, "step": 23510 }, { "ce_loss": 0.0338323637843132, "epoch": 7.841894596397599, "step": 23510 }, { "distill_loss": 0.19960737228393555, "epoch": 7.841894596397599, "step": 23510 }, { "epoch": 7.841894596397599, "ref_ce_loss": 0.05415638908743858, "step": 23510 }, { "epoch": 7.841894596397599, "loss": 0.26387009024620056, "step": 23510 }, { "ce_loss": 0.035002514719963074, "epoch": 7.841894596397599, "step": 23510 }, { "distill_loss": 0.13795851171016693, "epoch": 7.841894596397599, "step": 23510 }, { "epoch": 7.841894596397599, "ref_ce_loss": 0.054587170481681824, "step": 23510 }, { "epoch": 7.845230153435624, "loss": 0.2939, "step": 23520 }, { "epoch": 7.845230153435624, "grad_norm": 3.071887731552124, "step": 23520 }, { "epoch": 7.845230153435624, "learning_rate": 2.943649482457344e-07, "step": 23520 }, { "epoch": 7.845230153435624, "loss": 0.19254055619239807, "step": 23520 }, { "ce_loss": 0.016048870980739594, "epoch": 7.845230153435624, "step": 23520 }, { "distill_loss": 0.11593937128782272, "epoch": 7.845230153435624, "step": 23520 }, { "epoch": 7.845230153435624, "ref_ce_loss": 0.045436661690473557, "step": 23520 }, { "epoch": 7.845230153435624, "loss": 0.2930757999420166, "step": 23520 }, { "ce_loss": 0.030346477404236794, "epoch": 7.845230153435624, "step": 23520 }, { "distill_loss": 0.15989623963832855, "epoch": 7.845230153435624, "step": 23520 }, { "epoch": 7.845230153435624, "ref_ce_loss": 0.05691306293010712, "step": 23520 }, { "epoch": 7.845230153435624, "loss": 0.39531221985816956, "step": 23520 }, { "ce_loss": 0.015122991986572742, "epoch": 7.845230153435624, "step": 23520 }, { "distill_loss": 0.28203243017196655, "epoch": 7.845230153435624, "step": 23520 }, { "epoch": 7.845230153435624, "ref_ce_loss": 0.06494788825511932, "step": 23520 }, { "epoch": 7.845230153435624, "loss": 0.33640390634536743, "step": 23520 }, { "ce_loss": 0.08751996606588364, "epoch": 7.845230153435624, "step": 23520 }, { "distill_loss": 0.15109512209892273, "epoch": 7.845230153435624, "step": 23520 }, { "epoch": 7.845230153435624, "ref_ce_loss": 0.04479917511343956, "step": 23520 }, { "epoch": 7.8485657104736495, "loss": 0.2966, "step": 23530 }, { "epoch": 7.8485657104736495, "grad_norm": 2.259341239929199, "step": 23530 }, { "epoch": 7.8485657104736495, "learning_rate": 2.81817461360595e-07, "step": 23530 }, { "epoch": 7.8485657104736495, "loss": 0.32070645689964294, "step": 23530 }, { "ce_loss": 0.025516116991639137, "epoch": 7.8485657104736495, "step": 23530 }, { "distill_loss": 0.22206610441207886, "epoch": 7.8485657104736495, "step": 23530 }, { "epoch": 7.8485657104736495, "ref_ce_loss": 0.04203583672642708, "step": 23530 }, { "epoch": 7.8485657104736495, "loss": 0.30409732460975647, "step": 23530 }, { "ce_loss": 0.008006962016224861, "epoch": 7.8485657104736495, "step": 23530 }, { "distill_loss": 0.2180558443069458, "epoch": 7.8485657104736495, "step": 23530 }, { "epoch": 7.8485657104736495, "ref_ce_loss": 0.07791711390018463, "step": 23530 }, { "epoch": 7.8485657104736495, "loss": 0.546816885471344, "step": 23530 }, { "ce_loss": 0.010635165497660637, "epoch": 7.8485657104736495, "step": 23530 }, { "distill_loss": 0.3440686762332916, "epoch": 7.8485657104736495, "step": 23530 }, { "epoch": 7.8485657104736495, "ref_ce_loss": 0.06196082383394241, "step": 23530 }, { "epoch": 7.8485657104736495, "loss": 0.35767361521720886, "step": 23530 }, { "ce_loss": 0.06103089079260826, "epoch": 7.8485657104736495, "step": 23530 }, { "distill_loss": 0.127061128616333, "epoch": 7.8485657104736495, "step": 23530 }, { "epoch": 7.8485657104736495, "ref_ce_loss": 0.05860128253698349, "step": 23530 }, { "epoch": 7.851901267511675, "loss": 0.3247, "step": 23540 }, { "epoch": 7.851901267511675, "grad_norm": 3.118084669113159, "step": 23540 }, { "epoch": 7.851901267511675, "learning_rate": 2.695430013813726e-07, "step": 23540 }, { "epoch": 7.851901267511675, "loss": 0.36181530356407166, "step": 23540 }, { "ce_loss": 0.06342129409313202, "epoch": 7.851901267511675, "step": 23540 }, { "distill_loss": 0.20927932858467102, "epoch": 7.851901267511675, "step": 23540 }, { "epoch": 7.851901267511675, "ref_ce_loss": 0.05260811373591423, "step": 23540 }, { "epoch": 7.851901267511675, "loss": 0.4537930488586426, "step": 23540 }, { "ce_loss": 0.06014955788850784, "epoch": 7.851901267511675, "step": 23540 }, { "distill_loss": 0.20907607674598694, "epoch": 7.851901267511675, "step": 23540 }, { "epoch": 7.851901267511675, "ref_ce_loss": 0.06965752691030502, "step": 23540 }, { "epoch": 7.851901267511675, "loss": 0.3706998825073242, "step": 23540 }, { "ce_loss": 0.04698286950588226, "epoch": 7.851901267511675, "step": 23540 }, { "distill_loss": 0.14785529673099518, "epoch": 7.851901267511675, "step": 23540 }, { "epoch": 7.851901267511675, "ref_ce_loss": 0.06559489667415619, "step": 23540 }, { "epoch": 7.851901267511675, "loss": 0.2899203896522522, "step": 23540 }, { "ce_loss": 0.03699249029159546, "epoch": 7.851901267511675, "step": 23540 }, { "distill_loss": 0.11820313334465027, "epoch": 7.851901267511675, "step": 23540 }, { "epoch": 7.851901267511675, "ref_ce_loss": 0.0372081995010376, "step": 23540 }, { "epoch": 7.8552368245497, "loss": 0.3375, "step": 23550 }, { "epoch": 7.8552368245497, "grad_norm": 5.25492525100708, "step": 23550 }, { "epoch": 7.8552368245497, "learning_rate": 2.5754159069187876e-07, "step": 23550 }, { "epoch": 7.8552368245497, "loss": 0.25230440497398376, "step": 23550 }, { "ce_loss": 0.04072759672999382, "epoch": 7.8552368245497, "step": 23550 }, { "distill_loss": 0.13626137375831604, "epoch": 7.8552368245497, "step": 23550 }, { "epoch": 7.8552368245497, "ref_ce_loss": 0.05338587239384651, "step": 23550 }, { "epoch": 7.8552368245497, "loss": 0.3071846663951874, "step": 23550 }, { "ce_loss": 0.025177132338285446, "epoch": 7.8552368245497, "step": 23550 }, { "distill_loss": 0.18812674283981323, "epoch": 7.8552368245497, "step": 23550 }, { "epoch": 7.8552368245497, "ref_ce_loss": 0.06389034539461136, "step": 23550 }, { "epoch": 7.8552368245497, "loss": 0.27324751019477844, "step": 23550 }, { "ce_loss": 0.0538780502974987, "epoch": 7.8552368245497, "step": 23550 }, { "distill_loss": 0.1565019190311432, "epoch": 7.8552368245497, "step": 23550 }, { "epoch": 7.8552368245497, "ref_ce_loss": 0.04335758090019226, "step": 23550 }, { "epoch": 7.8552368245497, "loss": 0.2749263048171997, "step": 23550 }, { "ce_loss": 0.028948500752449036, "epoch": 7.8552368245497, "step": 23550 }, { "distill_loss": 0.17235276103019714, "epoch": 7.8552368245497, "step": 23550 }, { "epoch": 7.8552368245497, "ref_ce_loss": 0.05884414166212082, "step": 23550 }, { "epoch": 7.8585723815877255, "loss": 0.3391, "step": 23560 }, { "epoch": 7.8585723815877255, "grad_norm": 2.6983072757720947, "step": 23560 }, { "epoch": 7.8585723815877255, "learning_rate": 2.458132511779565e-07, "step": 23560 }, { "epoch": 7.8585723815877255, "loss": 0.24318772554397583, "step": 23560 }, { "ce_loss": 0.029617050662636757, "epoch": 7.8585723815877255, "step": 23560 }, { "distill_loss": 0.13530349731445312, "epoch": 7.8585723815877255, "step": 23560 }, { "epoch": 7.8585723815877255, "ref_ce_loss": 0.043204981833696365, "step": 23560 }, { "epoch": 7.8585723815877255, "loss": 0.25192779302597046, "step": 23560 }, { "ce_loss": 0.008975867182016373, "epoch": 7.8585723815877255, "step": 23560 }, { "distill_loss": 0.1775493025779724, "epoch": 7.8585723815877255, "step": 23560 }, { "epoch": 7.8585723815877255, "ref_ce_loss": 0.03575044870376587, "step": 23560 }, { "epoch": 7.8585723815877255, "loss": 0.22142037749290466, "step": 23560 }, { "ce_loss": 0.02612127549946308, "epoch": 7.8585723815877255, "step": 23560 }, { "distill_loss": 0.10729941725730896, "epoch": 7.8585723815877255, "step": 23560 }, { "epoch": 7.8585723815877255, "ref_ce_loss": 0.06410533934831619, "step": 23560 }, { "epoch": 7.8585723815877255, "loss": 0.2029029130935669, "step": 23560 }, { "ce_loss": 0.01904483139514923, "epoch": 7.8585723815877255, "step": 23560 }, { "distill_loss": 0.13385045528411865, "epoch": 7.8585723815877255, "step": 23560 }, { "epoch": 7.8585723815877255, "ref_ce_loss": 0.04983455315232277, "step": 23560 }, { "epoch": 7.861907938625751, "loss": 0.3088, "step": 23570 }, { "epoch": 7.861907938625751, "grad_norm": 3.6548972129821777, "step": 23570 }, { "epoch": 7.861907938625751, "learning_rate": 2.3435800422744733e-07, "step": 23570 }, { "epoch": 7.861907938625751, "loss": 0.35257551074028015, "step": 23570 }, { "ce_loss": 0.030391503125429153, "epoch": 7.861907938625751, "step": 23570 }, { "distill_loss": 0.1383948028087616, "epoch": 7.861907938625751, "step": 23570 }, { "epoch": 7.861907938625751, "ref_ce_loss": 0.06357093900442123, "step": 23570 }, { "epoch": 7.861907938625751, "loss": 0.3334633409976959, "step": 23570 }, { "ce_loss": 0.05558884143829346, "epoch": 7.861907938625751, "step": 23570 }, { "distill_loss": 0.20167039334774017, "epoch": 7.861907938625751, "step": 23570 }, { "epoch": 7.861907938625751, "ref_ce_loss": 0.061854951083660126, "step": 23570 }, { "epoch": 7.861907938625751, "loss": 0.30800241231918335, "step": 23570 }, { "ce_loss": 0.0217773225158453, "epoch": 7.861907938625751, "step": 23570 }, { "distill_loss": 0.16537217795848846, "epoch": 7.861907938625751, "step": 23570 }, { "epoch": 7.861907938625751, "ref_ce_loss": 0.05210186913609505, "step": 23570 }, { "epoch": 7.861907938625751, "loss": 0.2675626277923584, "step": 23570 }, { "ce_loss": 0.034394048154354095, "epoch": 7.861907938625751, "step": 23570 }, { "distill_loss": 0.12180566042661667, "epoch": 7.861907938625751, "step": 23570 }, { "epoch": 7.861907938625751, "ref_ce_loss": 0.06949251890182495, "step": 23570 }, { "epoch": 7.865243495663776, "loss": 0.3309, "step": 23580 }, { "epoch": 7.865243495663776, "grad_norm": 3.2683770656585693, "step": 23580 }, { "epoch": 7.865243495663776, "learning_rate": 2.2317587073020782e-07, "step": 23580 }, { "epoch": 7.865243495663776, "loss": 0.24115531146526337, "step": 23580 }, { "ce_loss": 0.04724248871207237, "epoch": 7.865243495663776, "step": 23580 }, { "distill_loss": 0.14451994001865387, "epoch": 7.865243495663776, "step": 23580 }, { "epoch": 7.865243495663776, "ref_ce_loss": 0.04924272000789642, "step": 23580 }, { "epoch": 7.865243495663776, "loss": 0.25895681977272034, "step": 23580 }, { "ce_loss": 0.0406833216547966, "epoch": 7.865243495663776, "step": 23580 }, { "distill_loss": 0.12115300446748734, "epoch": 7.865243495663776, "step": 23580 }, { "epoch": 7.865243495663776, "ref_ce_loss": 0.04782585799694061, "step": 23580 }, { "epoch": 7.865243495663776, "loss": 0.4971678853034973, "step": 23580 }, { "ce_loss": 0.02928864024579525, "epoch": 7.865243495663776, "step": 23580 }, { "distill_loss": 0.40851619839668274, "epoch": 7.865243495663776, "step": 23580 }, { "epoch": 7.865243495663776, "ref_ce_loss": 0.05929753929376602, "step": 23580 }, { "epoch": 7.865243495663776, "loss": 0.2548585534095764, "step": 23580 }, { "ce_loss": 0.02185073494911194, "epoch": 7.865243495663776, "step": 23580 }, { "distill_loss": 0.19197684526443481, "epoch": 7.865243495663776, "step": 23580 }, { "epoch": 7.865243495663776, "ref_ce_loss": 0.040667105466127396, "step": 23580 }, { "epoch": 7.868579052701802, "loss": 0.3371, "step": 23590 }, { "epoch": 7.868579052701802, "grad_norm": 2.3236372470855713, "step": 23590 }, { "epoch": 7.868579052701802, "learning_rate": 2.122668710780595e-07, "step": 23590 }, { "epoch": 7.868579052701802, "loss": 0.32696297764778137, "step": 23590 }, { "ce_loss": 0.04291388392448425, "epoch": 7.868579052701802, "step": 23590 }, { "distill_loss": 0.1363409459590912, "epoch": 7.868579052701802, "step": 23590 }, { "epoch": 7.868579052701802, "ref_ce_loss": 0.05377655103802681, "step": 23590 }, { "epoch": 7.868579052701802, "loss": 0.294104665517807, "step": 23590 }, { "ce_loss": 0.023383162915706635, "epoch": 7.868579052701802, "step": 23590 }, { "distill_loss": 0.13425928354263306, "epoch": 7.868579052701802, "step": 23590 }, { "epoch": 7.868579052701802, "ref_ce_loss": 0.045569244772195816, "step": 23590 }, { "epoch": 7.868579052701802, "loss": 0.16625121235847473, "step": 23590 }, { "ce_loss": 0.019522182643413544, "epoch": 7.868579052701802, "step": 23590 }, { "distill_loss": 0.11160635203123093, "epoch": 7.868579052701802, "step": 23590 }, { "epoch": 7.868579052701802, "ref_ce_loss": 0.0349382720887661, "step": 23590 }, { "epoch": 7.868579052701802, "loss": 0.2597416937351227, "step": 23590 }, { "ce_loss": 0.04834597557783127, "epoch": 7.868579052701802, "step": 23590 }, { "distill_loss": 0.14939779043197632, "epoch": 7.868579052701802, "step": 23590 }, { "epoch": 7.868579052701802, "ref_ce_loss": 0.061613187193870544, "step": 23590 }, { "epoch": 7.871914609739827, "loss": 0.3061, "step": 23600 }, { "epoch": 7.871914609739827, "grad_norm": 1.84232759475708, "step": 23600 }, { "epoch": 7.871914609739827, "learning_rate": 2.016310251646891e-07, "step": 23600 }, { "epoch": 7.871914609739827, "loss": 0.19464971125125885, "step": 23600 }, { "ce_loss": 0.0028662015683948994, "epoch": 7.871914609739827, "step": 23600 }, { "distill_loss": 0.15298305451869965, "epoch": 7.871914609739827, "step": 23600 }, { "epoch": 7.871914609739827, "ref_ce_loss": 0.03864739090204239, "step": 23600 }, { "epoch": 7.871914609739827, "loss": 0.5443291664123535, "step": 23600 }, { "ce_loss": 0.04491385072469711, "epoch": 7.871914609739827, "step": 23600 }, { "distill_loss": 0.2117350846529007, "epoch": 7.871914609739827, "step": 23600 }, { "epoch": 7.871914609739827, "ref_ce_loss": 0.049522701650857925, "step": 23600 }, { "epoch": 7.871914609739827, "loss": 0.30889323353767395, "step": 23600 }, { "ce_loss": 0.03567614406347275, "epoch": 7.871914609739827, "step": 23600 }, { "distill_loss": 0.17977787554264069, "epoch": 7.871914609739827, "step": 23600 }, { "epoch": 7.871914609739827, "ref_ce_loss": 0.05225006118416786, "step": 23600 }, { "epoch": 7.871914609739827, "loss": 0.2686649560928345, "step": 23600 }, { "ce_loss": 0.019530225545167923, "epoch": 7.871914609739827, "step": 23600 }, { "distill_loss": 0.20698407292366028, "epoch": 7.871914609739827, "step": 23600 }, { "epoch": 7.871914609739827, "ref_ce_loss": 0.030849579721689224, "step": 23600 }, { "epoch": 7.875250166777852, "loss": 0.3492, "step": 23610 }, { "epoch": 7.875250166777852, "grad_norm": 4.917800426483154, "step": 23610 }, { "epoch": 7.875250166777852, "learning_rate": 1.9126835238569838e-07, "step": 23610 }, { "epoch": 7.875250166777852, "loss": 0.2599946856498718, "step": 23610 }, { "ce_loss": 0.03415341302752495, "epoch": 7.875250166777852, "step": 23610 }, { "distill_loss": 0.14789728820323944, "epoch": 7.875250166777852, "step": 23610 }, { "epoch": 7.875250166777852, "ref_ce_loss": 0.03027048334479332, "step": 23610 }, { "epoch": 7.875250166777852, "loss": 0.14788013696670532, "step": 23610 }, { "ce_loss": 0.012526956386864185, "epoch": 7.875250166777852, "step": 23610 }, { "distill_loss": 0.10251648724079132, "epoch": 7.875250166777852, "step": 23610 }, { "epoch": 7.875250166777852, "ref_ce_loss": 0.03267025575041771, "step": 23610 }, { "epoch": 7.875250166777852, "loss": 0.1945207267999649, "step": 23610 }, { "ce_loss": 0.01578592136502266, "epoch": 7.875250166777852, "step": 23610 }, { "distill_loss": 0.1346457153558731, "epoch": 7.875250166777852, "step": 23610 }, { "epoch": 7.875250166777852, "ref_ce_loss": 0.04352135583758354, "step": 23610 }, { "epoch": 7.875250166777852, "loss": 0.23414014279842377, "step": 23610 }, { "ce_loss": 0.04965338110923767, "epoch": 7.875250166777852, "step": 23610 }, { "distill_loss": 0.1356775015592575, "epoch": 7.875250166777852, "step": 23610 }, { "epoch": 7.875250166777852, "ref_ce_loss": 0.04857119172811508, "step": 23610 }, { "epoch": 7.878585723815878, "loss": 0.2745, "step": 23620 }, { "epoch": 7.878585723815878, "grad_norm": 2.187544822692871, "step": 23620 }, { "epoch": 7.878585723815878, "learning_rate": 1.811788716385043e-07, "step": 23620 }, { "epoch": 7.878585723815878, "loss": 0.39947035908699036, "step": 23620 }, { "ce_loss": 0.009549811482429504, "epoch": 7.878585723815878, "step": 23620 }, { "distill_loss": 0.20651386678218842, "epoch": 7.878585723815878, "step": 23620 }, { "epoch": 7.878585723815878, "ref_ce_loss": 0.05287281423807144, "step": 23620 }, { "epoch": 7.878585723815878, "loss": 0.4136260747909546, "step": 23620 }, { "ce_loss": 0.05234153941273689, "epoch": 7.878585723815878, "step": 23620 }, { "distill_loss": 0.2478913813829422, "epoch": 7.878585723815878, "step": 23620 }, { "epoch": 7.878585723815878, "ref_ce_loss": 0.046148356050252914, "step": 23620 }, { "epoch": 7.878585723815878, "loss": 0.25423839688301086, "step": 23620 }, { "ce_loss": 0.03949936106801033, "epoch": 7.878585723815878, "step": 23620 }, { "distill_loss": 0.15055415034294128, "epoch": 7.878585723815878, "step": 23620 }, { "epoch": 7.878585723815878, "ref_ce_loss": 0.05031810700893402, "step": 23620 }, { "epoch": 7.878585723815878, "loss": 0.2075003683567047, "step": 23620 }, { "ce_loss": 0.020473573356866837, "epoch": 7.878585723815878, "step": 23620 }, { "distill_loss": 0.11563518643379211, "epoch": 7.878585723815878, "step": 23620 }, { "epoch": 7.878585723815878, "ref_ce_loss": 0.040492944419384, "step": 23620 }, { "epoch": 7.881921280853903, "loss": 0.3381, "step": 23630 }, { "epoch": 7.881921280853903, "grad_norm": 2.8946211338043213, "step": 23630 }, { "epoch": 7.881921280853903, "learning_rate": 1.7136260132235568e-07, "step": 23630 }, { "epoch": 7.881921280853903, "loss": 0.3676464557647705, "step": 23630 }, { "ce_loss": 0.04826905578374863, "epoch": 7.881921280853903, "step": 23630 }, { "distill_loss": 0.22701773047447205, "epoch": 7.881921280853903, "step": 23630 }, { "epoch": 7.881921280853903, "ref_ce_loss": 0.06122293695807457, "step": 23630 }, { "epoch": 7.881921280853903, "loss": 0.2739025950431824, "step": 23630 }, { "ce_loss": 0.05898267775774002, "epoch": 7.881921280853903, "step": 23630 }, { "distill_loss": 0.15476536750793457, "epoch": 7.881921280853903, "step": 23630 }, { "epoch": 7.881921280853903, "ref_ce_loss": 0.0593196265399456, "step": 23630 }, { "epoch": 7.881921280853903, "loss": 0.2718009948730469, "step": 23630 }, { "ce_loss": 0.02158510684967041, "epoch": 7.881921280853903, "step": 23630 }, { "distill_loss": 0.151712566614151, "epoch": 7.881921280853903, "step": 23630 }, { "epoch": 7.881921280853903, "ref_ce_loss": 0.03668779134750366, "step": 23630 }, { "epoch": 7.881921280853903, "loss": 0.18738394975662231, "step": 23630 }, { "ce_loss": 0.03252572566270828, "epoch": 7.881921280853903, "step": 23630 }, { "distill_loss": 0.11352349817752838, "epoch": 7.881921280853903, "step": 23630 }, { "epoch": 7.881921280853903, "ref_ce_loss": 0.031237930059432983, "step": 23630 }, { "epoch": 7.885256837891928, "loss": 0.3456, "step": 23640 }, { "epoch": 7.885256837891928, "grad_norm": 3.666982650756836, "step": 23640 }, { "epoch": 7.885256837891928, "learning_rate": 1.6181955933824987e-07, "step": 23640 }, { "epoch": 7.885256837891928, "loss": 0.2636154890060425, "step": 23640 }, { "ce_loss": 0.015169094316661358, "epoch": 7.885256837891928, "step": 23640 }, { "distill_loss": 0.19849561154842377, "epoch": 7.885256837891928, "step": 23640 }, { "epoch": 7.885256837891928, "ref_ce_loss": 0.028414104133844376, "step": 23640 }, { "epoch": 7.885256837891928, "loss": 0.24772925674915314, "step": 23640 }, { "ce_loss": 0.010222217999398708, "epoch": 7.885256837891928, "step": 23640 }, { "distill_loss": 0.174020454287529, "epoch": 7.885256837891928, "step": 23640 }, { "epoch": 7.885256837891928, "ref_ce_loss": 0.03140445426106453, "step": 23640 }, { "epoch": 7.885256837891928, "loss": 0.36517012119293213, "step": 23640 }, { "ce_loss": 0.04773040488362312, "epoch": 7.885256837891928, "step": 23640 }, { "distill_loss": 0.23224672675132751, "epoch": 7.885256837891928, "step": 23640 }, { "epoch": 7.885256837891928, "ref_ce_loss": 0.08495187014341354, "step": 23640 }, { "epoch": 7.885256837891928, "loss": 0.31244736909866333, "step": 23640 }, { "ce_loss": 0.061612311750650406, "epoch": 7.885256837891928, "step": 23640 }, { "distill_loss": 0.15047255158424377, "epoch": 7.885256837891928, "step": 23640 }, { "epoch": 7.885256837891928, "ref_ce_loss": 0.06343311071395874, "step": 23640 }, { "epoch": 7.888592394929954, "loss": 0.3154, "step": 23650 }, { "epoch": 7.888592394929954, "grad_norm": 3.4513959884643555, "step": 23650 }, { "epoch": 7.888592394929954, "learning_rate": 1.5254976308891608e-07, "step": 23650 }, { "epoch": 7.888592394929954, "loss": 0.266707181930542, "step": 23650 }, { "ce_loss": 0.011998561210930347, "epoch": 7.888592394929954, "step": 23650 }, { "distill_loss": 0.18357567489147186, "epoch": 7.888592394929954, "step": 23650 }, { "epoch": 7.888592394929954, "ref_ce_loss": 0.04288625344634056, "step": 23650 }, { "epoch": 7.888592394929954, "loss": 0.24832196533679962, "step": 23650 }, { "ce_loss": 0.01114476379007101, "epoch": 7.888592394929954, "step": 23650 }, { "distill_loss": 0.1261119693517685, "epoch": 7.888592394929954, "step": 23650 }, { "epoch": 7.888592394929954, "ref_ce_loss": 0.05986611172556877, "step": 23650 }, { "epoch": 7.888592394929954, "loss": 0.20678839087486267, "step": 23650 }, { "ce_loss": 0.008814916014671326, "epoch": 7.888592394929954, "step": 23650 }, { "distill_loss": 0.1221609115600586, "epoch": 7.888592394929954, "step": 23650 }, { "epoch": 7.888592394929954, "ref_ce_loss": 0.03747597709298134, "step": 23650 }, { "epoch": 7.888592394929954, "loss": 0.2413693517446518, "step": 23650 }, { "ce_loss": 0.020267309620976448, "epoch": 7.888592394929954, "step": 23650 }, { "distill_loss": 0.14778374135494232, "epoch": 7.888592394929954, "step": 23650 }, { "epoch": 7.888592394929954, "ref_ce_loss": 0.0390714630484581, "step": 23650 }, { "epoch": 7.891927951967979, "loss": 0.294, "step": 23660 }, { "epoch": 7.891927951967979, "grad_norm": 3.68865704536438, "step": 23660 }, { "epoch": 7.891927951967979, "learning_rate": 1.435532294788322e-07, "step": 23660 }, { "epoch": 7.891927951967979, "loss": 0.2128949910402298, "step": 23660 }, { "ce_loss": 0.004833103623241186, "epoch": 7.891927951967979, "step": 23660 }, { "distill_loss": 0.158272847533226, "epoch": 7.891927951967979, "step": 23660 }, { "epoch": 7.891927951967979, "ref_ce_loss": 0.027962271124124527, "step": 23660 }, { "epoch": 7.891927951967979, "loss": 0.3560689091682434, "step": 23660 }, { "ce_loss": 0.03083612397313118, "epoch": 7.891927951967979, "step": 23660 }, { "distill_loss": 0.17447644472122192, "epoch": 7.891927951967979, "step": 23660 }, { "epoch": 7.891927951967979, "ref_ce_loss": 0.05506594106554985, "step": 23660 }, { "epoch": 7.891927951967979, "loss": 0.38912326097488403, "step": 23660 }, { "ce_loss": 0.068735271692276, "epoch": 7.891927951967979, "step": 23660 }, { "distill_loss": 0.16929185390472412, "epoch": 7.891927951967979, "step": 23660 }, { "epoch": 7.891927951967979, "ref_ce_loss": 0.062119368463754654, "step": 23660 }, { "epoch": 7.891927951967979, "loss": 0.2961581349372864, "step": 23660 }, { "ce_loss": 0.032267630100250244, "epoch": 7.891927951967979, "step": 23660 }, { "distill_loss": 0.1588132381439209, "epoch": 7.891927951967979, "step": 23660 }, { "epoch": 7.891927951967979, "ref_ce_loss": 0.04926518350839615, "step": 23660 }, { "epoch": 7.895263509006004, "loss": 0.2938, "step": 23670 }, { "epoch": 7.895263509006004, "grad_norm": 3.3204846382141113, "step": 23670 }, { "epoch": 7.895263509006004, "learning_rate": 1.3482997491410796e-07, "step": 23670 }, { "epoch": 7.895263509006004, "loss": 0.2511689364910126, "step": 23670 }, { "ce_loss": 0.028505954891443253, "epoch": 7.895263509006004, "step": 23670 }, { "distill_loss": 0.11724421381950378, "epoch": 7.895263509006004, "step": 23670 }, { "epoch": 7.895263509006004, "ref_ce_loss": 0.05377810075879097, "step": 23670 }, { "epoch": 7.895263509006004, "loss": 0.33648693561553955, "step": 23670 }, { "ce_loss": 0.0159254539757967, "epoch": 7.895263509006004, "step": 23670 }, { "distill_loss": 0.12790490686893463, "epoch": 7.895263509006004, "step": 23670 }, { "epoch": 7.895263509006004, "ref_ce_loss": 0.042552024126052856, "step": 23670 }, { "epoch": 7.895263509006004, "loss": 0.3551349341869354, "step": 23670 }, { "ce_loss": 0.03814506158232689, "epoch": 7.895263509006004, "step": 23670 }, { "distill_loss": 0.15532177686691284, "epoch": 7.895263509006004, "step": 23670 }, { "epoch": 7.895263509006004, "ref_ce_loss": 0.04683928191661835, "step": 23670 }, { "epoch": 7.895263509006004, "loss": 0.3603041470050812, "step": 23670 }, { "ce_loss": 0.028399253264069557, "epoch": 7.895263509006004, "step": 23670 }, { "distill_loss": 0.2274635136127472, "epoch": 7.895263509006004, "step": 23670 }, { "epoch": 7.895263509006004, "ref_ce_loss": 0.052271027117967606, "step": 23670 }, { "epoch": 7.89859906604403, "loss": 0.309, "step": 23680 }, { "epoch": 7.89859906604403, "grad_norm": 3.2109603881835938, "step": 23680 }, { "epoch": 7.89859906604403, "learning_rate": 1.263800153025185e-07, "step": 23680 }, { "epoch": 7.89859906604403, "loss": 0.43084007501602173, "step": 23680 }, { "ce_loss": 0.005710463039577007, "epoch": 7.89859906604403, "step": 23680 }, { "distill_loss": 0.13513536751270294, "epoch": 7.89859906604403, "step": 23680 }, { "epoch": 7.89859906604403, "ref_ce_loss": 0.03500951826572418, "step": 23680 }, { "epoch": 7.89859906604403, "loss": 0.5318657159805298, "step": 23680 }, { "ce_loss": 0.007542737293988466, "epoch": 7.89859906604403, "step": 23680 }, { "distill_loss": 0.1139644905924797, "epoch": 7.89859906604403, "step": 23680 }, { "epoch": 7.89859906604403, "ref_ce_loss": 0.052930302917957306, "step": 23680 }, { "epoch": 7.89859906604403, "loss": 0.2502293288707733, "step": 23680 }, { "ce_loss": 0.0355820469558239, "epoch": 7.89859906604403, "step": 23680 }, { "distill_loss": 0.11790099740028381, "epoch": 7.89859906604403, "step": 23680 }, { "epoch": 7.89859906604403, "ref_ce_loss": 0.06142692267894745, "step": 23680 }, { "epoch": 7.89859906604403, "loss": 0.36094042658805847, "step": 23680 }, { "ce_loss": 0.013484828174114227, "epoch": 7.89859906604403, "step": 23680 }, { "distill_loss": 0.16219370067119598, "epoch": 7.89859906604403, "step": 23680 }, { "epoch": 7.89859906604403, "ref_ce_loss": 0.0636187344789505, "step": 23680 }, { "epoch": 7.901934623082055, "loss": 0.337, "step": 23690 }, { "epoch": 7.901934623082055, "grad_norm": 2.8887805938720703, "step": 23690 }, { "epoch": 7.901934623082055, "learning_rate": 1.1820336605347092e-07, "step": 23690 }, { "epoch": 7.901934623082055, "loss": 0.4276142120361328, "step": 23690 }, { "ce_loss": 0.06052268669009209, "epoch": 7.901934623082055, "step": 23690 }, { "distill_loss": 0.19310811161994934, "epoch": 7.901934623082055, "step": 23690 }, { "epoch": 7.901934623082055, "ref_ce_loss": 0.07668992877006531, "step": 23690 }, { "epoch": 7.901934623082055, "loss": 0.20680920779705048, "step": 23690 }, { "ce_loss": 0.022963564842939377, "epoch": 7.901934623082055, "step": 23690 }, { "distill_loss": 0.11855762451887131, "epoch": 7.901934623082055, "step": 23690 }, { "epoch": 7.901934623082055, "ref_ce_loss": 0.06482337415218353, "step": 23690 }, { "epoch": 7.901934623082055, "loss": 0.29407379031181335, "step": 23690 }, { "ce_loss": 0.026345551013946533, "epoch": 7.901934623082055, "step": 23690 }, { "distill_loss": 0.15834666788578033, "epoch": 7.901934623082055, "step": 23690 }, { "epoch": 7.901934623082055, "ref_ce_loss": 0.047213222831487656, "step": 23690 }, { "epoch": 7.901934623082055, "loss": 0.16668519377708435, "step": 23690 }, { "ce_loss": 0.010949512012302876, "epoch": 7.901934623082055, "step": 23690 }, { "distill_loss": 0.10162533074617386, "epoch": 7.901934623082055, "step": 23690 }, { "epoch": 7.901934623082055, "ref_ce_loss": 0.03732268139719963, "step": 23690 }, { "epoch": 7.90527018012008, "loss": 0.3171, "step": 23700 }, { "epoch": 7.90527018012008, "grad_norm": 2.6990199089050293, "step": 23700 }, { "epoch": 7.90527018012008, "learning_rate": 1.1030004207793763e-07, "step": 23700 }, { "epoch": 7.90527018012008, "loss": 0.18577899038791656, "step": 23700 }, { "ce_loss": 0.03319680318236351, "epoch": 7.90527018012008, "step": 23700 }, { "distill_loss": 0.09356150031089783, "epoch": 7.90527018012008, "step": 23700 }, { "epoch": 7.90527018012008, "ref_ce_loss": 0.03948800638318062, "step": 23700 }, { "epoch": 7.90527018012008, "loss": 0.2524036467075348, "step": 23700 }, { "ce_loss": 0.024047965183854103, "epoch": 7.90527018012008, "step": 23700 }, { "distill_loss": 0.12219231575727463, "epoch": 7.90527018012008, "step": 23700 }, { "epoch": 7.90527018012008, "ref_ce_loss": 0.07101521641016006, "step": 23700 }, { "epoch": 7.90527018012008, "loss": 0.1832265704870224, "step": 23700 }, { "ce_loss": 0.012943029403686523, "epoch": 7.90527018012008, "step": 23700 }, { "distill_loss": 0.11475443094968796, "epoch": 7.90527018012008, "step": 23700 }, { "epoch": 7.90527018012008, "ref_ce_loss": 0.043082673102617264, "step": 23700 }, { "epoch": 7.90527018012008, "loss": 0.36426877975463867, "step": 23700 }, { "ce_loss": 0.0561620257794857, "epoch": 7.90527018012008, "step": 23700 }, { "distill_loss": 0.1449742615222931, "epoch": 7.90527018012008, "step": 23700 }, { "epoch": 7.90527018012008, "ref_ce_loss": 0.07117841392755508, "step": 23700 }, { "epoch": 7.908605737158106, "loss": 0.3003, "step": 23710 }, { "epoch": 7.908605737158106, "grad_norm": 3.6066665649414062, "step": 23710 }, { "epoch": 7.908605737158106, "learning_rate": 1.0267005778847315e-07, "step": 23710 }, { "epoch": 7.908605737158106, "loss": 0.2224166840314865, "step": 23710 }, { "ce_loss": 0.04008515179157257, "epoch": 7.908605737158106, "step": 23710 }, { "distill_loss": 0.1341482698917389, "epoch": 7.908605737158106, "step": 23710 }, { "epoch": 7.908605737158106, "ref_ce_loss": 0.04800305888056755, "step": 23710 }, { "epoch": 7.908605737158106, "loss": 0.337680459022522, "step": 23710 }, { "ce_loss": 0.025626100599765778, "epoch": 7.908605737158106, "step": 23710 }, { "distill_loss": 0.19002515077590942, "epoch": 7.908605737158106, "step": 23710 }, { "epoch": 7.908605737158106, "ref_ce_loss": 0.07956057786941528, "step": 23710 }, { "epoch": 7.908605737158106, "loss": 0.33845025300979614, "step": 23710 }, { "ce_loss": 0.034407805651426315, "epoch": 7.908605737158106, "step": 23710 }, { "distill_loss": 0.24543067812919617, "epoch": 7.908605737158106, "step": 23710 }, { "epoch": 7.908605737158106, "ref_ce_loss": 0.058323897421360016, "step": 23710 }, { "epoch": 7.908605737158106, "loss": 0.293621301651001, "step": 23710 }, { "ce_loss": 0.008468334563076496, "epoch": 7.908605737158106, "step": 23710 }, { "distill_loss": 0.2118399441242218, "epoch": 7.908605737158106, "step": 23710 }, { "epoch": 7.908605737158106, "ref_ce_loss": 0.03856439143419266, "step": 23710 }, { "epoch": 7.911941294196131, "loss": 0.3429, "step": 23720 }, { "epoch": 7.911941294196131, "grad_norm": 4.594830513000488, "step": 23720 }, { "epoch": 7.911941294196131, "learning_rate": 9.53134270991307e-08, "step": 23720 }, { "epoch": 7.911941294196131, "loss": 0.2774868309497833, "step": 23720 }, { "ce_loss": 0.07063006609678268, "epoch": 7.911941294196131, "step": 23720 }, { "distill_loss": 0.12501201033592224, "epoch": 7.911941294196131, "step": 23720 }, { "epoch": 7.911941294196131, "ref_ce_loss": 0.0638425201177597, "step": 23720 }, { "epoch": 7.911941294196131, "loss": 0.23774346709251404, "step": 23720 }, { "ce_loss": 0.008322431705892086, "epoch": 7.911941294196131, "step": 23720 }, { "distill_loss": 0.1158476173877716, "epoch": 7.911941294196131, "step": 23720 }, { "epoch": 7.911941294196131, "ref_ce_loss": 0.04607711732387543, "step": 23720 }, { "epoch": 7.911941294196131, "loss": 0.5005284547805786, "step": 23720 }, { "ce_loss": 0.10049159824848175, "epoch": 7.911941294196131, "step": 23720 }, { "distill_loss": 0.25245940685272217, "epoch": 7.911941294196131, "step": 23720 }, { "epoch": 7.911941294196131, "ref_ce_loss": 0.034680433571338654, "step": 23720 }, { "epoch": 7.911941294196131, "loss": 0.3965238332748413, "step": 23720 }, { "ce_loss": 0.08287633210420609, "epoch": 7.911941294196131, "step": 23720 }, { "distill_loss": 0.21766790747642517, "epoch": 7.911941294196131, "step": 23720 }, { "epoch": 7.911941294196131, "ref_ce_loss": 0.047307275235652924, "step": 23720 }, { "epoch": 7.9152768512341565, "loss": 0.285, "step": 23730 }, { "epoch": 7.9152768512341565, "grad_norm": 3.145902156829834, "step": 23730 }, { "epoch": 7.9152768512341565, "learning_rate": 8.823016342554557e-08, "step": 23730 }, { "epoch": 7.9152768512341565, "loss": 0.2347753643989563, "step": 23730 }, { "ce_loss": 0.010778088122606277, "epoch": 7.9152768512341565, "step": 23730 }, { "distill_loss": 0.14141809940338135, "epoch": 7.9152768512341565, "step": 23730 }, { "epoch": 7.9152768512341565, "ref_ce_loss": 0.032311439514160156, "step": 23730 }, { "epoch": 7.9152768512341565, "loss": 0.3499191105365753, "step": 23730 }, { "ce_loss": 0.10165178775787354, "epoch": 7.9152768512341565, "step": 23730 }, { "distill_loss": 0.1755722612142563, "epoch": 7.9152768512341565, "step": 23730 }, { "epoch": 7.9152768512341565, "ref_ce_loss": 0.07252631336450577, "step": 23730 }, { "epoch": 7.9152768512341565, "loss": 0.4272885322570801, "step": 23730 }, { "ce_loss": 0.014664656482636929, "epoch": 7.9152768512341565, "step": 23730 }, { "distill_loss": 0.29934969544410706, "epoch": 7.9152768512341565, "step": 23730 }, { "epoch": 7.9152768512341565, "ref_ce_loss": 0.06960371136665344, "step": 23730 }, { "epoch": 7.9152768512341565, "loss": 0.14407768845558167, "step": 23730 }, { "ce_loss": 0.009971325285732746, "epoch": 7.9152768512341565, "step": 23730 }, { "distill_loss": 0.11345706135034561, "epoch": 7.9152768512341565, "step": 23730 }, { "epoch": 7.9152768512341565, "ref_ce_loss": 0.0139697827398777, "step": 23730 }, { "epoch": 7.918612408272182, "loss": 0.3189, "step": 23740 }, { "epoch": 7.918612408272182, "grad_norm": 3.545828104019165, "step": 23740 }, { "epoch": 7.918612408272182, "learning_rate": 8.14202796847685e-08, "step": 23740 }, { "epoch": 7.918612408272182, "loss": 0.17268899083137512, "step": 23740 }, { "ce_loss": 0.02548079378902912, "epoch": 7.918612408272182, "step": 23740 }, { "distill_loss": 0.10526704788208008, "epoch": 7.918612408272182, "step": 23740 }, { "epoch": 7.918612408272182, "ref_ce_loss": 0.0418148934841156, "step": 23740 }, { "epoch": 7.918612408272182, "loss": 0.19164453446865082, "step": 23740 }, { "ce_loss": 0.004120680969208479, "epoch": 7.918612408272182, "step": 23740 }, { "distill_loss": 0.1119823306798935, "epoch": 7.918612408272182, "step": 23740 }, { "epoch": 7.918612408272182, "ref_ce_loss": 0.0482141338288784, "step": 23740 }, { "epoch": 7.918612408272182, "loss": 0.39481455087661743, "step": 23740 }, { "ce_loss": 0.0830196812748909, "epoch": 7.918612408272182, "step": 23740 }, { "distill_loss": 0.24142134189605713, "epoch": 7.918612408272182, "step": 23740 }, { "epoch": 7.918612408272182, "ref_ce_loss": 0.030993618071079254, "step": 23740 }, { "epoch": 7.918612408272182, "loss": 0.31761011481285095, "step": 23740 }, { "ce_loss": 0.04722367972135544, "epoch": 7.918612408272182, "step": 23740 }, { "distill_loss": 0.17737168073654175, "epoch": 7.918612408272182, "step": 23740 }, { "epoch": 7.918612408272182, "ref_ce_loss": 0.07376193255186081, "step": 23740 }, { "epoch": 7.921947965310207, "loss": 0.2852, "step": 23750 }, { "epoch": 7.921947965310207, "grad_norm": 2.1956324577331543, "step": 23750 }, { "epoch": 7.921947965310207, "learning_rate": 7.488378829534903e-08, "step": 23750 }, { "epoch": 7.921947965310207, "loss": 0.45576271414756775, "step": 23750 }, { "ce_loss": 0.05745376646518707, "epoch": 7.921947965310207, "step": 23750 }, { "distill_loss": 0.26547667384147644, "epoch": 7.921947965310207, "step": 23750 }, { "epoch": 7.921947965310207, "ref_ce_loss": 0.06741317361593246, "step": 23750 }, { "epoch": 7.921947965310207, "loss": 0.3387796878814697, "step": 23750 }, { "ce_loss": 0.03637593984603882, "epoch": 7.921947965310207, "step": 23750 }, { "distill_loss": 0.19918107986450195, "epoch": 7.921947965310207, "step": 23750 }, { "epoch": 7.921947965310207, "ref_ce_loss": 0.05107717961072922, "step": 23750 }, { "epoch": 7.921947965310207, "loss": 0.2869316637516022, "step": 23750 }, { "ce_loss": 0.015967372804880142, "epoch": 7.921947965310207, "step": 23750 }, { "distill_loss": 0.12445250153541565, "epoch": 7.921947965310207, "step": 23750 }, { "epoch": 7.921947965310207, "ref_ce_loss": 0.04145001992583275, "step": 23750 }, { "epoch": 7.921947965310207, "loss": 0.31970643997192383, "step": 23750 }, { "ce_loss": 0.03916800767183304, "epoch": 7.921947965310207, "step": 23750 }, { "distill_loss": 0.14288796484470367, "epoch": 7.921947965310207, "step": 23750 }, { "epoch": 7.921947965310207, "ref_ce_loss": 0.051019471138715744, "step": 23750 }, { "epoch": 7.9252835223482325, "loss": 0.3092, "step": 23760 }, { "epoch": 7.9252835223482325, "grad_norm": 4.615090370178223, "step": 23760 }, { "epoch": 7.9252835223482325, "learning_rate": 6.862070117725216e-08, "step": 23760 }, { "epoch": 7.9252835223482325, "loss": 0.32668691873550415, "step": 23760 }, { "ce_loss": 0.03978271782398224, "epoch": 7.9252835223482325, "step": 23760 }, { "distill_loss": 0.1851176917552948, "epoch": 7.9252835223482325, "step": 23760 }, { "epoch": 7.9252835223482325, "ref_ce_loss": 0.050581347197294235, "step": 23760 }, { "epoch": 7.9252835223482325, "loss": 0.2914896607398987, "step": 23760 }, { "ce_loss": 0.0444149523973465, "epoch": 7.9252835223482325, "step": 23760 }, { "distill_loss": 0.14117684960365295, "epoch": 7.9252835223482325, "step": 23760 }, { "epoch": 7.9252835223482325, "ref_ce_loss": 0.037499092519283295, "step": 23760 }, { "epoch": 7.9252835223482325, "loss": 0.3269153833389282, "step": 23760 }, { "ce_loss": 0.004225606564432383, "epoch": 7.9252835223482325, "step": 23760 }, { "distill_loss": 0.20178727805614471, "epoch": 7.9252835223482325, "step": 23760 }, { "epoch": 7.9252835223482325, "ref_ce_loss": 0.06147385388612747, "step": 23760 }, { "epoch": 7.9252835223482325, "loss": 0.342536985874176, "step": 23760 }, { "ce_loss": 0.013994453474879265, "epoch": 7.9252835223482325, "step": 23760 }, { "distill_loss": 0.25507158041000366, "epoch": 7.9252835223482325, "step": 23760 }, { "epoch": 7.9252835223482325, "ref_ce_loss": 0.05506007373332977, "step": 23760 }, { "epoch": 7.928619079386258, "loss": 0.3146, "step": 23770 }, { "epoch": 7.928619079386258, "grad_norm": 3.3470752239227295, "step": 23770 }, { "epoch": 7.928619079386258, "learning_rate": 6.263102975190837e-08, "step": 23770 }, { "epoch": 7.928619079386258, "loss": 0.395652711391449, "step": 23770 }, { "ce_loss": 0.08911252021789551, "epoch": 7.928619079386258, "step": 23770 }, { "distill_loss": 0.21665403246879578, "epoch": 7.928619079386258, "step": 23770 }, { "epoch": 7.928619079386258, "ref_ce_loss": 0.07107770442962646, "step": 23770 }, { "epoch": 7.928619079386258, "loss": 0.3040449619293213, "step": 23770 }, { "ce_loss": 0.015351009555161, "epoch": 7.928619079386258, "step": 23770 }, { "distill_loss": 0.2228090912103653, "epoch": 7.928619079386258, "step": 23770 }, { "epoch": 7.928619079386258, "ref_ce_loss": 0.04557941108942032, "step": 23770 }, { "epoch": 7.928619079386258, "loss": 0.23056939244270325, "step": 23770 }, { "ce_loss": 0.023988118395209312, "epoch": 7.928619079386258, "step": 23770 }, { "distill_loss": 0.13429005444049835, "epoch": 7.928619079386258, "step": 23770 }, { "epoch": 7.928619079386258, "ref_ce_loss": 0.07207389920949936, "step": 23770 }, { "epoch": 7.928619079386258, "loss": 0.28277847170829773, "step": 23770 }, { "ce_loss": 0.046883635222911835, "epoch": 7.928619079386258, "step": 23770 }, { "distill_loss": 0.12595364451408386, "epoch": 7.928619079386258, "step": 23770 }, { "epoch": 7.928619079386258, "ref_ce_loss": 0.04364423453807831, "step": 23770 }, { "epoch": 7.931954636424283, "loss": 0.3274, "step": 23780 }, { "epoch": 7.931954636424283, "grad_norm": 3.726837158203125, "step": 23780 }, { "epoch": 7.931954636424283, "learning_rate": 5.6914784942097004e-08, "step": 23780 }, { "epoch": 7.931954636424283, "loss": 0.2981261909008026, "step": 23780 }, { "ce_loss": 0.010031945072114468, "epoch": 7.931954636424283, "step": 23780 }, { "distill_loss": 0.22223161160945892, "epoch": 7.931954636424283, "step": 23780 }, { "epoch": 7.931954636424283, "ref_ce_loss": 0.040669411420822144, "step": 23780 }, { "epoch": 7.931954636424283, "loss": 0.2453022301197052, "step": 23780 }, { "ce_loss": 0.034803520888090134, "epoch": 7.931954636424283, "step": 23780 }, { "distill_loss": 0.16121885180473328, "epoch": 7.931954636424283, "step": 23780 }, { "epoch": 7.931954636424283, "ref_ce_loss": 0.04921687766909599, "step": 23780 }, { "epoch": 7.931954636424283, "loss": 0.3023455739021301, "step": 23780 }, { "ce_loss": 0.05195079371333122, "epoch": 7.931954636424283, "step": 23780 }, { "distill_loss": 0.15900222957134247, "epoch": 7.931954636424283, "step": 23780 }, { "epoch": 7.931954636424283, "ref_ce_loss": 0.04697020724415779, "step": 23780 }, { "epoch": 7.931954636424283, "loss": 0.20833349227905273, "step": 23780 }, { "ce_loss": 0.02568138763308525, "epoch": 7.931954636424283, "step": 23780 }, { "distill_loss": 0.11793918907642365, "epoch": 7.931954636424283, "step": 23780 }, { "epoch": 7.931954636424283, "ref_ce_loss": 0.042938802391290665, "step": 23780 }, { "epoch": 7.935290193462309, "loss": 0.3196, "step": 23790 }, { "epoch": 7.935290193462309, "grad_norm": 3.1810896396636963, "step": 23790 }, { "epoch": 7.935290193462309, "learning_rate": 5.14719771720129e-08, "step": 23790 }, { "epoch": 7.935290193462309, "loss": 0.4055737555027008, "step": 23790 }, { "ce_loss": 0.06018650531768799, "epoch": 7.935290193462309, "step": 23790 }, { "distill_loss": 0.22380036115646362, "epoch": 7.935290193462309, "step": 23790 }, { "epoch": 7.935290193462309, "ref_ce_loss": 0.098446786403656, "step": 23790 }, { "epoch": 7.935290193462309, "loss": 0.24172917008399963, "step": 23790 }, { "ce_loss": 0.029822878539562225, "epoch": 7.935290193462309, "step": 23790 }, { "distill_loss": 0.15046468377113342, "epoch": 7.935290193462309, "step": 23790 }, { "epoch": 7.935290193462309, "ref_ce_loss": 0.04222504422068596, "step": 23790 }, { "epoch": 7.935290193462309, "loss": 0.19609327614307404, "step": 23790 }, { "ce_loss": 0.0036498629488050938, "epoch": 7.935290193462309, "step": 23790 }, { "distill_loss": 0.11934056133031845, "epoch": 7.935290193462309, "step": 23790 }, { "epoch": 7.935290193462309, "ref_ce_loss": 0.04515659064054489, "step": 23790 }, { "epoch": 7.935290193462309, "loss": 0.6010411977767944, "step": 23790 }, { "ce_loss": 0.04046886786818504, "epoch": 7.935290193462309, "step": 23790 }, { "distill_loss": 0.225779190659523, "epoch": 7.935290193462309, "step": 23790 }, { "epoch": 7.935290193462309, "ref_ce_loss": 0.0491827093064785, "step": 23790 }, { "epoch": 7.938625750500334, "loss": 0.3412, "step": 23800 }, { "epoch": 7.938625750500334, "grad_norm": 2.477982521057129, "step": 23800 }, { "epoch": 7.938625750500334, "learning_rate": 4.6302616367149824e-08, "step": 23800 }, { "epoch": 7.938625750500334, "loss": 0.5360799431800842, "step": 23800 }, { "ce_loss": 0.028202766552567482, "epoch": 7.938625750500334, "step": 23800 }, { "distill_loss": 0.30352216958999634, "epoch": 7.938625750500334, "step": 23800 }, { "epoch": 7.938625750500334, "ref_ce_loss": 0.04931466281414032, "step": 23800 }, { "epoch": 7.938625750500334, "loss": 0.24504601955413818, "step": 23800 }, { "ce_loss": 0.053709227591753006, "epoch": 7.938625750500334, "step": 23800 }, { "distill_loss": 0.1536063253879547, "epoch": 7.938625750500334, "step": 23800 }, { "epoch": 7.938625750500334, "ref_ce_loss": 0.03759874030947685, "step": 23800 }, { "epoch": 7.938625750500334, "loss": 0.17871206998825073, "step": 23800 }, { "ce_loss": 0.01082976907491684, "epoch": 7.938625750500334, "step": 23800 }, { "distill_loss": 0.09340672194957733, "epoch": 7.938625750500334, "step": 23800 }, { "epoch": 7.938625750500334, "ref_ce_loss": 0.018215442076325417, "step": 23800 }, { "epoch": 7.938625750500334, "loss": 0.41664665937423706, "step": 23800 }, { "ce_loss": 0.007084306795150042, "epoch": 7.938625750500334, "step": 23800 }, { "distill_loss": 0.18884873390197754, "epoch": 7.938625750500334, "step": 23800 }, { "epoch": 7.938625750500334, "ref_ce_loss": 0.034425389021635056, "step": 23800 }, { "epoch": 7.941961307538359, "loss": 0.3242, "step": 23810 }, { "epoch": 7.941961307538359, "grad_norm": 2.6601004600524902, "step": 23810 }, { "epoch": 7.941961307538359, "learning_rate": 4.140671195443368e-08, "step": 23810 }, { "epoch": 7.941961307538359, "loss": 0.17185519635677338, "step": 23810 }, { "ce_loss": 0.004476575180888176, "epoch": 7.941961307538359, "step": 23810 }, { "distill_loss": 0.08696437627077103, "epoch": 7.941961307538359, "step": 23810 }, { "epoch": 7.941961307538359, "ref_ce_loss": 0.03971419855952263, "step": 23810 }, { "epoch": 7.941961307538359, "loss": 0.4425716996192932, "step": 23810 }, { "ce_loss": 0.012841652147471905, "epoch": 7.941961307538359, "step": 23810 }, { "distill_loss": 0.21129585802555084, "epoch": 7.941961307538359, "step": 23810 }, { "epoch": 7.941961307538359, "ref_ce_loss": 0.04600348696112633, "step": 23810 }, { "epoch": 7.941961307538359, "loss": 0.3691619038581848, "step": 23810 }, { "ce_loss": 0.08319517225027084, "epoch": 7.941961307538359, "step": 23810 }, { "distill_loss": 0.1942371129989624, "epoch": 7.941961307538359, "step": 23810 }, { "epoch": 7.941961307538359, "ref_ce_loss": 0.06677474826574326, "step": 23810 }, { "epoch": 7.941961307538359, "loss": 0.3194017708301544, "step": 23810 }, { "ce_loss": 0.04230117425322533, "epoch": 7.941961307538359, "step": 23810 }, { "distill_loss": 0.14899131655693054, "epoch": 7.941961307538359, "step": 23810 }, { "epoch": 7.941961307538359, "ref_ce_loss": 0.05892167240381241, "step": 23810 }, { "epoch": 7.945296864576385, "loss": 0.3391, "step": 23820 }, { "epoch": 7.945296864576385, "grad_norm": 3.832427978515625, "step": 23820 }, { "epoch": 7.945296864576385, "learning_rate": 3.678427286202268e-08, "step": 23820 }, { "epoch": 7.945296864576385, "loss": 0.4109659790992737, "step": 23820 }, { "ce_loss": 0.0191717017441988, "epoch": 7.945296864576385, "step": 23820 }, { "distill_loss": 0.14880934357643127, "epoch": 7.945296864576385, "step": 23820 }, { "epoch": 7.945296864576385, "ref_ce_loss": 0.021254807710647583, "step": 23820 }, { "epoch": 7.945296864576385, "loss": 0.33141595125198364, "step": 23820 }, { "ce_loss": 0.04109518229961395, "epoch": 7.945296864576385, "step": 23820 }, { "distill_loss": 0.24267660081386566, "epoch": 7.945296864576385, "step": 23820 }, { "epoch": 7.945296864576385, "ref_ce_loss": 0.04730328917503357, "step": 23820 }, { "epoch": 7.945296864576385, "loss": 0.18887729942798615, "step": 23820 }, { "ce_loss": 0.017927071079611778, "epoch": 7.945296864576385, "step": 23820 }, { "distill_loss": 0.12233343720436096, "epoch": 7.945296864576385, "step": 23820 }, { "epoch": 7.945296864576385, "ref_ce_loss": 0.04846320301294327, "step": 23820 }, { "epoch": 7.945296864576385, "loss": 0.37258389592170715, "step": 23820 }, { "ce_loss": 0.011785218492150307, "epoch": 7.945296864576385, "step": 23820 }, { "distill_loss": 0.12698237597942352, "epoch": 7.945296864576385, "step": 23820 }, { "epoch": 7.945296864576385, "ref_ce_loss": 0.07257344573736191, "step": 23820 }, { "epoch": 7.94863242161441, "loss": 0.3368, "step": 23830 }, { "epoch": 7.94863242161441, "grad_norm": 2.7429282665252686, "step": 23830 }, { "epoch": 7.94863242161441, "learning_rate": 3.243530751944057e-08, "step": 23830 }, { "epoch": 7.94863242161441, "loss": 0.22189457714557648, "step": 23830 }, { "ce_loss": 0.027077307924628258, "epoch": 7.94863242161441, "step": 23830 }, { "distill_loss": 0.16582459211349487, "epoch": 7.94863242161441, "step": 23830 }, { "epoch": 7.94863242161441, "ref_ce_loss": 0.028566185384988785, "step": 23830 }, { "epoch": 7.94863242161441, "loss": 0.363383412361145, "step": 23830 }, { "ce_loss": 0.03507964313030243, "epoch": 7.94863242161441, "step": 23830 }, { "distill_loss": 0.15669700503349304, "epoch": 7.94863242161441, "step": 23830 }, { "epoch": 7.94863242161441, "ref_ce_loss": 0.06525471806526184, "step": 23830 }, { "epoch": 7.94863242161441, "loss": 0.2155691385269165, "step": 23830 }, { "ce_loss": 0.01610867865383625, "epoch": 7.94863242161441, "step": 23830 }, { "distill_loss": 0.14725200831890106, "epoch": 7.94863242161441, "step": 23830 }, { "epoch": 7.94863242161441, "ref_ce_loss": 0.024043144658207893, "step": 23830 }, { "epoch": 7.94863242161441, "loss": 0.21124151349067688, "step": 23830 }, { "ce_loss": 0.018030596897006035, "epoch": 7.94863242161441, "step": 23830 }, { "distill_loss": 0.12607772648334503, "epoch": 7.94863242161441, "step": 23830 }, { "epoch": 7.94863242161441, "ref_ce_loss": 0.033300936222076416, "step": 23830 }, { "epoch": 7.951967978652435, "loss": 0.2854, "step": 23840 }, { "epoch": 7.951967978652435, "grad_norm": 2.5058624744415283, "step": 23840 }, { "epoch": 7.951967978652435, "learning_rate": 2.8359823857476705e-08, "step": 23840 }, { "epoch": 7.951967978652435, "loss": 0.29359400272369385, "step": 23840 }, { "ce_loss": 0.09036729484796524, "epoch": 7.951967978652435, "step": 23840 }, { "distill_loss": 0.15034735202789307, "epoch": 7.951967978652435, "step": 23840 }, { "epoch": 7.951967978652435, "ref_ce_loss": 0.03662831336259842, "step": 23840 }, { "epoch": 7.951967978652435, "loss": 0.36976414918899536, "step": 23840 }, { "ce_loss": 0.060340650379657745, "epoch": 7.951967978652435, "step": 23840 }, { "distill_loss": 0.1331426501274109, "epoch": 7.951967978652435, "step": 23840 }, { "epoch": 7.951967978652435, "ref_ce_loss": 0.04315692558884621, "step": 23840 }, { "epoch": 7.951967978652435, "loss": 0.2175612449645996, "step": 23840 }, { "ce_loss": 0.004586022812873125, "epoch": 7.951967978652435, "step": 23840 }, { "distill_loss": 0.13164444267749786, "epoch": 7.951967978652435, "step": 23840 }, { "epoch": 7.951967978652435, "ref_ce_loss": 0.05861181393265724, "step": 23840 }, { "epoch": 7.951967978652435, "loss": 0.2547665238380432, "step": 23840 }, { "ce_loss": 0.036511220037937164, "epoch": 7.951967978652435, "step": 23840 }, { "distill_loss": 0.10626421123743057, "epoch": 7.951967978652435, "step": 23840 }, { "epoch": 7.951967978652435, "ref_ce_loss": 0.04119854047894478, "step": 23840 }, { "epoch": 7.955303535690461, "loss": 0.3294, "step": 23850 }, { "epoch": 7.955303535690461, "grad_norm": 3.057788372039795, "step": 23850 }, { "epoch": 7.955303535690461, "learning_rate": 2.4557829308202714e-08, "step": 23850 }, { "epoch": 7.955303535690461, "loss": 0.2910195291042328, "step": 23850 }, { "ce_loss": 0.011250575073063374, "epoch": 7.955303535690461, "step": 23850 }, { "distill_loss": 0.21544837951660156, "epoch": 7.955303535690461, "step": 23850 }, { "epoch": 7.955303535690461, "ref_ce_loss": 0.043316297233104706, "step": 23850 }, { "epoch": 7.955303535690461, "loss": 0.3057239055633545, "step": 23850 }, { "ce_loss": 0.02447928860783577, "epoch": 7.955303535690461, "step": 23850 }, { "distill_loss": 0.22899970412254333, "epoch": 7.955303535690461, "step": 23850 }, { "epoch": 7.955303535690461, "ref_ce_loss": 0.051885154098272324, "step": 23850 }, { "epoch": 7.955303535690461, "loss": 0.24294811487197876, "step": 23850 }, { "ce_loss": 0.011331611312925816, "epoch": 7.955303535690461, "step": 23850 }, { "distill_loss": 0.16551092267036438, "epoch": 7.955303535690461, "step": 23850 }, { "epoch": 7.955303535690461, "ref_ce_loss": 0.042633138597011566, "step": 23850 }, { "epoch": 7.955303535690461, "loss": 0.6378141641616821, "step": 23850 }, { "ce_loss": 0.02813078835606575, "epoch": 7.955303535690461, "step": 23850 }, { "distill_loss": 0.14803311228752136, "epoch": 7.955303535690461, "step": 23850 }, { "epoch": 7.955303535690461, "ref_ce_loss": 0.05598514899611473, "step": 23850 }, { "epoch": 7.958639092728486, "loss": 0.3353, "step": 23860 }, { "epoch": 7.958639092728486, "grad_norm": 3.8991682529449463, "step": 23860 }, { "epoch": 7.958639092728486, "learning_rate": 2.102933080497249e-08, "step": 23860 }, { "epoch": 7.958639092728486, "loss": 0.1944175809621811, "step": 23860 }, { "ce_loss": 0.012968351133167744, "epoch": 7.958639092728486, "step": 23860 }, { "distill_loss": 0.1464006006717682, "epoch": 7.958639092728486, "step": 23860 }, { "epoch": 7.958639092728486, "ref_ce_loss": 0.034779004752635956, "step": 23860 }, { "epoch": 7.958639092728486, "loss": 0.34716102480888367, "step": 23860 }, { "ce_loss": 0.0167516078799963, "epoch": 7.958639092728486, "step": 23860 }, { "distill_loss": 0.22644741833209991, "epoch": 7.958639092728486, "step": 23860 }, { "epoch": 7.958639092728486, "ref_ce_loss": 0.05799233913421631, "step": 23860 }, { "epoch": 7.958639092728486, "loss": 0.24939963221549988, "step": 23860 }, { "ce_loss": 0.021768810227513313, "epoch": 7.958639092728486, "step": 23860 }, { "distill_loss": 0.15550008416175842, "epoch": 7.958639092728486, "step": 23860 }, { "epoch": 7.958639092728486, "ref_ce_loss": 0.049455419182777405, "step": 23860 }, { "epoch": 7.958639092728486, "loss": 0.46806520223617554, "step": 23860 }, { "ce_loss": 0.029784483835101128, "epoch": 7.958639092728486, "step": 23860 }, { "distill_loss": 0.20984099805355072, "epoch": 7.958639092728486, "step": 23860 }, { "epoch": 7.958639092728486, "ref_ce_loss": 0.057520121335983276, "step": 23860 }, { "epoch": 7.961974649766511, "loss": 0.3539, "step": 23870 }, { "epoch": 7.961974649766511, "grad_norm": 3.716891288757324, "step": 23870 }, { "epoch": 7.961974649766511, "learning_rate": 1.7774334782372224e-08, "step": 23870 }, { "epoch": 7.961974649766511, "loss": 0.3307017385959625, "step": 23870 }, { "ce_loss": 0.002352922922000289, "epoch": 7.961974649766511, "step": 23870 }, { "distill_loss": 0.16378623247146606, "epoch": 7.961974649766511, "step": 23870 }, { "epoch": 7.961974649766511, "ref_ce_loss": 0.07698747515678406, "step": 23870 }, { "epoch": 7.961974649766511, "loss": 0.348903626203537, "step": 23870 }, { "ce_loss": 0.026433821767568588, "epoch": 7.961974649766511, "step": 23870 }, { "distill_loss": 0.2482292652130127, "epoch": 7.961974649766511, "step": 23870 }, { "epoch": 7.961974649766511, "ref_ce_loss": 0.05007699504494667, "step": 23870 }, { "epoch": 7.961974649766511, "loss": 0.2856520712375641, "step": 23870 }, { "ce_loss": 0.06673577427864075, "epoch": 7.961974649766511, "step": 23870 }, { "distill_loss": 0.1341075897216797, "epoch": 7.961974649766511, "step": 23870 }, { "epoch": 7.961974649766511, "ref_ce_loss": 0.04442819580435753, "step": 23870 }, { "epoch": 7.961974649766511, "loss": 0.3865754008293152, "step": 23870 }, { "ce_loss": 0.009188508614897728, "epoch": 7.961974649766511, "step": 23870 }, { "distill_loss": 0.19062168896198273, "epoch": 7.961974649766511, "step": 23870 }, { "epoch": 7.961974649766511, "ref_ce_loss": 0.05311084911227226, "step": 23870 }, { "epoch": 7.965310206804537, "loss": 0.308, "step": 23880 }, { "epoch": 7.965310206804537, "grad_norm": 2.8505074977874756, "step": 23880 }, { "epoch": 7.965310206804537, "learning_rate": 1.4792847176220423e-08, "step": 23880 }, { "epoch": 7.965310206804537, "loss": 0.5729832053184509, "step": 23880 }, { "ce_loss": 0.03645119443535805, "epoch": 7.965310206804537, "step": 23880 }, { "distill_loss": 0.12541188299655914, "epoch": 7.965310206804537, "step": 23880 }, { "epoch": 7.965310206804537, "ref_ce_loss": 0.04846469312906265, "step": 23880 }, { "epoch": 7.965310206804537, "loss": 0.47117969393730164, "step": 23880 }, { "ce_loss": 0.039367545396089554, "epoch": 7.965310206804537, "step": 23880 }, { "distill_loss": 0.2976151704788208, "epoch": 7.965310206804537, "step": 23880 }, { "epoch": 7.965310206804537, "ref_ce_loss": 0.0795765370130539, "step": 23880 }, { "epoch": 7.965310206804537, "loss": 0.3643397390842438, "step": 23880 }, { "ce_loss": 0.031012022867798805, "epoch": 7.965310206804537, "step": 23880 }, { "distill_loss": 0.22613711655139923, "epoch": 7.965310206804537, "step": 23880 }, { "epoch": 7.965310206804537, "ref_ce_loss": 0.050101302564144135, "step": 23880 }, { "epoch": 7.965310206804537, "loss": 0.26308199763298035, "step": 23880 }, { "ce_loss": 0.015335160307586193, "epoch": 7.965310206804537, "step": 23880 }, { "distill_loss": 0.13855913281440735, "epoch": 7.965310206804537, "step": 23880 }, { "epoch": 7.965310206804537, "ref_ce_loss": 0.06943170726299286, "step": 23880 }, { "epoch": 7.968645763842562, "loss": 0.3256, "step": 23890 }, { "epoch": 7.968645763842562, "grad_norm": 4.51202392578125, "step": 23890 }, { "epoch": 7.968645763842562, "learning_rate": 1.2084873423584552e-08, "step": 23890 }, { "epoch": 7.968645763842562, "loss": 0.33923307061195374, "step": 23890 }, { "ce_loss": 0.018250662833452225, "epoch": 7.968645763842562, "step": 23890 }, { "distill_loss": 0.25644806027412415, "epoch": 7.968645763842562, "step": 23890 }, { "epoch": 7.968645763842562, "ref_ce_loss": 0.04556307941675186, "step": 23890 }, { "epoch": 7.968645763842562, "loss": 0.4055192470550537, "step": 23890 }, { "ce_loss": 0.05485742539167404, "epoch": 7.968645763842562, "step": 23890 }, { "distill_loss": 0.2536390423774719, "epoch": 7.968645763842562, "step": 23890 }, { "epoch": 7.968645763842562, "ref_ce_loss": 0.0628480315208435, "step": 23890 }, { "epoch": 7.968645763842562, "loss": 0.3442380726337433, "step": 23890 }, { "ce_loss": 0.05271763727068901, "epoch": 7.968645763842562, "step": 23890 }, { "distill_loss": 0.15627413988113403, "epoch": 7.968645763842562, "step": 23890 }, { "epoch": 7.968645763842562, "ref_ce_loss": 0.11837184429168701, "step": 23890 }, { "epoch": 7.968645763842562, "loss": 0.2478257417678833, "step": 23890 }, { "ce_loss": 0.0008815837791189551, "epoch": 7.968645763842562, "step": 23890 }, { "distill_loss": 0.17156405746936798, "epoch": 7.968645763842562, "step": 23890 }, { "epoch": 7.968645763842562, "ref_ce_loss": 0.03145066648721695, "step": 23890 }, { "epoch": 7.971981320880587, "loss": 0.3203, "step": 23900 }, { "epoch": 7.971981320880587, "grad_norm": 2.825669050216675, "step": 23900 }, { "epoch": 7.971981320880587, "learning_rate": 9.65041846273107e-09, "step": 23900 }, { "epoch": 7.971981320880587, "loss": 0.3642113208770752, "step": 23900 }, { "ce_loss": 0.02877146378159523, "epoch": 7.971981320880587, "step": 23900 }, { "distill_loss": 0.18471139669418335, "epoch": 7.971981320880587, "step": 23900 }, { "epoch": 7.971981320880587, "ref_ce_loss": 0.07360993325710297, "step": 23900 }, { "epoch": 7.971981320880587, "loss": 0.30159834027290344, "step": 23900 }, { "ce_loss": 0.008763355202972889, "epoch": 7.971981320880587, "step": 23900 }, { "distill_loss": 0.20540040731430054, "epoch": 7.971981320880587, "step": 23900 }, { "epoch": 7.971981320880587, "ref_ce_loss": 0.05317322164773941, "step": 23900 }, { "epoch": 7.971981320880587, "loss": 0.36580002307891846, "step": 23900 }, { "ce_loss": 0.01582839898765087, "epoch": 7.971981320880587, "step": 23900 }, { "distill_loss": 0.28203338384628296, "epoch": 7.971981320880587, "step": 23900 }, { "epoch": 7.971981320880587, "ref_ce_loss": 0.04814530909061432, "step": 23900 }, { "epoch": 7.971981320880587, "loss": 0.26183679699897766, "step": 23900 }, { "ce_loss": 0.021706560626626015, "epoch": 7.971981320880587, "step": 23900 }, { "distill_loss": 0.1578688770532608, "epoch": 7.971981320880587, "step": 23900 }, { "epoch": 7.971981320880587, "ref_ce_loss": 0.04011444374918938, "step": 23900 }, { "epoch": 7.975316877918613, "loss": 0.3224, "step": 23910 }, { "epoch": 7.975316877918613, "grad_norm": 2.1414968967437744, "step": 23910 }, { "epoch": 7.975316877918613, "learning_rate": 7.489486733142091e-09, "step": 23910 }, { "epoch": 7.975316877918613, "loss": 0.3207355737686157, "step": 23910 }, { "ce_loss": 0.006373913958668709, "epoch": 7.975316877918613, "step": 23910 }, { "distill_loss": 0.24951599538326263, "epoch": 7.975316877918613, "step": 23910 }, { "epoch": 7.975316877918613, "ref_ce_loss": 0.06476053595542908, "step": 23910 }, { "epoch": 7.975316877918613, "loss": 0.20579034090042114, "step": 23910 }, { "ce_loss": 0.004123230930417776, "epoch": 7.975316877918613, "step": 23910 }, { "distill_loss": 0.10635682940483093, "epoch": 7.975316877918613, "step": 23910 }, { "epoch": 7.975316877918613, "ref_ce_loss": 0.04275263100862503, "step": 23910 }, { "epoch": 7.975316877918613, "loss": 0.32259857654571533, "step": 23910 }, { "ce_loss": 0.019134141504764557, "epoch": 7.975316877918613, "step": 23910 }, { "distill_loss": 0.1968674212694168, "epoch": 7.975316877918613, "step": 23910 }, { "epoch": 7.975316877918613, "ref_ce_loss": 0.03675852715969086, "step": 23910 }, { "epoch": 7.975316877918613, "loss": 0.30018824338912964, "step": 23910 }, { "ce_loss": 0.015749013051390648, "epoch": 7.975316877918613, "step": 23910 }, { "distill_loss": 0.1425822377204895, "epoch": 7.975316877918613, "step": 23910 }, { "epoch": 7.975316877918613, "ref_ce_loss": 0.06256047636270523, "step": 23910 }, { "epoch": 7.978652434956638, "loss": 0.3346, "step": 23920 }, { "epoch": 7.978652434956638, "grad_norm": 3.1309776306152344, "step": 23920 }, { "epoch": 7.978652434956638, "learning_rate": 5.602082175515388e-09, "step": 23920 }, { "epoch": 7.978652434956638, "loss": 0.27254971861839294, "step": 23920 }, { "ce_loss": 0.02068653143942356, "epoch": 7.978652434956638, "step": 23920 }, { "distill_loss": 0.13779519498348236, "epoch": 7.978652434956638, "step": 23920 }, { "epoch": 7.978652434956638, "ref_ce_loss": 0.052534569054841995, "step": 23920 }, { "epoch": 7.978652434956638, "loss": 0.19287121295928955, "step": 23920 }, { "ce_loss": 0.008895105682313442, "epoch": 7.978652434956638, "step": 23920 }, { "distill_loss": 0.11923294514417648, "epoch": 7.978652434956638, "step": 23920 }, { "epoch": 7.978652434956638, "ref_ce_loss": 0.035864803940057755, "step": 23920 }, { "epoch": 7.978652434956638, "loss": 0.17826569080352783, "step": 23920 }, { "ce_loss": 0.018720723688602448, "epoch": 7.978652434956638, "step": 23920 }, { "distill_loss": 0.10931409150362015, "epoch": 7.978652434956638, "step": 23920 }, { "epoch": 7.978652434956638, "ref_ce_loss": 0.033927470445632935, "step": 23920 }, { "epoch": 7.978652434956638, "loss": 0.2010970562696457, "step": 23920 }, { "ce_loss": 0.03899431601166725, "epoch": 7.978652434956638, "step": 23920 }, { "distill_loss": 0.11069336533546448, "epoch": 7.978652434956638, "step": 23920 }, { "epoch": 7.978652434956638, "ref_ce_loss": 0.026145868003368378, "step": 23920 }, { "epoch": 7.9819879919946635, "loss": 0.3318, "step": 23930 }, { "epoch": 7.9819879919946635, "grad_norm": 3.460667371749878, "step": 23930 }, { "epoch": 7.9819879919946635, "learning_rate": 3.988208231747725e-09, "step": 23930 }, { "epoch": 7.9819879919946635, "loss": 0.3221658170223236, "step": 23930 }, { "ce_loss": 0.006332428194582462, "epoch": 7.9819879919946635, "step": 23930 }, { "distill_loss": 0.1634363979101181, "epoch": 7.9819879919946635, "step": 23930 }, { "epoch": 7.9819879919946635, "ref_ce_loss": 0.07524649053812027, "step": 23930 }, { "epoch": 7.9819879919946635, "loss": 0.20411431789398193, "step": 23930 }, { "ce_loss": 0.02330736815929413, "epoch": 7.9819879919946635, "step": 23930 }, { "distill_loss": 0.09716067463159561, "epoch": 7.9819879919946635, "step": 23930 }, { "epoch": 7.9819879919946635, "ref_ce_loss": 0.051996905356645584, "step": 23930 }, { "epoch": 7.9819879919946635, "loss": 0.3010110557079315, "step": 23930 }, { "ce_loss": 0.023202329874038696, "epoch": 7.9819879919946635, "step": 23930 }, { "distill_loss": 0.23072974383831024, "epoch": 7.9819879919946635, "step": 23930 }, { "epoch": 7.9819879919946635, "ref_ce_loss": 0.04667457938194275, "step": 23930 }, { "epoch": 7.9819879919946635, "loss": 0.28928112983703613, "step": 23930 }, { "ce_loss": 0.017655836418271065, "epoch": 7.9819879919946635, "step": 23930 }, { "distill_loss": 0.12512949109077454, "epoch": 7.9819879919946635, "step": 23930 }, { "epoch": 7.9819879919946635, "ref_ce_loss": 0.03689692169427872, "step": 23930 }, { "epoch": 7.985323549032689, "loss": 0.3457, "step": 23940 }, { "epoch": 7.985323549032689, "grad_norm": 2.695939064025879, "step": 23940 }, { "epoch": 7.985323549032689, "learning_rate": 2.6478678448682567e-09, "step": 23940 }, { "epoch": 7.985323549032689, "loss": 0.2173708826303482, "step": 23940 }, { "ce_loss": 0.04703154414892197, "epoch": 7.985323549032689, "step": 23940 }, { "distill_loss": 0.11919710040092468, "epoch": 7.985323549032689, "step": 23940 }, { "epoch": 7.985323549032689, "ref_ce_loss": 0.03168262913823128, "step": 23940 }, { "epoch": 7.985323549032689, "loss": 0.21798212826251984, "step": 23940 }, { "ce_loss": 0.007845278829336166, "epoch": 7.985323549032689, "step": 23940 }, { "distill_loss": 0.12254693359136581, "epoch": 7.985323549032689, "step": 23940 }, { "epoch": 7.985323549032689, "ref_ce_loss": 0.05747520551085472, "step": 23940 }, { "epoch": 7.985323549032689, "loss": 0.3193615972995758, "step": 23940 }, { "ce_loss": 0.015500759705901146, "epoch": 7.985323549032689, "step": 23940 }, { "distill_loss": 0.11406480520963669, "epoch": 7.985323549032689, "step": 23940 }, { "epoch": 7.985323549032689, "ref_ce_loss": 0.03308076784014702, "step": 23940 }, { "epoch": 7.985323549032689, "loss": 0.19805341958999634, "step": 23940 }, { "ce_loss": 0.015502206049859524, "epoch": 7.985323549032689, "step": 23940 }, { "distill_loss": 0.15302814543247223, "epoch": 7.985323549032689, "step": 23940 }, { "epoch": 7.985323549032689, "ref_ce_loss": 0.02931785024702549, "step": 23940 }, { "epoch": 7.988659106070714, "loss": 0.2937, "step": 23950 }, { "epoch": 7.988659106070714, "grad_norm": 5.747377395629883, "step": 23950 }, { "epoch": 7.988659106070714, "learning_rate": 1.5810634591550964e-09, "step": 23950 }, { "epoch": 7.988659106070714, "loss": 0.4828079342842102, "step": 23950 }, { "ce_loss": 0.07146650552749634, "epoch": 7.988659106070714, "step": 23950 }, { "distill_loss": 0.3095947504043579, "epoch": 7.988659106070714, "step": 23950 }, { "epoch": 7.988659106070714, "ref_ce_loss": 0.04873261973261833, "step": 23950 }, { "epoch": 7.988659106070714, "loss": 0.4262124300003052, "step": 23950 }, { "ce_loss": 0.027315624058246613, "epoch": 7.988659106070714, "step": 23950 }, { "distill_loss": 0.16963714361190796, "epoch": 7.988659106070714, "step": 23950 }, { "epoch": 7.988659106070714, "ref_ce_loss": 0.060540758073329926, "step": 23950 }, { "epoch": 7.988659106070714, "loss": 0.42749008536338806, "step": 23950 }, { "ce_loss": 0.051516093313694, "epoch": 7.988659106070714, "step": 23950 }, { "distill_loss": 0.23047225177288055, "epoch": 7.988659106070714, "step": 23950 }, { "epoch": 7.988659106070714, "ref_ce_loss": 0.08982308208942413, "step": 23950 }, { "epoch": 7.988659106070714, "loss": 0.43480467796325684, "step": 23950 }, { "ce_loss": 0.03439018130302429, "epoch": 7.988659106070714, "step": 23950 }, { "distill_loss": 0.16861560940742493, "epoch": 7.988659106070714, "step": 23950 }, { "epoch": 7.988659106070714, "ref_ce_loss": 0.04760503023862839, "step": 23950 }, { "epoch": 7.9919946631087395, "loss": 0.3183, "step": 23960 }, { "epoch": 7.9919946631087395, "grad_norm": 2.7786331176757812, "step": 23960 }, { "epoch": 7.9919946631087395, "learning_rate": 7.877970200353967e-10, "step": 23960 }, { "epoch": 7.9919946631087395, "loss": 0.2137977033853531, "step": 23960 }, { "ce_loss": 0.02110884338617325, "epoch": 7.9919946631087395, "step": 23960 }, { "distill_loss": 0.12767818570137024, "epoch": 7.9919946631087395, "step": 23960 }, { "epoch": 7.9919946631087395, "ref_ce_loss": 0.034436240792274475, "step": 23960 }, { "epoch": 7.9919946631087395, "loss": 0.26209551095962524, "step": 23960 }, { "ce_loss": 0.0033823607955127954, "epoch": 7.9919946631087395, "step": 23960 }, { "distill_loss": 0.211116760969162, "epoch": 7.9919946631087395, "step": 23960 }, { "epoch": 7.9919946631087395, "ref_ce_loss": 0.04747690632939339, "step": 23960 }, { "epoch": 7.9919946631087395, "loss": 0.4075537323951721, "step": 23960 }, { "ce_loss": 0.013052606023848057, "epoch": 7.9919946631087395, "step": 23960 }, { "distill_loss": 0.17281053960323334, "epoch": 7.9919946631087395, "step": 23960 }, { "epoch": 7.9919946631087395, "ref_ce_loss": 0.05032519996166229, "step": 23960 }, { "epoch": 7.9919946631087395, "loss": 0.29942646622657776, "step": 23960 }, { "ce_loss": 0.051360610872507095, "epoch": 7.9919946631087395, "step": 23960 }, { "distill_loss": 0.1980040967464447, "epoch": 7.9919946631087395, "step": 23960 }, { "epoch": 7.9919946631087395, "ref_ce_loss": 0.049913190305233, "step": 23960 }, { "epoch": 7.995330220146765, "loss": 0.2936, "step": 23970 }, { "epoch": 7.995330220146765, "grad_norm": 2.5634679794311523, "step": 23970 }, { "epoch": 7.995330220146765, "learning_rate": 2.680699741186565e-10, "step": 23970 }, { "epoch": 7.995330220146765, "loss": 0.2984486222267151, "step": 23970 }, { "ce_loss": 0.031040744855999947, "epoch": 7.995330220146765, "step": 23970 }, { "distill_loss": 0.16967520117759705, "epoch": 7.995330220146765, "step": 23970 }, { "epoch": 7.995330220146765, "ref_ce_loss": 0.07135186344385147, "step": 23970 }, { "epoch": 7.995330220146765, "loss": 0.28316909074783325, "step": 23970 }, { "ce_loss": 0.014297720044851303, "epoch": 7.995330220146765, "step": 23970 }, { "distill_loss": 0.18668489158153534, "epoch": 7.995330220146765, "step": 23970 }, { "epoch": 7.995330220146765, "ref_ce_loss": 0.05426119267940521, "step": 23970 }, { "epoch": 7.995330220146765, "loss": 0.3653208911418915, "step": 23970 }, { "ce_loss": 0.11517569422721863, "epoch": 7.995330220146765, "step": 23970 }, { "distill_loss": 0.19170519709587097, "epoch": 7.995330220146765, "step": 23970 }, { "epoch": 7.995330220146765, "ref_ce_loss": 0.04996060952544212, "step": 23970 }, { "epoch": 7.995330220146765, "loss": 0.28267958760261536, "step": 23970 }, { "ce_loss": 0.08331085741519928, "epoch": 7.995330220146765, "step": 23970 }, { "distill_loss": 0.13024798035621643, "epoch": 7.995330220146765, "step": 23970 }, { "epoch": 7.995330220146765, "ref_ce_loss": 0.06889346987009048, "step": 23970 }, { "epoch": 7.99866577718479, "loss": 0.3085, "step": 23980 }, { "epoch": 7.99866577718479, "grad_norm": 2.1330296993255615, "step": 23980 }, { "epoch": 7.99866577718479, "learning_rate": 2.188326918006744e-11, "step": 23980 }, { "epoch": 7.99866577718479, "loss": 0.9639419317245483, "step": 23980 }, { "ce_loss": 0.06816727668046951, "epoch": 7.99866577718479, "step": 23980 }, { "distill_loss": 0.14574238657951355, "epoch": 7.99866577718479, "step": 23980 }, { "epoch": 7.99866577718479, "ref_ce_loss": 0.06669634580612183, "step": 23980 }, { "epoch": 7.99866577718479, "loss": 0.3473624587059021, "step": 23980 }, { "ce_loss": 0.04842541366815567, "epoch": 7.99866577718479, "step": 23980 }, { "distill_loss": 0.23083451390266418, "epoch": 7.99866577718479, "step": 23980 }, { "epoch": 7.99866577718479, "ref_ce_loss": 0.051015496253967285, "step": 23980 }, { "epoch": 7.99866577718479, "loss": 0.21875524520874023, "step": 23980 }, { "ce_loss": 0.01979643665254116, "epoch": 7.99866577718479, "step": 23980 }, { "distill_loss": 0.11872906237840652, "epoch": 7.99866577718479, "step": 23980 }, { "epoch": 7.99866577718479, "ref_ce_loss": 0.03821733966469765, "step": 23980 }, { "epoch": 7.99866577718479, "loss": 0.20249786972999573, "step": 23980 }, { "ce_loss": 0.03200726583600044, "epoch": 7.99866577718479, "step": 23980 }, { "distill_loss": 0.13593564927577972, "epoch": 7.99866577718479, "step": 23980 }, { "epoch": 7.99866577718479, "ref_ce_loss": 0.034453000873327255, "step": 23980 }, { "epoch": 8.0, "step": 23984, "train_runtime": 145201.4613 }, { "epoch": 8.0, "step": 23984, "train_samples_per_second": 21.142 }, { "epoch": 8.0, "step": 23984, "train_steps_per_second": 0.165 }, { "epoch": 8.0, "step": 23984, "total_flos": 0.0 }, { "epoch": 8.0, "step": 23984, "train_loss": 0.537310748268478 } ], "logging_steps": 10, "max_steps": 23984, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }