{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 3884, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012873326467559218, "grad_norm": 644.0, "learning_rate": 4.99356333676622e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.5406250953674316, "logps/chosen": -287.6000061035156, "logps/rejected": -302.79998779296875, "loss": 0.6961, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": -0.01401367224752903, "rewards/margins": 0.0025878907181322575, "rewards/rejected": -0.0165557861328125, "step": 5 }, { "epoch": 0.0025746652935118436, "grad_norm": 616.0, "learning_rate": 4.987126673532441e-07, "logits/chosen": -2.229687452316284, "logits/rejected": -2.609375, "logps/chosen": -290.20001220703125, "logps/rejected": -283.3999938964844, "loss": 0.6547, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.015332031063735485, "rewards/margins": 0.08476562798023224, "rewards/rejected": -0.06956787407398224, "step": 10 }, { "epoch": 0.0038619979402677654, "grad_norm": 636.0, "learning_rate": 4.980690010298661e-07, "logits/chosen": -2.487499952316284, "logits/rejected": -2.534374952316284, "logps/chosen": -312.20001220703125, "logps/rejected": -406.79998779296875, "loss": 0.6117, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.039306640625, "rewards/margins": 0.19733886420726776, "rewards/rejected": -0.2369384765625, "step": 15 }, { "epoch": 0.005149330587023687, "grad_norm": 490.0, "learning_rate": 4.974253347064881e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.640625, "logps/chosen": -352.3999938964844, "logps/rejected": -345.6000061035156, "loss": 0.6312, "rewards/accuracies": 0.625, "rewards/chosen": -0.13104248046875, "rewards/margins": 0.15019531548023224, "rewards/rejected": -0.28144532442092896, "step": 20 }, { "epoch": 0.006436663233779609, "grad_norm": 536.0, "learning_rate": 
4.967816683831102e-07, "logits/chosen": -2.4781250953674316, "logits/rejected": -2.4124999046325684, "logps/chosen": -331.20001220703125, "logps/rejected": -405.6000061035156, "loss": 0.5781, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.265380859375, "rewards/margins": 0.3291992247104645, "rewards/rejected": -0.59375, "step": 25 }, { "epoch": 0.007723995880535531, "grad_norm": 616.0, "learning_rate": 4.961380020597322e-07, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.596874952316284, "logps/chosen": -307.6000061035156, "logps/rejected": -341.0, "loss": 0.5883, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.25214844942092896, "rewards/margins": 0.28828126192092896, "rewards/rejected": -0.5409179925918579, "step": 30 }, { "epoch": 0.009011328527291453, "grad_norm": 254.0, "learning_rate": 4.954943357363543e-07, "logits/chosen": -2.609375, "logits/rejected": -2.6812500953674316, "logps/chosen": -251.5, "logps/rejected": -268.0, "loss": 0.6266, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0943603515625, "rewards/margins": 0.17851562798023224, "rewards/rejected": -0.27265626192092896, "step": 35 }, { "epoch": 0.010298661174047374, "grad_norm": 310.0, "learning_rate": 4.948506694129763e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.7874999046325684, "logps/chosen": -320.3999938964844, "logps/rejected": -320.20001220703125, "loss": 0.6137, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.088958740234375, "rewards/margins": 0.21855469048023224, "rewards/rejected": -0.3075195252895355, "step": 40 }, { "epoch": 0.011585993820803296, "grad_norm": 592.0, "learning_rate": 4.942070030895984e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.3968749046325684, "logps/chosen": -350.3999938964844, "logps/rejected": -412.79998779296875, "loss": 0.55, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39716798067092896, "rewards/margins": 0.46894532442092896, "rewards/rejected": 
-0.865234375, "step": 45 }, { "epoch": 0.012873326467559218, "grad_norm": 676.0, "learning_rate": 4.935633367662204e-07, "logits/chosen": -2.453125, "logits/rejected": -2.4906249046325684, "logps/chosen": -306.1000061035156, "logps/rejected": -337.0, "loss": 0.5906, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.33613282442092896, "rewards/margins": 0.3333984315395355, "rewards/rejected": -0.668994128704071, "step": 50 }, { "epoch": 0.01416065911431514, "grad_norm": 312.0, "learning_rate": 4.929196704428423e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.340625047683716, "logps/chosen": -259.79998779296875, "logps/rejected": -273.20001220703125, "loss": 0.6117, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.12401123344898224, "rewards/margins": 0.24062499403953552, "rewards/rejected": -0.3640991151332855, "step": 55 }, { "epoch": 0.015447991761071062, "grad_norm": 410.0, "learning_rate": 4.922760041194645e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.565624952316284, "logps/chosen": -235.8000030517578, "logps/rejected": -270.5, "loss": 0.6352, "rewards/accuracies": 0.5, "rewards/chosen": -0.23447266221046448, "rewards/margins": 0.21323242783546448, "rewards/rejected": -0.448211669921875, "step": 60 }, { "epoch": 0.016735324407826983, "grad_norm": 624.0, "learning_rate": 4.916323377960865e-07, "logits/chosen": -2.487499952316284, "logits/rejected": -2.535937547683716, "logps/chosen": -321.6000061035156, "logps/rejected": -349.6000061035156, "loss": 0.5613, "rewards/accuracies": 0.625, "rewards/chosen": -0.3672119081020355, "rewards/margins": 0.4175781309604645, "rewards/rejected": -0.78515625, "step": 65 }, { "epoch": 0.018022657054582905, "grad_norm": 336.0, "learning_rate": 4.909886714727085e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.4546875953674316, "logps/chosen": -325.3999938964844, "logps/rejected": -326.6000061035156, "loss": 0.5707, "rewards/accuracies": 
0.5375000238418579, "rewards/chosen": -0.09294433891773224, "rewards/margins": 0.36601561307907104, "rewards/rejected": -0.45917969942092896, "step": 70 }, { "epoch": 0.019309989701338827, "grad_norm": 326.0, "learning_rate": 4.903450051493306e-07, "logits/chosen": -2.59375, "logits/rejected": -2.7593750953674316, "logps/chosen": -338.0, "logps/rejected": -289.79998779296875, "loss": 0.6523, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.15625, "rewards/margins": 0.17452391982078552, "rewards/rejected": -0.3306640684604645, "step": 75 }, { "epoch": 0.02059732234809475, "grad_norm": 580.0, "learning_rate": 4.897013388259526e-07, "logits/chosen": -2.7093749046325684, "logits/rejected": -2.643749952316284, "logps/chosen": -390.0, "logps/rejected": -423.6000061035156, "loss": 0.5383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0615234375, "rewards/margins": 0.46210938692092896, "rewards/rejected": -0.5234375, "step": 80 }, { "epoch": 0.02188465499485067, "grad_norm": 500.0, "learning_rate": 4.890576725025746e-07, "logits/chosen": -2.5531249046325684, "logits/rejected": -2.5406250953674316, "logps/chosen": -334.3999938964844, "logps/rejected": -432.3999938964844, "loss": 0.5484, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5887206792831421, "rewards/margins": 0.575976550579071, "rewards/rejected": -1.1648437976837158, "step": 85 }, { "epoch": 0.023171987641606592, "grad_norm": 592.0, "learning_rate": 4.884140061791967e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.549999952316284, "logps/chosen": -341.6000061035156, "logps/rejected": -364.0, "loss": 0.6266, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4369140565395355, "rewards/margins": 0.43895262479782104, "rewards/rejected": -0.8763672113418579, "step": 90 }, { "epoch": 0.024459320288362514, "grad_norm": 494.0, "learning_rate": 4.877703398558187e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.637500047683716, 
"logps/chosen": -340.0, "logps/rejected": -331.20001220703125, "loss": 0.6023, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.31744384765625, "rewards/margins": 0.3769897520542145, "rewards/rejected": -0.694140613079071, "step": 95 }, { "epoch": 0.025746652935118436, "grad_norm": 752.0, "learning_rate": 4.871266735324407e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.53125, "logps/chosen": -379.20001220703125, "logps/rejected": -404.3999938964844, "loss": 0.5695, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.533203125, "rewards/margins": 0.46428221464157104, "rewards/rejected": -0.9984375238418579, "step": 100 }, { "epoch": 0.027033985581874358, "grad_norm": 732.0, "learning_rate": 4.864830072090629e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.515625, "logps/chosen": -337.20001220703125, "logps/rejected": -336.79998779296875, "loss": 0.6641, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2685546875, "rewards/margins": 0.321533203125, "rewards/rejected": -0.5914062261581421, "step": 105 }, { "epoch": 0.02832131822863028, "grad_norm": 466.0, "learning_rate": 4.858393408856848e-07, "logits/chosen": -2.4296875, "logits/rejected": -2.6031250953674316, "logps/chosen": -329.6000061035156, "logps/rejected": -373.79998779296875, "loss": 0.5527, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.44550782442092896, "rewards/margins": 0.478515625, "rewards/rejected": -0.922656238079071, "step": 110 }, { "epoch": 0.0296086508753862, "grad_norm": 428.0, "learning_rate": 4.851956745623069e-07, "logits/chosen": -2.734375, "logits/rejected": -2.715625047683716, "logps/chosen": -324.20001220703125, "logps/rejected": -354.79998779296875, "loss": 0.5598, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11448974907398224, "rewards/margins": 0.44609373807907104, "rewards/rejected": -0.561328113079071, "step": 115 }, { "epoch": 0.030895983522142123, "grad_norm": 306.0, 
"learning_rate": 4.845520082389289e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.5875000953674316, "logps/chosen": -337.0, "logps/rejected": -341.20001220703125, "loss": 0.5687, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4141601622104645, "rewards/margins": 0.3786377012729645, "rewards/rejected": -0.7918701171875, "step": 120 }, { "epoch": 0.032183316168898045, "grad_norm": 552.0, "learning_rate": 4.839083419155509e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5218749046325684, "logps/chosen": -338.70001220703125, "logps/rejected": -299.79998779296875, "loss": 0.632, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.17949219048023224, "rewards/margins": 0.20883789658546448, "rewards/rejected": -0.3873046934604645, "step": 125 }, { "epoch": 0.03347064881565397, "grad_norm": 436.0, "learning_rate": 4.83264675592173e-07, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.503124952316284, "logps/chosen": -295.3999938964844, "logps/rejected": -354.3999938964844, "loss": 0.523, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.43730467557907104, "rewards/margins": 0.595507800579071, "rewards/rejected": -1.033203125, "step": 130 }, { "epoch": 0.03475798146240989, "grad_norm": 488.0, "learning_rate": 4.82621009268795e-07, "logits/chosen": -2.635937452316284, "logits/rejected": -2.5484375953674316, "logps/chosen": -317.79998779296875, "logps/rejected": -263.79998779296875, "loss": 0.6461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.21210937201976776, "rewards/margins": 0.26240235567092896, "rewards/rejected": -0.4742187559604645, "step": 135 }, { "epoch": 0.03604531410916581, "grad_norm": 572.0, "learning_rate": 4.819773429454171e-07, "logits/chosen": -2.3734374046325684, "logits/rejected": -2.395312547683716, "logps/chosen": -326.6000061035156, "logps/rejected": -309.6000061035156, "loss": 0.5496, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 
-0.4039062559604645, "rewards/margins": 0.5199218988418579, "rewards/rejected": -0.924023449420929, "step": 140 }, { "epoch": 0.03733264675592173, "grad_norm": 203.0, "learning_rate": 4.813336766220391e-07, "logits/chosen": -2.518749952316284, "logits/rejected": -2.637500047683716, "logps/chosen": -302.6000061035156, "logps/rejected": -304.5, "loss": 0.5508, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4046874940395355, "rewards/margins": 0.48869627714157104, "rewards/rejected": -0.893359363079071, "step": 145 }, { "epoch": 0.038619979402677654, "grad_norm": 480.0, "learning_rate": 4.806900102986612e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.6812500953674316, "logps/chosen": -318.79998779296875, "logps/rejected": -322.79998779296875, "loss": 0.5777, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.25605469942092896, "rewards/margins": 0.40605467557907104, "rewards/rejected": -0.661914050579071, "step": 150 }, { "epoch": 0.039907312049433576, "grad_norm": 344.0, "learning_rate": 4.800463439752832e-07, "logits/chosen": -2.4156250953674316, "logits/rejected": -2.6312499046325684, "logps/chosen": -193.75, "logps/rejected": -234.39999389648438, "loss": 0.5617, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2811035215854645, "rewards/margins": 0.4822753965854645, "rewards/rejected": -0.762499988079071, "step": 155 }, { "epoch": 0.0411946446961895, "grad_norm": 207.0, "learning_rate": 4.794026776519052e-07, "logits/chosen": -2.409374952316284, "logits/rejected": -2.703125, "logps/chosen": -209.6999969482422, "logps/rejected": -219.64999389648438, "loss": 0.6164, "rewards/accuracies": 0.4375, "rewards/chosen": -0.12136230617761612, "rewards/margins": 0.26329344511032104, "rewards/rejected": -0.38554686307907104, "step": 160 }, { "epoch": 0.04248197734294542, "grad_norm": 362.0, "learning_rate": 4.787590113285273e-07, "logits/chosen": -2.409374952316284, "logits/rejected": -2.484375, "logps/chosen": 
-288.20001220703125, "logps/rejected": -379.20001220703125, "loss": 0.4984, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.568359375, "rewards/margins": 0.7890625, "rewards/rejected": -1.3585937023162842, "step": 165 }, { "epoch": 0.04376930998970134, "grad_norm": 512.0, "learning_rate": 4.781153450051493e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.6624999046325684, "logps/chosen": -308.0, "logps/rejected": -335.6000061035156, "loss": 0.6258, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.30400389432907104, "rewards/margins": 0.3102661073207855, "rewards/rejected": -0.613525390625, "step": 170 }, { "epoch": 0.04505664263645726, "grad_norm": 848.0, "learning_rate": 4.774716786817714e-07, "logits/chosen": -2.307812452316284, "logits/rejected": -2.40625, "logps/chosen": -311.0, "logps/rejected": -351.20001220703125, "loss": 0.5902, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4541015625, "rewards/margins": 0.39140623807907104, "rewards/rejected": -0.845703125, "step": 175 }, { "epoch": 0.046343975283213185, "grad_norm": 488.0, "learning_rate": 4.7682801235839336e-07, "logits/chosen": -2.684375047683716, "logits/rejected": -2.6078124046325684, "logps/chosen": -280.79998779296875, "logps/rejected": -296.3999938964844, "loss": 0.6293, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11420898139476776, "rewards/margins": 0.28857421875, "rewards/rejected": -0.4027343690395355, "step": 180 }, { "epoch": 0.047631307929969106, "grad_norm": 528.0, "learning_rate": 4.7618434603501545e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.450000047683716, "logps/chosen": -332.79998779296875, "logps/rejected": -410.79998779296875, "loss": 0.4895, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.43098145723342896, "rewards/margins": 0.8277343511581421, "rewards/rejected": -1.2585937976837158, "step": 185 }, { "epoch": 0.04891864057672503, "grad_norm": 356.0, "learning_rate": 
4.755406797116375e-07, "logits/chosen": -2.6875, "logits/rejected": -2.3734374046325684, "logps/chosen": -285.0, "logps/rejected": -398.79998779296875, "loss": 0.477, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.35380858182907104, "rewards/margins": 0.7593749761581421, "rewards/rejected": -1.113671898841858, "step": 190 }, { "epoch": 0.05020597322348095, "grad_norm": 502.0, "learning_rate": 4.748970133882595e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.5218749046325684, "logps/chosen": -285.20001220703125, "logps/rejected": -341.3999938964844, "loss": 0.484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3763671815395355, "rewards/margins": 0.7880859375, "rewards/rejected": -1.165283203125, "step": 195 }, { "epoch": 0.05149330587023687, "grad_norm": 392.0, "learning_rate": 4.742533470648816e-07, "logits/chosen": -2.3843750953674316, "logits/rejected": -2.534374952316284, "logps/chosen": -242.39999389648438, "logps/rejected": -287.0, "loss": 0.6477, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.3335327208042145, "rewards/margins": 0.3388671875, "rewards/rejected": -0.6717773675918579, "step": 200 }, { "epoch": 0.052780638516992794, "grad_norm": 508.0, "learning_rate": 4.7360968074150357e-07, "logits/chosen": -2.609375, "logits/rejected": -2.674999952316284, "logps/chosen": -301.6000061035156, "logps/rejected": -389.3999938964844, "loss": 0.4785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3936706483364105, "rewards/margins": 0.787109375, "rewards/rejected": -1.1804687976837158, "step": 205 }, { "epoch": 0.054067971163748715, "grad_norm": 520.0, "learning_rate": 4.729660144181256e-07, "logits/chosen": -2.5546875, "logits/rejected": -2.6031250953674316, "logps/chosen": -333.0, "logps/rejected": -372.6000061035156, "loss": 0.5527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2757812440395355, "rewards/margins": 0.5562499761581421, "rewards/rejected": -0.832812488079071, 
"step": 210 }, { "epoch": 0.05535530381050464, "grad_norm": 1088.0, "learning_rate": 4.7232234809474765e-07, "logits/chosen": -2.706249952316284, "logits/rejected": -2.3843750953674316, "logps/chosen": -294.6000061035156, "logps/rejected": -304.20001220703125, "loss": 0.6039, "rewards/accuracies": 0.625, "rewards/chosen": -0.10678710788488388, "rewards/margins": 0.3363281190395355, "rewards/rejected": -0.4429687559604645, "step": 215 }, { "epoch": 0.05664263645726056, "grad_norm": 482.0, "learning_rate": 4.716786817713697e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.659374952316284, "logps/chosen": -341.0, "logps/rejected": -362.8999938964844, "loss": 0.5094, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.43071287870407104, "rewards/margins": 0.699023425579071, "rewards/rejected": -1.1296875476837158, "step": 220 }, { "epoch": 0.05792996910401648, "grad_norm": 428.0, "learning_rate": 4.7103501544799174e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.799999952316284, "logps/chosen": -326.20001220703125, "logps/rejected": -354.0, "loss": 0.607, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.19094237685203552, "rewards/margins": 0.33251953125, "rewards/rejected": -0.5238281488418579, "step": 225 }, { "epoch": 0.0592173017507724, "grad_norm": 412.0, "learning_rate": 4.703913491246138e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -2.65625, "logps/chosen": -334.6000061035156, "logps/rejected": -329.6000061035156, "loss": 0.5734, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.18408203125, "rewards/margins": 0.44140625, "rewards/rejected": -0.625781238079071, "step": 230 }, { "epoch": 0.060504634397528324, "grad_norm": 652.0, "learning_rate": 4.697476828012358e-07, "logits/chosen": -2.7593750953674316, "logits/rejected": -2.6031250953674316, "logps/chosen": -344.79998779296875, "logps/rejected": -366.79998779296875, "loss": 0.5809, "rewards/accuracies": 0.5874999761581421, 
"rewards/chosen": -0.3056640625, "rewards/margins": 0.4214843809604645, "rewards/rejected": -0.7269531488418579, "step": 235 }, { "epoch": 0.061791967044284246, "grad_norm": 560.0, "learning_rate": 4.6910401647785787e-07, "logits/chosen": -2.410937547683716, "logits/rejected": -2.503124952316284, "logps/chosen": -262.79998779296875, "logps/rejected": -294.20001220703125, "loss": 0.5824, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.26289063692092896, "rewards/margins": 0.4537109434604645, "rewards/rejected": -0.71673583984375, "step": 240 }, { "epoch": 0.06307929969104016, "grad_norm": 310.0, "learning_rate": 4.6846035015447986e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.5999999046325684, "logps/chosen": -245.1999969482422, "logps/rejected": -294.79998779296875, "loss": 0.5539, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.21328124403953552, "rewards/margins": 0.530468761920929, "rewards/rejected": -0.7447265386581421, "step": 245 }, { "epoch": 0.06436663233779609, "grad_norm": 564.0, "learning_rate": 4.6781668383110195e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.5875000953674316, "logps/chosen": -295.0, "logps/rejected": -426.0, "loss": 0.4703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.76513671875, "rewards/margins": 1.0398437976837158, "rewards/rejected": -1.8039062023162842, "step": 250 }, { "epoch": 0.065653964984552, "grad_norm": 402.0, "learning_rate": 4.67173017507724e-07, "logits/chosen": -2.395312547683716, "logits/rejected": -2.453125, "logps/chosen": -331.6000061035156, "logps/rejected": -402.79998779296875, "loss": 0.5084, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7025390863418579, "rewards/margins": 0.8880859613418579, "rewards/rejected": -1.591406226158142, "step": 255 }, { "epoch": 0.06694129763130793, "grad_norm": 560.0, "learning_rate": 4.66529351184346e-07, "logits/chosen": -2.528125047683716, "logits/rejected": -2.578125, 
"logps/chosen": -346.20001220703125, "logps/rejected": -407.6000061035156, "loss": 0.4555, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.598828136920929, "rewards/margins": 0.9937499761581421, "rewards/rejected": -1.59765625, "step": 260 }, { "epoch": 0.06822863027806385, "grad_norm": 540.0, "learning_rate": 4.658856848609681e-07, "logits/chosen": -2.4749999046325684, "logits/rejected": -2.440624952316284, "logps/chosen": -346.20001220703125, "logps/rejected": -372.79998779296875, "loss": 0.5285, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.875, "rewards/margins": 0.8511718511581421, "rewards/rejected": -1.728124976158142, "step": 265 }, { "epoch": 0.06951596292481978, "grad_norm": 456.0, "learning_rate": 4.652420185375901e-07, "logits/chosen": -2.518749952316284, "logits/rejected": -2.481250047683716, "logps/chosen": -384.3999938964844, "logps/rejected": -430.79998779296875, "loss": 0.4758, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6507812738418579, "rewards/margins": 1.014062523841858, "rewards/rejected": -1.666406273841858, "step": 270 }, { "epoch": 0.07080329557157569, "grad_norm": 520.0, "learning_rate": 4.645983522142121e-07, "logits/chosen": -2.671875, "logits/rejected": -2.71875, "logps/chosen": -303.0, "logps/rejected": -300.79998779296875, "loss": 0.5285, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.34150391817092896, "rewards/margins": 0.7132812738418579, "rewards/rejected": -1.0546875, "step": 275 }, { "epoch": 0.07209062821833162, "grad_norm": 556.0, "learning_rate": 4.639546858908342e-07, "logits/chosen": -2.378124952316284, "logits/rejected": -2.5531249046325684, "logps/chosen": -358.0, "logps/rejected": -347.6000061035156, "loss": 0.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3667968809604645, "rewards/margins": 0.643750011920929, "rewards/rejected": -1.01171875, "step": 280 }, { "epoch": 0.07337796086508754, "grad_norm": 632.0, "learning_rate": 
4.633110195674562e-07, "logits/chosen": -2.364062547683716, "logits/rejected": -2.285937547683716, "logps/chosen": -315.20001220703125, "logps/rejected": -327.3999938964844, "loss": 0.65, "rewards/accuracies": 0.5, "rewards/chosen": -0.6435546875, "rewards/margins": 0.42558592557907104, "rewards/rejected": -1.0691406726837158, "step": 285 }, { "epoch": 0.07466529351184346, "grad_norm": 494.0, "learning_rate": 4.6266735324407824e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.612499952316284, "logps/chosen": -296.79998779296875, "logps/rejected": -352.6000061035156, "loss": 0.5266, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4666503965854645, "rewards/margins": 0.6103515625, "rewards/rejected": -1.0750000476837158, "step": 290 }, { "epoch": 0.07595262615859938, "grad_norm": 458.0, "learning_rate": 4.620236869207003e-07, "logits/chosen": -2.535937547683716, "logits/rejected": -2.503124952316284, "logps/chosen": -363.79998779296875, "logps/rejected": -417.6000061035156, "loss": 0.5031, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.59765625, "rewards/margins": 0.801953136920929, "rewards/rejected": -1.400781273841858, "step": 295 }, { "epoch": 0.07723995880535531, "grad_norm": 620.0, "learning_rate": 4.613800205973223e-07, "logits/chosen": -2.671875, "logits/rejected": -2.6156249046325684, "logps/chosen": -282.0, "logps/rejected": -302.79998779296875, "loss": 0.5902, "rewards/accuracies": 0.5625, "rewards/chosen": -0.30781251192092896, "rewards/margins": 0.4696044921875, "rewards/rejected": -0.777539074420929, "step": 300 }, { "epoch": 0.07852729145211122, "grad_norm": 656.0, "learning_rate": 4.6073635427394437e-07, "logits/chosen": -2.5531249046325684, "logits/rejected": -2.4375, "logps/chosen": -358.6000061035156, "logps/rejected": -382.3999938964844, "loss": 0.5902, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7835937738418579, "rewards/margins": 0.689892590045929, "rewards/rejected": 
-1.474218726158142, "step": 305 }, { "epoch": 0.07981462409886715, "grad_norm": 636.0, "learning_rate": 4.6009268795056636e-07, "logits/chosen": -2.612499952316284, "logits/rejected": -2.578125, "logps/chosen": -339.6000061035156, "logps/rejected": -368.20001220703125, "loss": 0.5074, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.505078136920929, "rewards/margins": 0.8001953363418579, "rewards/rejected": -1.3039062023162842, "step": 310 }, { "epoch": 0.08110195674562307, "grad_norm": 492.0, "learning_rate": 4.5944902162718845e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.621875047683716, "logps/chosen": -263.20001220703125, "logps/rejected": -361.0, "loss": 0.532, "rewards/accuracies": 0.625, "rewards/chosen": -0.3623046875, "rewards/margins": 0.6800781488418579, "rewards/rejected": -1.0417969226837158, "step": 315 }, { "epoch": 0.082389289392379, "grad_norm": 318.0, "learning_rate": 4.588053553038105e-07, "logits/chosen": -2.715625047683716, "logits/rejected": -2.7125000953674316, "logps/chosen": -270.6499938964844, "logps/rejected": -275.26873779296875, "loss": 0.5676, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4095214903354645, "rewards/margins": 0.508251965045929, "rewards/rejected": -0.917498767375946, "step": 320 }, { "epoch": 0.08367662203913491, "grad_norm": 552.0, "learning_rate": 4.581616889804325e-07, "logits/chosen": -2.793750047683716, "logits/rejected": -2.7125000953674316, "logps/chosen": -319.20001220703125, "logps/rejected": -325.29998779296875, "loss": 0.6047, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.32539063692092896, "rewards/margins": 0.38847655057907104, "rewards/rejected": -0.715624988079071, "step": 325 }, { "epoch": 0.08496395468589084, "grad_norm": 604.0, "learning_rate": 4.575180226570546e-07, "logits/chosen": -2.5625, "logits/rejected": -2.401562452316284, "logps/chosen": -293.6000061035156, "logps/rejected": -346.79998779296875, "loss": 0.482, 
"rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.542773425579071, "rewards/margins": 0.8695312738418579, "rewards/rejected": -1.4132812023162842, "step": 330 }, { "epoch": 0.08625128733264675, "grad_norm": 426.0, "learning_rate": 4.568743563336766e-07, "logits/chosen": -2.450000047683716, "logits/rejected": -2.5078125, "logps/chosen": -275.0, "logps/rejected": -256.8999938964844, "loss": 0.618, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4957031309604645, "rewards/margins": 0.4520507752895355, "rewards/rejected": -0.9457031488418579, "step": 335 }, { "epoch": 0.08753861997940268, "grad_norm": 143.0, "learning_rate": 4.562306900102986e-07, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.75, "logps/chosen": -235.1999969482422, "logps/rejected": -258.0, "loss": 0.6711, "rewards/accuracies": 0.4375, "rewards/chosen": -0.38691407442092896, "rewards/margins": 0.2894531190395355, "rewards/rejected": -0.6746581792831421, "step": 340 }, { "epoch": 0.0888259526261586, "grad_norm": 260.0, "learning_rate": 4.555870236869207e-07, "logits/chosen": -2.78125, "logits/rejected": -2.184375047683716, "logps/chosen": -162.89999389648438, "logps/rejected": -225.1999969482422, "loss": 0.5836, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.23823241889476776, "rewards/margins": 0.46894532442092896, "rewards/rejected": -0.708203136920929, "step": 345 }, { "epoch": 0.09011328527291453, "grad_norm": 368.0, "learning_rate": 4.549433573635427e-07, "logits/chosen": -2.565624952316284, "logits/rejected": -2.59375, "logps/chosen": -329.20001220703125, "logps/rejected": -318.20001220703125, "loss": 0.5191, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2942871153354645, "rewards/margins": 0.749218761920929, "rewards/rejected": -1.0417969226837158, "step": 350 }, { "epoch": 0.09140061791967044, "grad_norm": 376.0, "learning_rate": 4.5429969104016474e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": 
-2.481250047683716, "logps/chosen": -289.79998779296875, "logps/rejected": -289.0, "loss": 0.5855, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.35267335176467896, "rewards/margins": 0.49238282442092896, "rewards/rejected": -0.844433605670929, "step": 355 }, { "epoch": 0.09268795056642637, "grad_norm": 600.0, "learning_rate": 4.5365602471678684e-07, "logits/chosen": -2.700000047683716, "logits/rejected": -2.6078124046325684, "logps/chosen": -320.3999938964844, "logps/rejected": -354.0, "loss": 0.5773, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3797363340854645, "rewards/margins": 0.606640636920929, "rewards/rejected": -0.987500011920929, "step": 360 }, { "epoch": 0.09397528321318228, "grad_norm": 482.0, "learning_rate": 4.5301235839340883e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.621875047683716, "logps/chosen": -355.79998779296875, "logps/rejected": -346.20001220703125, "loss": 0.5578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.538281261920929, "rewards/margins": 0.761523425579071, "rewards/rejected": -1.297265648841858, "step": 365 }, { "epoch": 0.09526261585993821, "grad_norm": 532.0, "learning_rate": 4.5236869207003087e-07, "logits/chosen": -2.546875, "logits/rejected": -2.4625000953674316, "logps/chosen": -312.79998779296875, "logps/rejected": -344.0, "loss": 0.5238, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.41704100370407104, "rewards/margins": 0.755859375, "rewards/rejected": -1.1720702648162842, "step": 370 }, { "epoch": 0.09654994850669413, "grad_norm": 636.0, "learning_rate": 4.517250257466529e-07, "logits/chosen": -2.59375, "logits/rejected": -2.6343750953674316, "logps/chosen": -360.3999938964844, "logps/rejected": -383.6000061035156, "loss": 0.4766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4566406309604645, "rewards/margins": 0.883593738079071, "rewards/rejected": -1.3367187976837158, "step": 375 }, { "epoch": 0.09783728115345006, 
"grad_norm": 716.0, "learning_rate": 4.5108135942327496e-07, "logits/chosen": -2.734375, "logits/rejected": -2.746875047683716, "logps/chosen": -317.20001220703125, "logps/rejected": -280.6000061035156, "loss": 0.6539, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.42431640625, "rewards/margins": 0.3511718809604645, "rewards/rejected": -0.7757812738418579, "step": 380 }, { "epoch": 0.09912461380020597, "grad_norm": 388.0, "learning_rate": 4.50437693099897e-07, "logits/chosen": -2.6500000953674316, "logits/rejected": -2.799999952316284, "logps/chosen": -325.79998779296875, "logps/rejected": -280.125, "loss": 0.5832, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2696289122104645, "rewards/margins": 0.47607421875, "rewards/rejected": -0.745623767375946, "step": 385 }, { "epoch": 0.1004119464469619, "grad_norm": 652.0, "learning_rate": 4.49794026776519e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.614062547683716, "logps/chosen": -353.3999938964844, "logps/rejected": -360.3999938964844, "loss": 0.5223, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5284179449081421, "rewards/margins": 0.6455078125, "rewards/rejected": -1.172265648841858, "step": 390 }, { "epoch": 0.10169927909371781, "grad_norm": 358.0, "learning_rate": 4.491503604531411e-07, "logits/chosen": -2.4671874046325684, "logits/rejected": -2.403125047683716, "logps/chosen": -304.79998779296875, "logps/rejected": -258.6000061035156, "loss": 0.6465, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.50592041015625, "rewards/margins": 0.43895262479782104, "rewards/rejected": -0.9453369379043579, "step": 395 }, { "epoch": 0.10298661174047374, "grad_norm": 460.0, "learning_rate": 4.4850669412976313e-07, "logits/chosen": -2.362499952316284, "logits/rejected": -2.375, "logps/chosen": -263.79998779296875, "logps/rejected": -323.79998779296875, "loss": 0.5078, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2529296875, "rewards/margins": 
0.6742187738418579, "rewards/rejected": -0.927734375, "step": 400 }, { "epoch": 0.10427394438722966, "grad_norm": 350.0, "learning_rate": 4.478630278063851e-07, "logits/chosen": -2.559375047683716, "logits/rejected": -2.6500000953674316, "logps/chosen": -296.3999938964844, "logps/rejected": -363.79998779296875, "loss": 0.4773, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.38994139432907104, "rewards/margins": 0.843945324420929, "rewards/rejected": -1.232812523841858, "step": 405 }, { "epoch": 0.10556127703398559, "grad_norm": 564.0, "learning_rate": 4.472193614830072e-07, "logits/chosen": -2.596874952316284, "logits/rejected": -2.7593750953674316, "logps/chosen": -314.79998779296875, "logps/rejected": -334.6000061035156, "loss": 0.5164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4869140684604645, "rewards/margins": 0.69921875, "rewards/rejected": -1.185156226158142, "step": 410 }, { "epoch": 0.1068486096807415, "grad_norm": 284.0, "learning_rate": 4.4657569515962926e-07, "logits/chosen": -2.4671874046325684, "logits/rejected": -2.6468749046325684, "logps/chosen": -249.39999389648438, "logps/rejected": -257.79998779296875, "loss": 0.632, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.17958983778953552, "rewards/margins": 0.3731445372104645, "rewards/rejected": -0.552734375, "step": 415 }, { "epoch": 0.10813594232749743, "grad_norm": 474.0, "learning_rate": 4.4593202883625124e-07, "logits/chosen": -2.4828124046325684, "logits/rejected": -2.628124952316284, "logps/chosen": -314.6000061035156, "logps/rejected": -368.0, "loss": 0.4922, "rewards/accuracies": 0.75, "rewards/chosen": -0.525195300579071, "rewards/margins": 0.856249988079071, "rewards/rejected": -1.3835937976837158, "step": 420 }, { "epoch": 0.10942327497425335, "grad_norm": 436.0, "learning_rate": 4.4528836251287334e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.674999952316284, "logps/chosen": -296.5, "logps/rejected": 
-326.1000061035156, "loss": 0.6285, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.3704589903354645, "rewards/margins": 0.3965820372104645, "rewards/rejected": -0.7668823003768921, "step": 425 }, { "epoch": 0.11071060762100927, "grad_norm": 207.0, "learning_rate": 4.4464469618949533e-07, "logits/chosen": -2.534374952316284, "logits/rejected": -2.4625000953674316, "logps/chosen": -276.20001220703125, "logps/rejected": -343.79998779296875, "loss": 0.5309, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.448974609375, "rewards/margins": 0.7476562261581421, "rewards/rejected": -1.197656273841858, "step": 430 }, { "epoch": 0.11199794026776519, "grad_norm": 510.0, "learning_rate": 4.4400102986611737e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.706249952316284, "logps/chosen": -355.6000061035156, "logps/rejected": -395.20001220703125, "loss": 0.5332, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.44062501192092896, "rewards/margins": 0.7007812261581421, "rewards/rejected": -1.142187476158142, "step": 435 }, { "epoch": 0.11328527291452112, "grad_norm": 492.0, "learning_rate": 4.4335736354273947e-07, "logits/chosen": -2.59375, "logits/rejected": -2.6343750953674316, "logps/chosen": -342.3999938964844, "logps/rejected": -346.3999938964844, "loss": 0.6219, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.39082032442092896, "rewards/margins": 0.4408203065395355, "rewards/rejected": -0.83203125, "step": 440 }, { "epoch": 0.11457260556127703, "grad_norm": 334.0, "learning_rate": 4.4271369721936146e-07, "logits/chosen": -2.3499999046325684, "logits/rejected": -2.34375, "logps/chosen": -303.0, "logps/rejected": -369.0, "loss": 0.465, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7535156011581421, "rewards/margins": 0.905078113079071, "rewards/rejected": -1.65625, "step": 445 }, { "epoch": 0.11585993820803296, "grad_norm": 728.0, "learning_rate": 4.420700308959835e-07, "logits/chosen": 
-2.643749952316284, "logits/rejected": -2.471874952316284, "logps/chosen": -319.6000061035156, "logps/rejected": -400.79998779296875, "loss": 0.5664, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4706054627895355, "rewards/margins": 0.7789062261581421, "rewards/rejected": -1.25, "step": 450 }, { "epoch": 0.11714727085478888, "grad_norm": 840.0, "learning_rate": 4.414263645726055e-07, "logits/chosen": -2.546875, "logits/rejected": -2.5625, "logps/chosen": -344.79998779296875, "logps/rejected": -428.0, "loss": 0.4543, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.971875011920929, "rewards/margins": 1.189843773841858, "rewards/rejected": -2.1656250953674316, "step": 455 }, { "epoch": 0.1184346035015448, "grad_norm": 592.0, "learning_rate": 4.407826982492276e-07, "logits/chosen": -2.515625, "logits/rejected": -2.4749999046325684, "logps/chosen": -303.3999938964844, "logps/rejected": -352.0, "loss": 0.5133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.658984363079071, "rewards/margins": 0.776171863079071, "rewards/rejected": -1.435156226158142, "step": 460 }, { "epoch": 0.11972193614830072, "grad_norm": 396.0, "learning_rate": 4.4013903192584963e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.6171875, "logps/chosen": -256.8999938964844, "logps/rejected": -322.20001220703125, "loss": 0.5266, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.21452637016773224, "rewards/margins": 0.585156261920929, "rewards/rejected": -0.799609363079071, "step": 465 }, { "epoch": 0.12100926879505665, "grad_norm": 564.0, "learning_rate": 4.394953656024716e-07, "logits/chosen": -2.640625, "logits/rejected": -2.565624952316284, "logps/chosen": -325.79998779296875, "logps/rejected": -331.3999938964844, "loss": 0.5215, "rewards/accuracies": 0.6875, "rewards/chosen": -0.37651365995407104, "rewards/margins": 0.691210925579071, "rewards/rejected": -1.0671875476837158, "step": 470 }, { "epoch": 0.12229660144181256, "grad_norm": 
940.0, "learning_rate": 4.388516992790937e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.4937500953674316, "logps/chosen": -307.6000061035156, "logps/rejected": -406.3999938964844, "loss": 0.4516, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6792968511581421, "rewards/margins": 1.0437500476837158, "rewards/rejected": -1.721093773841858, "step": 475 }, { "epoch": 0.12358393408856849, "grad_norm": 247.0, "learning_rate": 4.3820803295571576e-07, "logits/chosen": -2.4593749046325684, "logits/rejected": -2.5625, "logps/chosen": -313.1000061035156, "logps/rejected": -323.45001220703125, "loss": 0.566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3075195252895355, "rewards/margins": 0.5963379144668579, "rewards/rejected": -0.9034423828125, "step": 480 }, { "epoch": 0.12487126673532441, "grad_norm": 564.0, "learning_rate": 4.3756436663233775e-07, "logits/chosen": NaN, "logits/rejected": -2.671875, "logps/chosen": -284.0, "logps/rejected": -336.23126220703125, "loss": 0.534, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4183593690395355, "rewards/margins": 0.7953125238418579, "rewards/rejected": -1.213769555091858, "step": 485 }, { "epoch": 0.12615859938208032, "grad_norm": 378.0, "learning_rate": 4.3692070030895984e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.5250000953674316, "logps/chosen": -268.0, "logps/rejected": -427.6000061035156, "loss": 0.4988, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7242187261581421, "rewards/margins": 0.9429687261581421, "rewards/rejected": -1.665624976158142, "step": 490 }, { "epoch": 0.12744593202883625, "grad_norm": 386.0, "learning_rate": 4.3627703398558183e-07, "logits/chosen": -2.4437499046325684, "logits/rejected": -2.4000000953674316, "logps/chosen": -277.20001220703125, "logps/rejected": -374.79998779296875, "loss": 0.5051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3592773377895355, "rewards/margins": 
0.6917968988418579, "rewards/rejected": -1.0509765148162842, "step": 495 }, { "epoch": 0.12873326467559218, "grad_norm": 378.0, "learning_rate": 4.356333676622039e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.2906250953674316, "logps/chosen": -276.3999938964844, "logps/rejected": -299.70001220703125, "loss": 0.5547, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3712402284145355, "rewards/margins": 0.712695300579071, "rewards/rejected": -1.0812499523162842, "step": 500 }, { "epoch": 0.1300205973223481, "grad_norm": 460.0, "learning_rate": 4.3498970133882597e-07, "logits/chosen": NaN, "logits/rejected": -2.543750047683716, "logps/chosen": -252.0, "logps/rejected": -261.3125, "loss": 0.5723, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.4310546815395355, "rewards/margins": 0.6113678216934204, "rewards/rejected": -1.0412505865097046, "step": 505 }, { "epoch": 0.131307929969104, "grad_norm": 290.0, "learning_rate": 4.3434603501544796e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.643749952316284, "logps/chosen": -317.6000061035156, "logps/rejected": -376.79998779296875, "loss": 0.4004, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.629687488079071, "rewards/margins": 1.1378905773162842, "rewards/rejected": -1.7687499523162842, "step": 510 }, { "epoch": 0.13259526261585994, "grad_norm": 256.0, "learning_rate": 4.3370236869207e-07, "logits/chosen": -2.434375047683716, "logits/rejected": -2.5843749046325684, "logps/chosen": -246.1999969482422, "logps/rejected": -273.6000061035156, "loss": 0.5527, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.32343751192092896, "rewards/margins": 0.596484363079071, "rewards/rejected": -0.920703113079071, "step": 515 }, { "epoch": 0.13388259526261587, "grad_norm": 348.0, "learning_rate": 4.330587023686921e-07, "logits/chosen": -2.559375047683716, "logits/rejected": -2.6187500953674316, "logps/chosen": -309.3999938964844, "logps/rejected": 
-384.79998779296875, "loss": 0.5207, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7164062261581421, "rewards/margins": 0.7822265625, "rewards/rejected": -1.497656226158142, "step": 520 }, { "epoch": 0.1351699279093718, "grad_norm": 340.0, "learning_rate": 4.324150360453141e-07, "logits/chosen": -2.8218750953674316, "logits/rejected": -2.8968749046325684, "logps/chosen": -342.79998779296875, "logps/rejected": -274.20001220703125, "loss": 0.6227, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.23759765923023224, "rewards/margins": 0.29987794160842896, "rewards/rejected": -0.536914050579071, "step": 525 }, { "epoch": 0.1364572605561277, "grad_norm": 233.0, "learning_rate": 4.3177136972193613e-07, "logits/chosen": -2.6015625, "logits/rejected": -2.5625, "logps/chosen": -291.0, "logps/rejected": -385.79998779296875, "loss": 0.4824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.764843761920929, "rewards/margins": 1.1536133289337158, "rewards/rejected": -1.9210937023162842, "step": 530 }, { "epoch": 0.13774459320288363, "grad_norm": 444.0, "learning_rate": 4.311277033985581e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.6343750953674316, "logps/chosen": -280.6000061035156, "logps/rejected": -359.20001220703125, "loss": 0.5461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4683593809604645, "rewards/margins": 0.621874988079071, "rewards/rejected": -1.090234398841858, "step": 535 }, { "epoch": 0.13903192584963955, "grad_norm": 516.0, "learning_rate": 4.304840370751802e-07, "logits/chosen": -2.5531249046325684, "logits/rejected": -2.249218702316284, "logps/chosen": -299.3999938964844, "logps/rejected": -350.6000061035156, "loss": 0.5734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.39970701932907104, "rewards/margins": 0.590893566608429, "rewards/rejected": -0.9903808832168579, "step": 540 }, { "epoch": 0.14031925849639545, "grad_norm": 334.0, "learning_rate": 4.2984037075180226e-07, 
"logits/chosen": -2.440624952316284, "logits/rejected": -2.543750047683716, "logps/chosen": -285.5, "logps/rejected": -326.79998779296875, "loss": 0.507, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4859375059604645, "rewards/margins": 0.8447265625, "rewards/rejected": -1.328515648841858, "step": 545 }, { "epoch": 0.14160659114315138, "grad_norm": 592.0, "learning_rate": 4.2919670442842425e-07, "logits/chosen": -2.6875, "logits/rejected": -2.734375, "logps/chosen": -332.20001220703125, "logps/rejected": -362.0, "loss": 0.484, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6968749761581421, "rewards/margins": 0.9471191167831421, "rewards/rejected": -1.6437499523162842, "step": 550 }, { "epoch": 0.1428939237899073, "grad_norm": 322.0, "learning_rate": 4.2855303810504634e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.6812500953674316, "logps/chosen": -267.6000061035156, "logps/rejected": -295.3999938964844, "loss": 0.5949, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.505078136920929, "rewards/margins": 0.4276367127895355, "rewards/rejected": -0.932812511920929, "step": 555 }, { "epoch": 0.14418125643666324, "grad_norm": 328.0, "learning_rate": 4.2790937178166833e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.565624952316284, "logps/chosen": -265.6000061035156, "logps/rejected": -304.6000061035156, "loss": 0.5012, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.38531494140625, "rewards/margins": 0.8076537847518921, "rewards/rejected": -1.190466284751892, "step": 560 }, { "epoch": 0.14546858908341914, "grad_norm": 384.0, "learning_rate": 4.272657054582904e-07, "logits/chosen": -2.5625, "logits/rejected": -2.307812452316284, "logps/chosen": -315.0, "logps/rejected": -332.20001220703125, "loss": 0.5566, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6636718511581421, "rewards/margins": 0.656933605670929, "rewards/rejected": -1.322656273841858, "step": 565 }, 
{ "epoch": 0.14675592173017507, "grad_norm": 324.0, "learning_rate": 4.2662203913491247e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.270312547683716, "logps/chosen": -289.3999938964844, "logps/rejected": -325.20001220703125, "loss": 0.5559, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5189453363418579, "rewards/margins": 0.7613281011581421, "rewards/rejected": -1.28125, "step": 570 }, { "epoch": 0.148043254376931, "grad_norm": 516.0, "learning_rate": 4.2597837281153446e-07, "logits/chosen": -2.534374952316284, "logits/rejected": -2.528125047683716, "logps/chosen": -254.89999389648438, "logps/rejected": -379.79998779296875, "loss": 0.5457, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.630175769329071, "rewards/margins": 0.8441406488418579, "rewards/rejected": -1.47265625, "step": 575 }, { "epoch": 0.14933058702368693, "grad_norm": 398.0, "learning_rate": 4.253347064881565e-07, "logits/chosen": -2.518749952316284, "logits/rejected": -2.215625047683716, "logps/chosen": -310.79998779296875, "logps/rejected": -367.3999938964844, "loss": 0.4977, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7088378667831421, "rewards/margins": 0.8902343511581421, "rewards/rejected": -1.600000023841858, "step": 580 }, { "epoch": 0.15061791967044283, "grad_norm": 608.0, "learning_rate": 4.246910401647786e-07, "logits/chosen": -2.640625, "logits/rejected": -2.621875047683716, "logps/chosen": -333.6000061035156, "logps/rejected": -349.79998779296875, "loss": 0.5375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.54150390625, "rewards/margins": 0.7373046875, "rewards/rejected": -1.278906226158142, "step": 585 }, { "epoch": 0.15190525231719876, "grad_norm": 264.0, "learning_rate": 4.240473738414006e-07, "logits/chosen": -2.7750000953674316, "logits/rejected": -2.668750047683716, "logps/chosen": -297.20001220703125, "logps/rejected": -266.0, "loss": 0.6641, "rewards/accuracies": 0.4000000059604645, 
"rewards/chosen": -0.28901976346969604, "rewards/margins": 0.22092285752296448, "rewards/rejected": -0.5107421875, "step": 590 }, { "epoch": 0.1531925849639547, "grad_norm": 274.0, "learning_rate": 4.2340370751802263e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.65625, "logps/chosen": -345.20001220703125, "logps/rejected": -354.79998779296875, "loss": 0.4492, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5157226324081421, "rewards/margins": 1.0515625476837158, "rewards/rejected": -1.567968726158142, "step": 595 }, { "epoch": 0.15447991761071062, "grad_norm": 528.0, "learning_rate": 4.227600411946447e-07, "logits/chosen": -2.59375, "logits/rejected": -2.5062499046325684, "logps/chosen": -301.3999938964844, "logps/rejected": -316.58123779296875, "loss": 0.4918, "rewards/accuracies": 0.625, "rewards/chosen": -0.700976550579071, "rewards/margins": 0.9237304925918579, "rewards/rejected": -1.6228148937225342, "step": 600 }, { "epoch": 0.15576725025746652, "grad_norm": 516.0, "learning_rate": 4.221163748712667e-07, "logits/chosen": -2.581249952316284, "logits/rejected": -2.6156249046325684, "logps/chosen": -398.79998779296875, "logps/rejected": -404.3999938964844, "loss": 0.5062, "rewards/accuracies": 0.6875, "rewards/chosen": -0.711621105670929, "rewards/margins": 0.822265625, "rewards/rejected": -1.5320312976837158, "step": 605 }, { "epoch": 0.15705458290422245, "grad_norm": 996.0, "learning_rate": 4.2147270854788876e-07, "logits/chosen": -2.6500000953674316, "logits/rejected": -2.5062499046325684, "logps/chosen": -278.20001220703125, "logps/rejected": -318.70001220703125, "loss": 0.6117, "rewards/accuracies": 0.5, "rewards/chosen": -0.64111328125, "rewards/margins": 0.73046875, "rewards/rejected": -1.368749976158142, "step": 610 }, { "epoch": 0.15834191555097837, "grad_norm": 680.0, "learning_rate": 4.208290422245108e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.700000047683716, "logps/chosen": -305.20001220703125, 
"logps/rejected": -334.3999938964844, "loss": 0.6156, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5384277105331421, "rewards/margins": 0.5941406488418579, "rewards/rejected": -1.130468726158142, "step": 615 }, { "epoch": 0.1596292481977343, "grad_norm": 612.0, "learning_rate": 4.2018537590113285e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.671875, "logps/chosen": -290.0, "logps/rejected": -410.79998779296875, "loss": 0.5148, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.675000011920929, "rewards/margins": 0.924609363079071, "rewards/rejected": -1.600000023841858, "step": 620 }, { "epoch": 0.1609165808444902, "grad_norm": 334.0, "learning_rate": 4.195417095777549e-07, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.481250047683716, "logps/chosen": -270.3999938964844, "logps/rejected": -313.3999938964844, "loss": 0.5129, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45341795682907104, "rewards/margins": 0.840624988079071, "rewards/rejected": -1.291601538658142, "step": 625 }, { "epoch": 0.16220391349124613, "grad_norm": 408.0, "learning_rate": 4.188980432543769e-07, "logits/chosen": -2.625, "logits/rejected": -2.4312500953674316, "logps/chosen": -267.6000061035156, "logps/rejected": -278.20001220703125, "loss": 0.5012, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5428711175918579, "rewards/margins": 0.840624988079071, "rewards/rejected": -1.3839843273162842, "step": 630 }, { "epoch": 0.16349124613800206, "grad_norm": 716.0, "learning_rate": 4.18254376930999e-07, "logits/chosen": -2.518749952316284, "logits/rejected": -2.4203124046325684, "logps/chosen": -327.79998779296875, "logps/rejected": -371.6000061035156, "loss": 0.4773, "rewards/accuracies": 0.6875, "rewards/chosen": -0.706372082233429, "rewards/margins": 0.876757800579071, "rewards/rejected": -1.585351586341858, "step": 635 }, { "epoch": 0.164778578784758, "grad_norm": 320.0, "learning_rate": 4.1761071060762096e-07, "logits/chosen": 
-2.700000047683716, "logits/rejected": -2.6875, "logps/chosen": -286.3999938964844, "logps/rejected": -324.3999938964844, "loss": 0.5398, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4708496034145355, "rewards/margins": 0.6630859375, "rewards/rejected": -1.1335937976837158, "step": 640 }, { "epoch": 0.1660659114315139, "grad_norm": 620.0, "learning_rate": 4.16967044284243e-07, "logits/chosen": -2.625, "logits/rejected": -2.6875, "logps/chosen": -332.0, "logps/rejected": -355.6000061035156, "loss": 0.5355, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.66259765625, "rewards/margins": 0.5775390863418579, "rewards/rejected": -1.2410156726837158, "step": 645 }, { "epoch": 0.16735324407826982, "grad_norm": 326.0, "learning_rate": 4.163233779608651e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.5625, "logps/chosen": -290.6000061035156, "logps/rejected": -280.0, "loss": 0.5684, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.28178709745407104, "rewards/margins": 0.490478515625, "rewards/rejected": -0.7718750238418579, "step": 650 }, { "epoch": 0.16864057672502575, "grad_norm": 488.0, "learning_rate": 4.156797116374871e-07, "logits/chosen": -2.426562547683716, "logits/rejected": -2.371875047683716, "logps/chosen": -192.64999389648438, "logps/rejected": -225.10000610351562, "loss": 0.6344, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.26445311307907104, "rewards/margins": 0.27167969942092896, "rewards/rejected": -0.536328136920929, "step": 655 }, { "epoch": 0.16992790937178168, "grad_norm": 488.0, "learning_rate": 4.1503604531410913e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.637500047683716, "logps/chosen": -361.6000061035156, "logps/rejected": -382.0, "loss": 0.4211, "rewards/accuracies": 0.75, "rewards/chosen": -0.48408204317092896, "rewards/margins": 1.162500023841858, "rewards/rejected": -1.6453125476837158, "step": 660 }, { "epoch": 0.17121524201853758, "grad_norm": 
604.0, "learning_rate": 4.1439237899073123e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.4781250953674316, "logps/chosen": -313.6000061035156, "logps/rejected": -431.0, "loss": 0.4898, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.78125, "rewards/margins": 1.002343773841858, "rewards/rejected": -1.7843749523162842, "step": 665 }, { "epoch": 0.1725025746652935, "grad_norm": 418.0, "learning_rate": 4.137487126673532e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.382031202316284, "logps/chosen": -253.8000030517578, "logps/rejected": -277.0, "loss": 0.5621, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.43994140625, "rewards/margins": 0.6709960699081421, "rewards/rejected": -1.109375, "step": 670 }, { "epoch": 0.17378990731204944, "grad_norm": 712.0, "learning_rate": 4.1310504634397526e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.643749952316284, "logps/chosen": -373.6000061035156, "logps/rejected": -327.6000061035156, "loss": 0.5695, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4169921875, "rewards/margins": 0.606249988079071, "rewards/rejected": -1.0234375, "step": 675 }, { "epoch": 0.17507723995880536, "grad_norm": 372.0, "learning_rate": 4.124613800205973e-07, "logits/chosen": -2.59375, "logits/rejected": -2.46875, "logps/chosen": -352.20001220703125, "logps/rejected": -456.3999938964844, "loss": 0.4562, "rewards/accuracies": 0.75, "rewards/chosen": -0.560839831829071, "rewards/margins": 1.1417968273162842, "rewards/rejected": -1.701562523841858, "step": 680 }, { "epoch": 0.17636457260556127, "grad_norm": 436.0, "learning_rate": 4.1181771369721935e-07, "logits/chosen": -2.721874952316284, "logits/rejected": -2.768749952316284, "logps/chosen": -346.0, "logps/rejected": -323.20001220703125, "loss": 0.618, "rewards/accuracies": 0.5625, "rewards/chosen": -0.37812501192092896, "rewards/margins": 0.523242175579071, "rewards/rejected": -0.901171863079071, "step": 685 }, { "epoch": 
0.1776519052523172, "grad_norm": 316.0, "learning_rate": 4.111740473738414e-07, "logits/chosen": -2.7249999046325684, "logits/rejected": -2.637500047683716, "logps/chosen": -348.79998779296875, "logps/rejected": -424.0, "loss": 0.6422, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.791796863079071, "rewards/margins": 0.5960937738418579, "rewards/rejected": -1.38671875, "step": 690 }, { "epoch": 0.17893923789907312, "grad_norm": 700.0, "learning_rate": 4.1053038105046343e-07, "logits/chosen": -2.715625047683716, "logits/rejected": -2.7281250953674316, "logps/chosen": -378.79998779296875, "logps/rejected": -397.20001220703125, "loss": 0.6078, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7066406011581421, "rewards/margins": 0.757031261920929, "rewards/rejected": -1.463281273841858, "step": 695 }, { "epoch": 0.18022657054582905, "grad_norm": 276.0, "learning_rate": 4.098867147270855e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.731250047683716, "logps/chosen": -314.20001220703125, "logps/rejected": -345.3999938964844, "loss": 0.5145, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.48261719942092896, "rewards/margins": 0.832568347454071, "rewards/rejected": -1.316015601158142, "step": 700 }, { "epoch": 0.18151390319258495, "grad_norm": 696.0, "learning_rate": 4.0924304840370747e-07, "logits/chosen": -2.5374999046325684, "logits/rejected": -2.6343750953674316, "logps/chosen": -348.0, "logps/rejected": -398.79998779296875, "loss": 0.5969, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8519531488418579, "rewards/margins": 0.7749999761581421, "rewards/rejected": -1.6257812976837158, "step": 705 }, { "epoch": 0.18280123583934088, "grad_norm": 576.0, "learning_rate": 4.085993820803295e-07, "logits/chosen": -2.768749952316284, "logits/rejected": -2.753124952316284, "logps/chosen": -353.6000061035156, "logps/rejected": -374.0, "loss": 0.6125, "rewards/accuracies": 0.5625, "rewards/chosen": 
-0.651171863079071, "rewards/margins": 0.3768554627895355, "rewards/rejected": -1.0281250476837158, "step": 710 }, { "epoch": 0.1840885684860968, "grad_norm": 476.0, "learning_rate": 4.079557157569516e-07, "logits/chosen": -2.4203124046325684, "logits/rejected": -2.746875047683716, "logps/chosen": -244.3000030517578, "logps/rejected": -265.8999938964844, "loss": 0.6094, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.17589721083641052, "rewards/margins": 0.3184570372104645, "rewards/rejected": -0.49360352754592896, "step": 715 }, { "epoch": 0.18537590113285274, "grad_norm": 452.0, "learning_rate": 4.073120494335736e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.659374952316284, "logps/chosen": -324.79998779296875, "logps/rejected": -359.20001220703125, "loss": 0.4496, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5843750238418579, "rewards/margins": 1.153906226158142, "rewards/rejected": -1.7394530773162842, "step": 720 }, { "epoch": 0.18666323377960864, "grad_norm": 474.0, "learning_rate": 4.0666838311019564e-07, "logits/chosen": -2.484375, "logits/rejected": -2.40625, "logps/chosen": -312.0, "logps/rejected": -317.0, "loss": 0.5254, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4071289002895355, "rewards/margins": 0.760986328125, "rewards/rejected": -1.169531226158142, "step": 725 }, { "epoch": 0.18795056642636457, "grad_norm": 556.0, "learning_rate": 4.0602471678681773e-07, "logits/chosen": -2.518749952316284, "logits/rejected": -2.4625000953674316, "logps/chosen": -282.3999938964844, "logps/rejected": -336.6000061035156, "loss": 0.5441, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3586181700229645, "rewards/margins": 0.6958984136581421, "rewards/rejected": -1.055761694908142, "step": 730 }, { "epoch": 0.1892378990731205, "grad_norm": 532.0, "learning_rate": 4.053810504634397e-07, "logits/chosen": -2.565624952316284, "logits/rejected": -2.5687499046325684, "logps/chosen": 
-344.3999938964844, "logps/rejected": -404.20001220703125, "loss": 0.5453, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3208984434604645, "rewards/margins": 0.614013671875, "rewards/rejected": -0.9330078363418579, "step": 735 }, { "epoch": 0.19052523171987643, "grad_norm": 520.0, "learning_rate": 4.0473738414006176e-07, "logits/chosen": -2.5625, "logits/rejected": -2.5687499046325684, "logps/chosen": -271.79998779296875, "logps/rejected": -303.20001220703125, "loss": 0.5836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.47419434785842896, "rewards/margins": 0.514453113079071, "rewards/rejected": -0.9878906011581421, "step": 740 }, { "epoch": 0.19181256436663233, "grad_norm": 214.0, "learning_rate": 4.040937178166838e-07, "logits/chosen": -2.609375, "logits/rejected": -2.746875047683716, "logps/chosen": -352.3999938964844, "logps/rejected": -344.3999938964844, "loss": 0.4633, "rewards/accuracies": 0.6875, "rewards/chosen": -0.505664050579071, "rewards/margins": 0.9593750238418579, "rewards/rejected": -1.46875, "step": 745 }, { "epoch": 0.19309989701338826, "grad_norm": 454.0, "learning_rate": 4.0345005149330585e-07, "logits/chosen": -2.4453125, "logits/rejected": -2.6031250953674316, "logps/chosen": -312.6000061035156, "logps/rejected": -315.79998779296875, "loss": 0.6016, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3690429627895355, "rewards/margins": 0.41582030057907104, "rewards/rejected": -0.784375011920929, "step": 750 }, { "epoch": 0.19438722966014418, "grad_norm": 350.0, "learning_rate": 4.028063851699279e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.765625, "logps/chosen": -348.0, "logps/rejected": -350.79998779296875, "loss": 0.577, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4150390625, "rewards/margins": 0.4375976622104645, "rewards/rejected": -0.85205078125, "step": 755 }, { "epoch": 0.1956745623069001, "grad_norm": 520.0, "learning_rate": 4.0216271884654994e-07, 
"logits/chosen": -2.471874952316284, "logits/rejected": -2.606250047683716, "logps/chosen": -310.79998779296875, "logps/rejected": -376.20001220703125, "loss": 0.5207, "rewards/accuracies": 0.75, "rewards/chosen": -0.7540038824081421, "rewards/margins": 0.7574218511581421, "rewards/rejected": -1.51171875, "step": 760 }, { "epoch": 0.196961894953656, "grad_norm": 520.0, "learning_rate": 4.01519052523172e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.6624999046325684, "logps/chosen": -279.0, "logps/rejected": -259.20001220703125, "loss": 0.6242, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.3454345762729645, "rewards/margins": 0.4268554747104645, "rewards/rejected": -0.771655261516571, "step": 765 }, { "epoch": 0.19824922760041194, "grad_norm": 608.0, "learning_rate": 4.00875386199794e-07, "logits/chosen": -2.640625, "logits/rejected": -2.753124952316284, "logps/chosen": -298.79998779296875, "logps/rejected": -330.79998779296875, "loss": 0.5184, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3304687440395355, "rewards/margins": 0.84912109375, "rewards/rejected": -1.1798827648162842, "step": 770 }, { "epoch": 0.19953656024716787, "grad_norm": 564.0, "learning_rate": 4.0023171987641606e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.721874952316284, "logps/chosen": -310.20001220703125, "logps/rejected": -346.3999938964844, "loss": 0.5242, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44892579317092896, "rewards/margins": 0.792187511920929, "rewards/rejected": -1.239843726158142, "step": 775 }, { "epoch": 0.2008238928939238, "grad_norm": 456.0, "learning_rate": 3.995880535530381e-07, "logits/chosen": -2.4765625, "logits/rejected": -2.8812499046325684, "logps/chosen": -291.3999938964844, "logps/rejected": -322.79998779296875, "loss": 0.559, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.41972655057907104, "rewards/margins": 0.582226574420929, "rewards/rejected": -1.001562476158142, 
"step": 780 }, { "epoch": 0.2021112255406797, "grad_norm": 564.0, "learning_rate": 3.989443872296601e-07, "logits/chosen": -2.543750047683716, "logits/rejected": -2.640625, "logps/chosen": -284.6000061035156, "logps/rejected": -323.0, "loss": 0.5352, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5809570550918579, "rewards/margins": 1.0, "rewards/rejected": -1.581640601158142, "step": 785 }, { "epoch": 0.20339855818743563, "grad_norm": 218.0, "learning_rate": 3.9830072090628214e-07, "logits/chosen": -2.2328124046325684, "logits/rejected": -2.4312500953674316, "logps/chosen": -314.79998779296875, "logps/rejected": -335.0, "loss": 0.4613, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7314453125, "rewards/margins": 0.996874988079071, "rewards/rejected": -1.728906273841858, "step": 790 }, { "epoch": 0.20468589083419156, "grad_norm": 416.0, "learning_rate": 3.9765705458290423e-07, "logits/chosen": -2.768749952316284, "logits/rejected": -2.8125, "logps/chosen": -319.3999938964844, "logps/rejected": -313.6000061035156, "loss": 0.5734, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.14238281548023224, "rewards/margins": 0.4925781190395355, "rewards/rejected": -0.6351562738418579, "step": 795 }, { "epoch": 0.2059732234809475, "grad_norm": 456.0, "learning_rate": 3.970133882595262e-07, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.528125047683716, "logps/chosen": -340.0, "logps/rejected": -399.6000061035156, "loss": 0.4605, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8031250238418579, "rewards/margins": 1.0509765148162842, "rewards/rejected": -1.850000023841858, "step": 800 }, { "epoch": 0.2072605561277034, "grad_norm": 398.0, "learning_rate": 3.9636972193614827e-07, "logits/chosen": -2.7718749046325684, "logits/rejected": -2.71875, "logps/chosen": -326.6000061035156, "logps/rejected": -358.3999938964844, "loss": 0.4801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3101806640625, 
"rewards/margins": 0.9136718511581421, "rewards/rejected": -1.2238280773162842, "step": 805 }, { "epoch": 0.20854788877445932, "grad_norm": 436.0, "learning_rate": 3.9572605561277036e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.2874999046325684, "logps/chosen": -314.0, "logps/rejected": -382.79998779296875, "loss": 0.4621, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.646289050579071, "rewards/margins": 0.9281250238418579, "rewards/rejected": -1.5773437023162842, "step": 810 }, { "epoch": 0.20983522142121525, "grad_norm": 394.0, "learning_rate": 3.9508238928939235e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.393749952316284, "logps/chosen": -280.20001220703125, "logps/rejected": -321.20001220703125, "loss": 0.5242, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2788330018520355, "rewards/margins": 0.732421875, "rewards/rejected": -1.0128905773162842, "step": 815 }, { "epoch": 0.21112255406797117, "grad_norm": 336.0, "learning_rate": 3.944387229660144e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.640625, "logps/chosen": -264.3999938964844, "logps/rejected": -267.6000061035156, "loss": 0.5418, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2987304627895355, "rewards/margins": 0.809094250202179, "rewards/rejected": -1.109960913658142, "step": 820 }, { "epoch": 0.21240988671472708, "grad_norm": 426.0, "learning_rate": 3.9379505664263644e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.737499952316284, "logps/chosen": -367.20001220703125, "logps/rejected": -407.6000061035156, "loss": 0.4625, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5486999750137329, "rewards/margins": 1.024999976158142, "rewards/rejected": -1.573632836341858, "step": 825 }, { "epoch": 0.213697219361483, "grad_norm": 426.0, "learning_rate": 3.931513903192585e-07, "logits/chosen": -2.809375047683716, "logits/rejected": -2.706249952316284, "logps/chosen": -324.0, 
"logps/rejected": -317.20001220703125, "loss": 0.5617, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.19072265923023224, "rewards/margins": 0.5005859136581421, "rewards/rejected": -0.692187488079071, "step": 830 }, { "epoch": 0.21498455200823893, "grad_norm": 217.0, "learning_rate": 3.925077239958805e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.721874952316284, "logps/chosen": -288.20001220703125, "logps/rejected": -316.0, "loss": 0.5047, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.3666015565395355, "rewards/margins": 0.7679687738418579, "rewards/rejected": -1.1355469226837158, "step": 835 }, { "epoch": 0.21627188465499486, "grad_norm": 290.0, "learning_rate": 3.9186405767250257e-07, "logits/chosen": -2.453125, "logits/rejected": -2.3968749046325684, "logps/chosen": -308.3999938964844, "logps/rejected": -300.6000061035156, "loss": 0.5109, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2738281190395355, "rewards/margins": 0.8349609375, "rewards/rejected": -1.10693359375, "step": 840 }, { "epoch": 0.21755921730175076, "grad_norm": 422.0, "learning_rate": 3.912203913491246e-07, "logits/chosen": -2.4312500953674316, "logits/rejected": -2.403125047683716, "logps/chosen": -306.20001220703125, "logps/rejected": -376.79998779296875, "loss": 0.4141, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0695312023162842, "rewards/margins": 1.27734375, "rewards/rejected": -2.346874952316284, "step": 845 }, { "epoch": 0.2188465499485067, "grad_norm": 394.0, "learning_rate": 3.905767250257466e-07, "logits/chosen": -2.543750047683716, "logits/rejected": -2.7562499046325684, "logps/chosen": -277.29998779296875, "logps/rejected": -278.79998779296875, "loss": 0.5621, "rewards/accuracies": 0.5625, "rewards/chosen": -0.244140625, "rewards/margins": 0.46484375, "rewards/rejected": -0.710742175579071, "step": 850 }, { "epoch": 0.22013388259526262, "grad_norm": 504.0, "learning_rate": 3.899330587023687e-07, 
"logits/chosen": -2.5875000953674316, "logits/rejected": -2.625, "logps/chosen": -320.0, "logps/rejected": -323.6000061035156, "loss": 0.5473, "rewards/accuracies": 0.625, "rewards/chosen": -0.25615233182907104, "rewards/margins": 0.675976574420929, "rewards/rejected": -0.9310547113418579, "step": 855 }, { "epoch": 0.22142121524201855, "grad_norm": 504.0, "learning_rate": 3.8928939237899074e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.628124952316284, "logps/chosen": -341.20001220703125, "logps/rejected": -335.3999938964844, "loss": 0.5336, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4906249940395355, "rewards/margins": 0.6871093511581421, "rewards/rejected": -1.173437476158142, "step": 860 }, { "epoch": 0.22270854788877445, "grad_norm": 450.0, "learning_rate": 3.886457260556127e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.715625047683716, "logps/chosen": -260.6000061035156, "logps/rejected": -253.60000610351562, "loss": 0.5719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2880859375, "rewards/margins": 0.4952148497104645, "rewards/rejected": -0.782470703125, "step": 865 }, { "epoch": 0.22399588053553038, "grad_norm": 700.0, "learning_rate": 3.8800205973223477e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.518749952316284, "logps/chosen": -277.79998779296875, "logps/rejected": -347.0, "loss": 0.6027, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.574023425579071, "rewards/margins": 0.4775634706020355, "rewards/rejected": -1.0496094226837158, "step": 870 }, { "epoch": 0.2252832131822863, "grad_norm": 544.0, "learning_rate": 3.8735839340885686e-07, "logits/chosen": -2.528125047683716, "logits/rejected": -2.5, "logps/chosen": -302.3999938964844, "logps/rejected": -432.0, "loss": 0.3508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.733203113079071, "rewards/margins": 1.689062476158142, "rewards/rejected": -2.4234375953674316, "step": 875 }, { 
"epoch": 0.22657054582904224, "grad_norm": 720.0, "learning_rate": 3.8671472708547885e-07, "logits/chosen": -2.706249952316284, "logits/rejected": -2.690624952316284, "logps/chosen": -300.70001220703125, "logps/rejected": -271.20001220703125, "loss": 0.6789, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.626147449016571, "rewards/margins": 0.38544923067092896, "rewards/rejected": -1.0105469226837158, "step": 880 }, { "epoch": 0.22785787847579814, "grad_norm": 644.0, "learning_rate": 3.860710607621009e-07, "logits/chosen": -2.5062499046325684, "logits/rejected": -2.5078125, "logps/chosen": -341.79998779296875, "logps/rejected": -378.3999938964844, "loss": 0.5129, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.540722668170929, "rewards/margins": 0.90234375, "rewards/rejected": -1.443750023841858, "step": 885 }, { "epoch": 0.22914521112255407, "grad_norm": 1304.0, "learning_rate": 3.8542739443872294e-07, "logits/chosen": -2.424999952316284, "logits/rejected": -2.5093750953674316, "logps/chosen": -329.0, "logps/rejected": -372.79998779296875, "loss": 0.5441, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.98046875, "rewards/margins": 1.0578124523162842, "rewards/rejected": -2.039843797683716, "step": 890 }, { "epoch": 0.23043254376931, "grad_norm": 354.0, "learning_rate": 3.84783728115345e-07, "logits/chosen": -2.4468750953674316, "logits/rejected": -2.46875, "logps/chosen": -264.0, "logps/rejected": -323.6000061035156, "loss": 0.5203, "rewards/accuracies": 0.625, "rewards/chosen": -0.60595703125, "rewards/margins": 0.9292968511581421, "rewards/rejected": -1.532812476158142, "step": 895 }, { "epoch": 0.23171987641606592, "grad_norm": 508.0, "learning_rate": 3.84140061791967e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.4937500953674316, "logps/chosen": -337.3999938964844, "logps/rejected": -362.3999938964844, "loss": 0.382, "rewards/accuracies": 0.8125, "rewards/chosen": -0.712890625, "rewards/margins": 
1.3390624523162842, "rewards/rejected": -2.049999952316284, "step": 900 }, { "epoch": 0.23300720906282182, "grad_norm": 342.0, "learning_rate": 3.8349639546858907e-07, "logits/chosen": -2.53125, "logits/rejected": -2.703125, "logps/chosen": -338.0, "logps/rejected": -332.6499938964844, "loss": 0.6035, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.990771472454071, "rewards/margins": 0.655517578125, "rewards/rejected": -1.646264672279358, "step": 905 }, { "epoch": 0.23429454170957775, "grad_norm": 572.0, "learning_rate": 3.828527291452111e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.721874952316284, "logps/chosen": -334.79998779296875, "logps/rejected": -372.0, "loss": 0.5457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15234375, "rewards/margins": 0.6410156488418579, "rewards/rejected": -0.794140636920929, "step": 910 }, { "epoch": 0.23558187435633368, "grad_norm": 328.0, "learning_rate": 3.8220906282183315e-07, "logits/chosen": -2.421875, "logits/rejected": -2.3609375953674316, "logps/chosen": -228.5, "logps/rejected": -330.0, "loss": 0.5328, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.22207030653953552, "rewards/margins": 0.736328125, "rewards/rejected": -0.9593750238418579, "step": 915 }, { "epoch": 0.2368692070030896, "grad_norm": 496.0, "learning_rate": 3.815653964984552e-07, "logits/chosen": -2.721874952316284, "logits/rejected": -2.7874999046325684, "logps/chosen": -232.60000610351562, "logps/rejected": -217.10000610351562, "loss": 0.6277, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.4595947265625, "rewards/margins": 0.35063475370407104, "rewards/rejected": -0.8077331781387329, "step": 920 }, { "epoch": 0.2381565396498455, "grad_norm": 680.0, "learning_rate": 3.8092173017507724e-07, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.559375047683716, "logps/chosen": -260.0, "logps/rejected": -339.6000061035156, "loss": 0.5367, "rewards/accuracies": 
0.6499999761581421, "rewards/chosen": -0.41401368379592896, "rewards/margins": 0.8277343511581421, "rewards/rejected": -1.2384765148162842, "step": 925 }, { "epoch": 0.23944387229660144, "grad_norm": 532.0, "learning_rate": 3.8027806385169923e-07, "logits/chosen": -2.534374952316284, "logits/rejected": -2.643749952316284, "logps/chosen": -305.79998779296875, "logps/rejected": -346.20001220703125, "loss": 0.5254, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9462890625, "rewards/margins": 0.9823242425918579, "rewards/rejected": -1.9293701648712158, "step": 930 }, { "epoch": 0.24073120494335737, "grad_norm": 592.0, "learning_rate": 3.796343975283213e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.6187500953674316, "logps/chosen": -297.6000061035156, "logps/rejected": -330.20001220703125, "loss": 0.7125, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.688549816608429, "rewards/margins": 0.4180664122104645, "rewards/rejected": -1.107031226158142, "step": 935 }, { "epoch": 0.2420185375901133, "grad_norm": 338.0, "learning_rate": 3.7899073120494337e-07, "logits/chosen": -2.4593749046325684, "logits/rejected": -2.473437547683716, "logps/chosen": -269.3999938964844, "logps/rejected": -304.3999938964844, "loss": 0.5621, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.34721678495407104, "rewards/margins": 0.609667956829071, "rewards/rejected": -0.9546874761581421, "step": 940 }, { "epoch": 0.2433058702368692, "grad_norm": 221.0, "learning_rate": 3.7834706488156536e-07, "logits/chosen": -2.690624952316284, "logits/rejected": -2.731250047683716, "logps/chosen": -295.0, "logps/rejected": -271.6000061035156, "loss": 0.6414, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.19416503608226776, "rewards/margins": 0.310546875, "rewards/rejected": -0.505078136920929, "step": 945 }, { "epoch": 0.24459320288362513, "grad_norm": 568.0, "learning_rate": 3.777033985581874e-07, "logits/chosen": -2.5374999046325684, 
"logits/rejected": -2.440624952316284, "logps/chosen": -262.20001220703125, "logps/rejected": -327.3999938964844, "loss": 0.5141, "rewards/accuracies": 0.625, "rewards/chosen": -0.4554687440395355, "rewards/margins": 0.794140636920929, "rewards/rejected": -1.2472655773162842, "step": 950 }, { "epoch": 0.24588053553038106, "grad_norm": 460.0, "learning_rate": 3.770597322348095e-07, "logits/chosen": -2.4749999046325684, "logits/rejected": -2.390625, "logps/chosen": -297.20001220703125, "logps/rejected": -350.79998779296875, "loss": 0.5277, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.666015625, "rewards/margins": 0.967578113079071, "rewards/rejected": -1.630468726158142, "step": 955 }, { "epoch": 0.24716786817713698, "grad_norm": 376.0, "learning_rate": 3.764160659114315e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.5093750953674316, "logps/chosen": -384.79998779296875, "logps/rejected": -418.0, "loss": 0.3975, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5166015625, "rewards/margins": 1.223046898841858, "rewards/rejected": -1.735937476158142, "step": 960 }, { "epoch": 0.24845520082389289, "grad_norm": 604.0, "learning_rate": 3.7577239958805353e-07, "logits/chosen": -2.465625047683716, "logits/rejected": -2.432812452316284, "logps/chosen": -266.79998779296875, "logps/rejected": -294.20001220703125, "loss": 0.4863, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.25535887479782104, "rewards/margins": 0.9058593511581421, "rewards/rejected": -1.160742163658142, "step": 965 }, { "epoch": 0.24974253347064881, "grad_norm": 398.0, "learning_rate": 3.7512873326467557e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.7281250953674316, "logps/chosen": -369.20001220703125, "logps/rejected": -411.20001220703125, "loss": 0.4818, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7237304449081421, "rewards/margins": 1.066796898841858, "rewards/rejected": -1.7898437976837158, "step": 970 }, { 
"epoch": 0.25102986611740474, "grad_norm": 258.0, "learning_rate": 3.744850669412976e-07, "logits/chosen": -2.6875, "logits/rejected": -2.6937499046325684, "logps/chosen": -340.79998779296875, "logps/rejected": -341.6000061035156, "loss": 0.4707, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3404785096645355, "rewards/margins": 0.96484375, "rewards/rejected": -1.306249976158142, "step": 975 }, { "epoch": 0.25231719876416064, "grad_norm": 458.0, "learning_rate": 3.7384140061791965e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.768749952316284, "logps/chosen": -339.20001220703125, "logps/rejected": -339.20001220703125, "loss": 0.5344, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4676757752895355, "rewards/margins": 0.6099609136581421, "rewards/rejected": -1.0792968273162842, "step": 980 }, { "epoch": 0.2536045314109166, "grad_norm": 560.0, "learning_rate": 3.731977342945417e-07, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.575000047683716, "logps/chosen": -362.79998779296875, "logps/rejected": -412.6000061035156, "loss": 0.5145, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.066015601158142, "rewards/margins": 0.8935546875, "rewards/rejected": -1.9597656726837158, "step": 985 }, { "epoch": 0.2548918640576725, "grad_norm": 302.0, "learning_rate": 3.7255406797116374e-07, "logits/chosen": -2.4000000953674316, "logits/rejected": -2.2203125953674316, "logps/chosen": -332.79998779296875, "logps/rejected": -442.79998779296875, "loss": 0.3113, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.720507800579071, "rewards/margins": 1.756250023841858, "rewards/rejected": -2.479687452316284, "step": 990 }, { "epoch": 0.2561791967044284, "grad_norm": 388.0, "learning_rate": 3.7191040164778573e-07, "logits/chosen": -2.424999952316284, "logits/rejected": -2.5406250953674316, "logps/chosen": -346.79998779296875, "logps/rejected": -381.20001220703125, "loss": 0.5566, "rewards/accuracies": 
0.75, "rewards/chosen": -0.948437511920929, "rewards/margins": 0.8140624761581421, "rewards/rejected": -1.7609374523162842, "step": 995 }, { "epoch": 0.25746652935118436, "grad_norm": 502.0, "learning_rate": 3.712667353244078e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.6031250953674316, "logps/chosen": -294.0, "logps/rejected": -358.0, "loss": 0.5113, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6703125238418579, "rewards/margins": 0.8753906488418579, "rewards/rejected": -1.544921875, "step": 1000 }, { "epoch": 0.25875386199794026, "grad_norm": 166.0, "learning_rate": 3.7062306900102987e-07, "logits/chosen": -2.6937499046325684, "logits/rejected": -2.659374952316284, "logps/chosen": -315.20001220703125, "logps/rejected": -325.3999938964844, "loss": 0.4934, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.48945313692092896, "rewards/margins": 0.7093750238418579, "rewards/rejected": -1.1984374523162842, "step": 1005 }, { "epoch": 0.2600411946446962, "grad_norm": 408.0, "learning_rate": 3.6997940267765186e-07, "logits/chosen": -2.746875047683716, "logits/rejected": -2.590625047683716, "logps/chosen": -373.6000061035156, "logps/rejected": -358.3999938964844, "loss": 0.4625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.26884764432907104, "rewards/margins": 0.9140625, "rewards/rejected": -1.1828124523162842, "step": 1010 }, { "epoch": 0.2613285272914521, "grad_norm": 356.0, "learning_rate": 3.6933573635427395e-07, "logits/chosen": -2.609375, "logits/rejected": -2.668750047683716, "logps/chosen": -349.3999938964844, "logps/rejected": -326.3999938964844, "loss": 0.5195, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21918945014476776, "rewards/margins": 0.597460925579071, "rewards/rejected": -0.8150390386581421, "step": 1015 }, { "epoch": 0.262615859938208, "grad_norm": 348.0, "learning_rate": 3.68692070030896e-07, "logits/chosen": -2.542187452316284, "logits/rejected": 
-2.4828124046325684, "logps/chosen": -328.3999938964844, "logps/rejected": -411.20001220703125, "loss": 0.4004, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6539062261581421, "rewards/margins": 1.06640625, "rewards/rejected": -1.72265625, "step": 1020 }, { "epoch": 0.263903192584964, "grad_norm": 486.0, "learning_rate": 3.68048403707518e-07, "logits/chosen": -2.565624952316284, "logits/rejected": -2.578125, "logps/chosen": -275.0, "logps/rejected": -269.8999938964844, "loss": 0.5168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3746093809604645, "rewards/margins": 0.721484363079071, "rewards/rejected": -1.0978515148162842, "step": 1025 }, { "epoch": 0.2651905252317199, "grad_norm": 372.0, "learning_rate": 3.6740473738414003e-07, "logits/chosen": -2.609375, "logits/rejected": -2.7249999046325684, "logps/chosen": -302.70001220703125, "logps/rejected": -346.6000061035156, "loss": 0.4949, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.41069334745407104, "rewards/margins": 0.910937488079071, "rewards/rejected": -1.3214843273162842, "step": 1030 }, { "epoch": 0.2664778578784758, "grad_norm": 370.0, "learning_rate": 3.6676107106076207e-07, "logits/chosen": -2.625, "logits/rejected": -2.4984374046325684, "logps/chosen": -241.0, "logps/rejected": -261.3999938964844, "loss": 0.5734, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.2845703065395355, "rewards/margins": 0.455078125, "rewards/rejected": -0.739453136920929, "step": 1035 }, { "epoch": 0.26776519052523173, "grad_norm": 384.0, "learning_rate": 3.661174047373841e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.515625, "logps/chosen": -322.79998779296875, "logps/rejected": -387.20001220703125, "loss": 0.4141, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6507323980331421, "rewards/margins": 1.2351562976837158, "rewards/rejected": -1.885156273841858, "step": 1040 }, { "epoch": 0.26905252317198763, "grad_norm": 456.0, "learning_rate": 
3.6547373841400616e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.481250047683716, "logps/chosen": -373.20001220703125, "logps/rejected": -369.0, "loss": 0.4723, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5655273199081421, "rewards/margins": 0.9144531488418579, "rewards/rejected": -1.4816405773162842, "step": 1045 }, { "epoch": 0.2703398558187436, "grad_norm": 334.0, "learning_rate": 3.648300720906282e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -2.674999952316284, "logps/chosen": -265.6000061035156, "logps/rejected": -308.20001220703125, "loss": 0.4902, "rewards/accuracies": 0.625, "rewards/chosen": -0.41376954317092896, "rewards/margins": 0.853710949420929, "rewards/rejected": -1.267578125, "step": 1050 }, { "epoch": 0.2716271884654995, "grad_norm": 560.0, "learning_rate": 3.6418640576725024e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.6937499046325684, "logps/chosen": -292.20001220703125, "logps/rejected": -385.20001220703125, "loss": 0.5297, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.565185546875, "rewards/margins": 0.7880859375, "rewards/rejected": -1.3527343273162842, "step": 1055 }, { "epoch": 0.2729145211122554, "grad_norm": 336.0, "learning_rate": 3.635427394438723e-07, "logits/chosen": -2.7562499046325684, "logits/rejected": -2.715625047683716, "logps/chosen": -281.20001220703125, "logps/rejected": -348.3999938964844, "loss": 0.5062, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7354491949081421, "rewards/margins": 0.8667968511581421, "rewards/rejected": -1.600000023841858, "step": 1060 }, { "epoch": 0.27420185375901135, "grad_norm": 350.0, "learning_rate": 3.6289907312049433e-07, "logits/chosen": -2.7125000953674316, "logits/rejected": -2.628124952316284, "logps/chosen": -324.3999938964844, "logps/rejected": -378.3999938964844, "loss": 0.5484, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.635546863079071, "rewards/margins": 
0.76171875, "rewards/rejected": -1.399999976158142, "step": 1065 }, { "epoch": 0.27548918640576725, "grad_norm": 592.0, "learning_rate": 3.6225540679711637e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.278125047683716, "logps/chosen": -230.39999389648438, "logps/rejected": -277.79998779296875, "loss": 0.5662, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.641796886920929, "rewards/margins": 0.8916015625, "rewards/rejected": -1.530981421470642, "step": 1070 }, { "epoch": 0.27677651905252315, "grad_norm": 382.0, "learning_rate": 3.6161174047373836e-07, "logits/chosen": -2.609375, "logits/rejected": -2.5531249046325684, "logps/chosen": -354.3999938964844, "logps/rejected": -407.20001220703125, "loss": 0.4996, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.917187511920929, "rewards/margins": 1.0554687976837158, "rewards/rejected": -1.9734375476837158, "step": 1075 }, { "epoch": 0.2780638516992791, "grad_norm": 382.0, "learning_rate": 3.6096807415036046e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.690624952316284, "logps/chosen": -271.6000061035156, "logps/rejected": -331.3999938964844, "loss": 0.4344, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.45146483182907104, "rewards/margins": 1.1640625, "rewards/rejected": -1.615625023841858, "step": 1080 }, { "epoch": 0.279351184346035, "grad_norm": 330.0, "learning_rate": 3.603244078269825e-07, "logits/chosen": -2.6875, "logits/rejected": -2.65625, "logps/chosen": -398.0, "logps/rejected": -371.79998779296875, "loss": 0.4945, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6737304925918579, "rewards/margins": 0.947265625, "rewards/rejected": -1.620507836341858, "step": 1085 }, { "epoch": 0.2806385169927909, "grad_norm": 348.0, "learning_rate": 3.596807415036045e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.6781249046325684, "logps/chosen": -257.29998779296875, "logps/rejected": -262.20001220703125, "loss": 
0.6012, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.08676757663488388, "rewards/margins": 0.38682860136032104, "rewards/rejected": -0.47294920682907104, "step": 1090 }, { "epoch": 0.28192584963954687, "grad_norm": 270.0, "learning_rate": 3.590370751802266e-07, "logits/chosen": -2.546875, "logits/rejected": -2.5546875, "logps/chosen": -274.79998779296875, "logps/rejected": -302.79998779296875, "loss": 0.5637, "rewards/accuracies": 0.5, "rewards/chosen": -0.545166015625, "rewards/margins": 0.650585949420929, "rewards/rejected": -1.1930663585662842, "step": 1095 }, { "epoch": 0.28321318228630277, "grad_norm": 492.0, "learning_rate": 3.583934088568486e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.637500047683716, "logps/chosen": -336.0, "logps/rejected": -346.3999938964844, "loss": 0.4695, "rewards/accuracies": 0.75, "rewards/chosen": -0.48750001192092896, "rewards/margins": 0.869921863079071, "rewards/rejected": -1.3585937023162842, "step": 1100 }, { "epoch": 0.2845005149330587, "grad_norm": 350.0, "learning_rate": 3.577497425334706e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.46875, "logps/chosen": -271.0, "logps/rejected": -334.79998779296875, "loss": 0.4729, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6039062738418579, "rewards/margins": 0.947265625, "rewards/rejected": -1.5515625476837158, "step": 1105 }, { "epoch": 0.2857878475798146, "grad_norm": 888.0, "learning_rate": 3.571060762100927e-07, "logits/chosen": -2.7281250953674316, "logits/rejected": -2.7718749046325684, "logps/chosen": -311.0, "logps/rejected": -281.0, "loss": 0.6227, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.639453113079071, "rewards/margins": 0.5150390863418579, "rewards/rejected": -1.1531250476837158, "step": 1110 }, { "epoch": 0.2870751802265705, "grad_norm": 396.0, "learning_rate": 3.564624098867147e-07, "logits/chosen": -2.3375000953674316, "logits/rejected": -2.4046874046325684, "logps/chosen": -273.0, 
"logps/rejected": -303.79998779296875, "loss": 0.5605, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.47539061307907104, "rewards/margins": 0.8179687261581421, "rewards/rejected": -1.29296875, "step": 1115 }, { "epoch": 0.2883625128733265, "grad_norm": 310.0, "learning_rate": 3.5581874356333674e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.590625047683716, "logps/chosen": -280.6000061035156, "logps/rejected": -348.6000061035156, "loss": 0.5039, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.662548840045929, "rewards/margins": 0.983593761920929, "rewards/rejected": -1.6492187976837158, "step": 1120 }, { "epoch": 0.2896498455200824, "grad_norm": 580.0, "learning_rate": 3.551750772399588e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.653125047683716, "logps/chosen": -304.0, "logps/rejected": -304.0, "loss": 0.5484, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5230468511581421, "rewards/margins": 0.764453113079071, "rewards/rejected": -1.2898437976837158, "step": 1125 }, { "epoch": 0.2909371781668383, "grad_norm": 612.0, "learning_rate": 3.5453141091658083e-07, "logits/chosen": -2.510937452316284, "logits/rejected": -2.668750047683716, "logps/chosen": -292.79998779296875, "logps/rejected": -310.1000061035156, "loss": 0.5656, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7801758050918579, "rewards/margins": 0.798828125, "rewards/rejected": -1.578222632408142, "step": 1130 }, { "epoch": 0.29222451081359424, "grad_norm": 440.0, "learning_rate": 3.5388774459320287e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.528125047683716, "logps/chosen": -276.3999938964844, "logps/rejected": -365.20001220703125, "loss": 0.452, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34980469942092896, "rewards/margins": 1.060156226158142, "rewards/rejected": -1.408593773841858, "step": 1135 }, { "epoch": 0.29351184346035014, "grad_norm": 344.0, "learning_rate": 
3.5324407826982486e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.6187500953674316, "logps/chosen": -296.20001220703125, "logps/rejected": -297.3999938964844, "loss": 0.5445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5495849847793579, "rewards/margins": 0.7982422113418579, "rewards/rejected": -1.348242163658142, "step": 1140 }, { "epoch": 0.2947991761071061, "grad_norm": 516.0, "learning_rate": 3.5260041194644696e-07, "logits/chosen": -2.778125047683716, "logits/rejected": -2.6656250953674316, "logps/chosen": -371.6000061035156, "logps/rejected": -386.0, "loss": 0.6254, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7578125, "rewards/margins": 0.529296875, "rewards/rejected": -1.287500023841858, "step": 1145 }, { "epoch": 0.296086508753862, "grad_norm": 668.0, "learning_rate": 3.51956745623069e-07, "logits/chosen": -2.403125047683716, "logits/rejected": -2.301562547683716, "logps/chosen": -376.3999938964844, "logps/rejected": -365.20001220703125, "loss": 0.5004, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7769531011581421, "rewards/margins": 0.8187500238418579, "rewards/rejected": -1.595312476158142, "step": 1150 }, { "epoch": 0.2973738414006179, "grad_norm": 376.0, "learning_rate": 3.51313079299691e-07, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.4593749046325684, "logps/chosen": -323.79998779296875, "logps/rejected": -374.79998779296875, "loss": 0.4648, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8050781488418579, "rewards/margins": 1.092187523841858, "rewards/rejected": -1.896875023841858, "step": 1155 }, { "epoch": 0.29866117404737386, "grad_norm": 346.0, "learning_rate": 3.506694129763131e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.359375, "logps/chosen": -328.0, "logps/rejected": -376.5, "loss": 0.4656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.058203101158142, "rewards/margins": 1.374414086341858, "rewards/rejected": 
-2.4281249046325684, "step": 1160 }, { "epoch": 0.29994850669412976, "grad_norm": 668.0, "learning_rate": 3.5002574665293513e-07, "logits/chosen": -2.5062499046325684, "logits/rejected": -2.6500000953674316, "logps/chosen": -370.79998779296875, "logps/rejected": -340.6000061035156, "loss": 0.6109, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7406250238418579, "rewards/margins": 0.6070801019668579, "rewards/rejected": -1.349218726158142, "step": 1165 }, { "epoch": 0.30123583934088566, "grad_norm": 474.0, "learning_rate": 3.493820803295571e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -2.706249952316284, "logps/chosen": -362.3999938964844, "logps/rejected": -426.79998779296875, "loss": 0.5445, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4939941465854645, "rewards/margins": 0.827343761920929, "rewards/rejected": -1.3195312023162842, "step": 1170 }, { "epoch": 0.3025231719876416, "grad_norm": 354.0, "learning_rate": 3.487384140061792e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.6812500953674316, "logps/chosen": -324.20001220703125, "logps/rejected": -352.79998779296875, "loss": 0.5668, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.566601574420929, "rewards/margins": 0.67578125, "rewards/rejected": -1.242578148841858, "step": 1175 }, { "epoch": 0.3038105046343975, "grad_norm": 988.0, "learning_rate": 3.480947476828012e-07, "logits/chosen": -2.53125, "logits/rejected": -2.643749952316284, "logps/chosen": -358.3999938964844, "logps/rejected": -343.0, "loss": 0.5379, "rewards/accuracies": 0.6875, "rewards/chosen": -0.91943359375, "rewards/margins": 0.9766601324081421, "rewards/rejected": -1.8899414539337158, "step": 1180 }, { "epoch": 0.30509783728115347, "grad_norm": 480.0, "learning_rate": 3.4745108135942325e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.5374999046325684, "logps/chosen": -371.20001220703125, "logps/rejected": -363.79998779296875, "loss": 0.4863, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6368163824081421, "rewards/margins": 1.008032202720642, "rewards/rejected": -1.6461913585662842, "step": 1185 }, { "epoch": 0.3063851699279094, "grad_norm": 422.0, "learning_rate": 3.4680741503604534e-07, "logits/chosen": -2.559375047683716, "logits/rejected": -2.59375, "logps/chosen": -363.6000061035156, "logps/rejected": -414.20001220703125, "loss": 0.5254, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.817675769329071, "rewards/margins": 0.811328113079071, "rewards/rejected": -1.6296875476837158, "step": 1190 }, { "epoch": 0.3076725025746653, "grad_norm": 382.0, "learning_rate": 3.4616374871266733e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.151562452316284, "logps/chosen": -260.20001220703125, "logps/rejected": -279.1000061035156, "loss": 0.5676, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3158203065395355, "rewards/margins": 0.705078125, "rewards/rejected": -1.0183594226837158, "step": 1195 }, { "epoch": 0.30895983522142123, "grad_norm": 490.0, "learning_rate": 3.455200823892894e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.4609375, "logps/chosen": -337.20001220703125, "logps/rejected": -337.6000061035156, "loss": 0.4848, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.56201171875, "rewards/margins": 1.0408203601837158, "rewards/rejected": -1.6038086414337158, "step": 1200 }, { "epoch": 0.31024716786817713, "grad_norm": 528.0, "learning_rate": 3.448764160659114e-07, "logits/chosen": -2.7562499046325684, "logits/rejected": -2.778125047683716, "logps/chosen": -340.79998779296875, "logps/rejected": -417.20001220703125, "loss": 0.5172, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.774609386920929, "rewards/margins": 0.824999988079071, "rewards/rejected": -1.6007812023162842, "step": 1205 }, { "epoch": 0.31153450051493303, "grad_norm": 354.0, "learning_rate": 3.4423274974253346e-07, "logits/chosen": 
-2.765625, "logits/rejected": -2.890625, "logps/chosen": -366.79998779296875, "logps/rejected": -357.6000061035156, "loss": 0.491, "rewards/accuracies": 0.75, "rewards/chosen": -0.30546873807907104, "rewards/margins": 0.893750011920929, "rewards/rejected": -1.1994140148162842, "step": 1210 }, { "epoch": 0.312821833161689, "grad_norm": 314.0, "learning_rate": 3.435890834191555e-07, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.596874952316284, "logps/chosen": -322.0, "logps/rejected": -348.0, "loss": 0.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6283203363418579, "rewards/margins": 1.0164062976837158, "rewards/rejected": -1.6437499523162842, "step": 1215 }, { "epoch": 0.3141091658084449, "grad_norm": 338.0, "learning_rate": 3.429454170957775e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.5687499046325684, "logps/chosen": -350.6000061035156, "logps/rejected": -459.6000061035156, "loss": 0.3176, "rewards/accuracies": 0.875, "rewards/chosen": -1.005468726158142, "rewards/margins": 1.6171875, "rewards/rejected": -2.6234374046325684, "step": 1220 }, { "epoch": 0.31539649845520085, "grad_norm": 580.0, "learning_rate": 3.423017507723996e-07, "logits/chosen": -2.5531249046325684, "logits/rejected": -2.734375, "logps/chosen": -284.0, "logps/rejected": -367.79998779296875, "loss": 0.4762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4580078125, "rewards/margins": 1.0750000476837158, "rewards/rejected": -1.5320312976837158, "step": 1225 }, { "epoch": 0.31668383110195675, "grad_norm": 696.0, "learning_rate": 3.4165808444902163e-07, "logits/chosen": -2.4749999046325684, "logits/rejected": -2.471874952316284, "logps/chosen": -330.20001220703125, "logps/rejected": -377.79998779296875, "loss": 0.5078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.783984363079071, "rewards/margins": 0.9566406011581421, "rewards/rejected": -1.739843726158142, "step": 1230 }, { "epoch": 0.31797116374871265, 
"grad_norm": 568.0, "learning_rate": 3.410144181256436e-07, "logits/chosen": -2.2796874046325684, "logits/rejected": -2.1953125, "logps/chosen": -334.6000061035156, "logps/rejected": -317.20001220703125, "loss": 0.6199, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7027343511581421, "rewards/margins": 0.679492175579071, "rewards/rejected": -1.38232421875, "step": 1235 }, { "epoch": 0.3192584963954686, "grad_norm": 456.0, "learning_rate": 3.403707518022657e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.7750000953674316, "logps/chosen": -304.6000061035156, "logps/rejected": -325.8999938964844, "loss": 0.5727, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.638378918170929, "rewards/margins": 0.864941418170929, "rewards/rejected": -1.504492163658142, "step": 1240 }, { "epoch": 0.3205458290422245, "grad_norm": 161.0, "learning_rate": 3.3972708547888776e-07, "logits/chosen": -2.596874952316284, "logits/rejected": -2.653125047683716, "logps/chosen": -302.20001220703125, "logps/rejected": -295.29998779296875, "loss": 0.5773, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.734362781047821, "rewards/margins": 0.745312511920929, "rewards/rejected": -1.4833984375, "step": 1245 }, { "epoch": 0.3218331616889804, "grad_norm": 402.0, "learning_rate": 3.3908341915550975e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.637500047683716, "logps/chosen": -311.3999938964844, "logps/rejected": -363.6000061035156, "loss": 0.4977, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.35102540254592896, "rewards/margins": 1.019140601158142, "rewards/rejected": -1.3689453601837158, "step": 1250 }, { "epoch": 0.32312049433573636, "grad_norm": 548.0, "learning_rate": 3.3843975283213184e-07, "logits/chosen": -2.6781249046325684, "logits/rejected": -2.5531249046325684, "logps/chosen": -322.0, "logps/rejected": -390.3999938964844, "loss": 0.5844, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 
-0.37114256620407104, "rewards/margins": 0.676562488079071, "rewards/rejected": -1.046875, "step": 1255 }, { "epoch": 0.32440782698249226, "grad_norm": 390.0, "learning_rate": 3.3779608650875383e-07, "logits/chosen": -2.671875, "logits/rejected": -2.78125, "logps/chosen": -301.3999938964844, "logps/rejected": -285.5, "loss": 0.5758, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.43476563692092896, "rewards/margins": 0.593554675579071, "rewards/rejected": -1.026757836341858, "step": 1260 }, { "epoch": 0.3256951596292482, "grad_norm": 492.0, "learning_rate": 3.371524201853759e-07, "logits/chosen": -2.706249952316284, "logits/rejected": -2.71875, "logps/chosen": -253.1999969482422, "logps/rejected": -277.6000061035156, "loss": 0.5156, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.51953125, "rewards/margins": 0.810546875, "rewards/rejected": -1.328125, "step": 1265 }, { "epoch": 0.3269824922760041, "grad_norm": 552.0, "learning_rate": 3.3650875386199797e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.6500000953674316, "logps/chosen": -317.20001220703125, "logps/rejected": -361.6000061035156, "loss": 0.5641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.785351574420929, "rewards/margins": 0.778564453125, "rewards/rejected": -1.560546875, "step": 1270 }, { "epoch": 0.32826982492276, "grad_norm": 320.0, "learning_rate": 3.3586508753861996e-07, "logits/chosen": -2.8375000953674316, "logits/rejected": -2.828125, "logps/chosen": -369.3999938964844, "logps/rejected": -355.6000061035156, "loss": 0.5535, "rewards/accuracies": 0.625, "rewards/chosen": -0.32539063692092896, "rewards/margins": 0.5765625238418579, "rewards/rejected": -0.901562511920929, "step": 1275 }, { "epoch": 0.329557157569516, "grad_norm": 704.0, "learning_rate": 3.35221421215242e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.6968750953674316, "logps/chosen": -373.6000061035156, "logps/rejected": -430.0, "loss": 0.4926, 
"rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.674023449420929, "rewards/margins": 0.954296886920929, "rewards/rejected": -1.627343773841858, "step": 1280 }, { "epoch": 0.3308444902162719, "grad_norm": 544.0, "learning_rate": 3.34577754891864e-07, "logits/chosen": -2.390625, "logits/rejected": -2.862499952316284, "logps/chosen": -311.0, "logps/rejected": -259.3500061035156, "loss": 0.6004, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.3406982421875, "rewards/margins": 0.59228515625, "rewards/rejected": -0.931445300579071, "step": 1285 }, { "epoch": 0.3321318228630278, "grad_norm": 498.0, "learning_rate": 3.339340885684861e-07, "logits/chosen": -2.4312500953674316, "logits/rejected": -2.4437499046325684, "logps/chosen": -312.79998779296875, "logps/rejected": -394.0, "loss": 0.432, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7953125238418579, "rewards/margins": 1.1648437976837158, "rewards/rejected": -1.9617187976837158, "step": 1290 }, { "epoch": 0.33341915550978374, "grad_norm": 572.0, "learning_rate": 3.3329042224510813e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.5625, "logps/chosen": -336.6000061035156, "logps/rejected": -434.3999938964844, "loss": 0.4316, "rewards/accuracies": 0.75, "rewards/chosen": -0.936718761920929, "rewards/margins": 1.216406226158142, "rewards/rejected": -2.153125047683716, "step": 1295 }, { "epoch": 0.33470648815653964, "grad_norm": 612.0, "learning_rate": 3.326467559217301e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.565624952316284, "logps/chosen": -384.20001220703125, "logps/rejected": -440.0, "loss": 0.4379, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.549304187297821, "rewards/margins": 1.1023437976837158, "rewards/rejected": -1.65234375, "step": 1300 }, { "epoch": 0.3359938208032956, "grad_norm": 354.0, "learning_rate": 3.320030895983522e-07, "logits/chosen": -2.4437499046325684, "logits/rejected": -2.612499952316284, 
"logps/chosen": -271.0, "logps/rejected": -328.0, "loss": 0.4703, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.619140625, "rewards/margins": 1.1785156726837158, "rewards/rejected": -1.799218773841858, "step": 1305 }, { "epoch": 0.3372811534500515, "grad_norm": 364.0, "learning_rate": 3.3135942327497426e-07, "logits/chosen": -2.3531250953674316, "logits/rejected": -2.0921874046325684, "logps/chosen": -283.6000061035156, "logps/rejected": -354.3999938964844, "loss": 0.4914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.48322755098342896, "rewards/margins": 1.0021483898162842, "rewards/rejected": -1.4873535633087158, "step": 1310 }, { "epoch": 0.3385684860968074, "grad_norm": 664.0, "learning_rate": 3.3071575695159625e-07, "logits/chosen": -2.378124952316284, "logits/rejected": -2.6109375953674316, "logps/chosen": -314.6000061035156, "logps/rejected": -343.3999938964844, "loss": 0.525, "rewards/accuracies": 0.625, "rewards/chosen": -0.624218761920929, "rewards/margins": 1.1277344226837158, "rewards/rejected": -1.750341773033142, "step": 1315 }, { "epoch": 0.33985581874356335, "grad_norm": 392.0, "learning_rate": 3.3007209062821835e-07, "logits/chosen": -2.671875, "logits/rejected": -2.65625, "logps/chosen": -307.3999938964844, "logps/rejected": -402.6000061035156, "loss": 0.5773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.608105480670929, "rewards/margins": 0.7007812261581421, "rewards/rejected": -1.308984398841858, "step": 1320 }, { "epoch": 0.34114315139031925, "grad_norm": 684.0, "learning_rate": 3.2942842430484033e-07, "logits/chosen": -2.5625, "logits/rejected": -2.65625, "logps/chosen": -353.6000061035156, "logps/rejected": -313.0, "loss": 0.5316, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5166015625, "rewards/margins": 0.7242187261581421, "rewards/rejected": -1.239843726158142, "step": 1325 }, { "epoch": 0.34243048403707516, "grad_norm": 422.0, "learning_rate": 3.287847579814624e-07, "logits/chosen": 
-2.242968797683716, "logits/rejected": -2.674999952316284, "logps/chosen": -286.0, "logps/rejected": -316.79998779296875, "loss": 0.5418, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.25898438692092896, "rewards/margins": 0.6605468988418579, "rewards/rejected": -0.9203125238418579, "step": 1330 }, { "epoch": 0.3437178166838311, "grad_norm": 596.0, "learning_rate": 3.2814109165808447e-07, "logits/chosen": -2.5218749046325684, "logits/rejected": -2.578125, "logps/chosen": -303.79998779296875, "logps/rejected": -354.20001220703125, "loss": 0.5348, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.73046875, "rewards/margins": 0.8394531011581421, "rewards/rejected": -1.570703148841858, "step": 1335 }, { "epoch": 0.345005149330587, "grad_norm": 372.0, "learning_rate": 3.2749742533470646e-07, "logits/chosen": -2.809375047683716, "logits/rejected": -2.846874952316284, "logps/chosen": -284.20001220703125, "logps/rejected": -305.3999938964844, "loss": 0.5289, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03170166164636612, "rewards/margins": 0.5972656011581421, "rewards/rejected": -0.630664050579071, "step": 1340 }, { "epoch": 0.34629248197734297, "grad_norm": 692.0, "learning_rate": 3.268537590113285e-07, "logits/chosen": -2.65625, "logits/rejected": -2.5625, "logps/chosen": -256.6000061035156, "logps/rejected": -298.20001220703125, "loss": 0.5687, "rewards/accuracies": 0.5625, "rewards/chosen": -0.422119140625, "rewards/margins": 0.611132800579071, "rewards/rejected": -1.031640648841858, "step": 1345 }, { "epoch": 0.34757981462409887, "grad_norm": 660.0, "learning_rate": 3.262100926879506e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.3421874046325684, "logps/chosen": -318.29998779296875, "logps/rejected": -343.6000061035156, "loss": 0.6379, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.36689454317092896, "rewards/margins": 0.4742187559604645, "rewards/rejected": -0.83984375, "step": 1350 }, { "epoch": 
0.34886714727085477, "grad_norm": 512.0, "learning_rate": 3.255664263645726e-07, "logits/chosen": -2.59375, "logits/rejected": -2.578125, "logps/chosen": -298.0, "logps/rejected": -391.20001220703125, "loss": 0.4543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.790234386920929, "rewards/margins": 1.3718750476837158, "rewards/rejected": -2.167187452316284, "step": 1355 }, { "epoch": 0.35015447991761073, "grad_norm": 506.0, "learning_rate": 3.2492276004119463e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.5999999046325684, "logps/chosen": -284.6000061035156, "logps/rejected": -326.3999938964844, "loss": 0.5086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5205078125, "rewards/margins": 0.969531238079071, "rewards/rejected": -1.4914062023162842, "step": 1360 }, { "epoch": 0.35144181256436663, "grad_norm": 344.0, "learning_rate": 3.242790937178166e-07, "logits/chosen": -2.7562499046325684, "logits/rejected": -2.8125, "logps/chosen": -328.20001220703125, "logps/rejected": -404.79998779296875, "loss": 0.5344, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.529296875, "rewards/margins": 0.769726574420929, "rewards/rejected": -1.3015625476837158, "step": 1365 }, { "epoch": 0.35272914521112253, "grad_norm": 716.0, "learning_rate": 3.236354273944387e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.5093750953674316, "logps/chosen": -289.0, "logps/rejected": -277.20001220703125, "loss": 0.6547, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.780468761920929, "rewards/margins": 0.4281249940395355, "rewards/rejected": -1.2101562023162842, "step": 1370 }, { "epoch": 0.3540164778578785, "grad_norm": 474.0, "learning_rate": 3.2299176107106076e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.684375047683716, "logps/chosen": -347.20001220703125, "logps/rejected": -440.3999938964844, "loss": 0.6012, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.640625, 
"rewards/margins": 0.568164050579071, "rewards/rejected": -1.208593726158142, "step": 1375 }, { "epoch": 0.3553038105046344, "grad_norm": 384.0, "learning_rate": 3.2234809474768275e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -2.799999952316284, "logps/chosen": -369.6000061035156, "logps/rejected": -365.20001220703125, "loss": 0.5301, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7079101800918579, "rewards/margins": 0.731640636920929, "rewards/rejected": -1.439062476158142, "step": 1380 }, { "epoch": 0.35659114315139034, "grad_norm": 616.0, "learning_rate": 3.2170442842430485e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.590625047683716, "logps/chosen": -253.1999969482422, "logps/rejected": -313.6000061035156, "loss": 0.5504, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3958984315395355, "rewards/margins": 0.6366211175918579, "rewards/rejected": -1.035070776939392, "step": 1385 }, { "epoch": 0.35787847579814624, "grad_norm": 368.0, "learning_rate": 3.2106076210092684e-07, "logits/chosen": -2.625, "logits/rejected": -2.700000047683716, "logps/chosen": -359.79998779296875, "logps/rejected": -385.6000061035156, "loss": 0.4867, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.617968738079071, "rewards/margins": 0.9408203363418579, "rewards/rejected": -1.560156226158142, "step": 1390 }, { "epoch": 0.35916580844490215, "grad_norm": 436.0, "learning_rate": 3.204170957775489e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.2640624046325684, "logps/chosen": -247.8000030517578, "logps/rejected": -292.6000061035156, "loss": 0.5367, "rewards/accuracies": 0.5, "rewards/chosen": -0.48124998807907104, "rewards/margins": 0.7593749761581421, "rewards/rejected": -1.2410156726837158, "step": 1395 }, { "epoch": 0.3604531410916581, "grad_norm": 340.0, "learning_rate": 3.19773429454171e-07, "logits/chosen": -2.7125000953674316, "logits/rejected": -2.7093749046325684, "logps/chosen": -362.0, 
"logps/rejected": -363.20001220703125, "loss": 0.4816, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.58056640625, "rewards/margins": 1.017187476158142, "rewards/rejected": -1.59765625, "step": 1400 }, { "epoch": 0.361740473738414, "grad_norm": 544.0, "learning_rate": 3.1912976313079296e-07, "logits/chosen": -2.734375, "logits/rejected": -2.746875047683716, "logps/chosen": -313.79998779296875, "logps/rejected": -306.0, "loss": 0.6359, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09511718899011612, "rewards/margins": 0.42939454317092896, "rewards/rejected": -0.524365246295929, "step": 1405 }, { "epoch": 0.3630278063851699, "grad_norm": 756.0, "learning_rate": 3.18486096807415e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.3265624046325684, "logps/chosen": -306.3999938964844, "logps/rejected": -410.0, "loss": 0.4711, "rewards/accuracies": 0.75, "rewards/chosen": -0.696484386920929, "rewards/margins": 0.9599609375, "rewards/rejected": -1.65625, "step": 1410 }, { "epoch": 0.36431513903192586, "grad_norm": 332.0, "learning_rate": 3.178424304840371e-07, "logits/chosen": -2.59375, "logits/rejected": -2.575000047683716, "logps/chosen": -310.3999938964844, "logps/rejected": -350.0, "loss": 0.5007, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.570117175579071, "rewards/margins": 1.0735352039337158, "rewards/rejected": -1.640234351158142, "step": 1415 }, { "epoch": 0.36560247167868176, "grad_norm": 446.0, "learning_rate": 3.171987641606591e-07, "logits/chosen": -2.390625, "logits/rejected": -2.503124952316284, "logps/chosen": -312.8999938964844, "logps/rejected": -348.20001220703125, "loss": 0.5516, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.862988293170929, "rewards/margins": 0.82373046875, "rewards/rejected": -1.6867187023162842, "step": 1420 }, { "epoch": 0.3668898043254377, "grad_norm": 334.0, "learning_rate": 3.1655509783728114e-07, "logits/chosen": -2.734375, "logits/rejected": 
-2.7906250953674316, "logps/chosen": -348.0, "logps/rejected": -410.3999938964844, "loss": 0.5219, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5433593988418579, "rewards/margins": 0.8355468511581421, "rewards/rejected": -1.379296898841858, "step": 1425 }, { "epoch": 0.3681771369721936, "grad_norm": 282.0, "learning_rate": 3.159114315139032e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.7249999046325684, "logps/chosen": -280.6000061035156, "logps/rejected": -244.0, "loss": 0.5773, "rewards/accuracies": 0.5, "rewards/chosen": -0.510449230670929, "rewards/margins": 0.4896484315395355, "rewards/rejected": -1.0009765625, "step": 1430 }, { "epoch": 0.3694644696189495, "grad_norm": 446.0, "learning_rate": 3.152677651905252e-07, "logits/chosen": -2.190624952316284, "logits/rejected": -2.46875, "logps/chosen": -272.20001220703125, "logps/rejected": -342.6000061035156, "loss": 0.392, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7266601324081421, "rewards/margins": 1.2765624523162842, "rewards/rejected": -2.00390625, "step": 1435 }, { "epoch": 0.3707518022657055, "grad_norm": 620.0, "learning_rate": 3.1462409886714726e-07, "logits/chosen": -2.543750047683716, "logits/rejected": -2.4437499046325684, "logps/chosen": -311.70001220703125, "logps/rejected": -325.5, "loss": 0.558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6791015863418579, "rewards/margins": 0.7354491949081421, "rewards/rejected": -1.4148437976837158, "step": 1440 }, { "epoch": 0.3720391349124614, "grad_norm": 580.0, "learning_rate": 3.1398043254376925e-07, "logits/chosen": -2.581249952316284, "logits/rejected": -2.674999952316284, "logps/chosen": -342.20001220703125, "logps/rejected": -371.20001220703125, "loss": 0.4875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6156250238418579, "rewards/margins": 0.921093761920929, "rewards/rejected": -1.536718726158142, "step": 1445 }, { "epoch": 0.3733264675592173, "grad_norm": 328.0, 
"learning_rate": 3.1333676622039135e-07, "logits/chosen": -2.840625047683716, "logits/rejected": -2.871875047683716, "logps/chosen": -350.79998779296875, "logps/rejected": -322.79998779296875, "loss": 0.5922, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.19306640326976776, "rewards/margins": 0.498046875, "rewards/rejected": -0.6910156011581421, "step": 1450 }, { "epoch": 0.37461380020597324, "grad_norm": 704.0, "learning_rate": 3.126930998970134e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.546875, "logps/chosen": -316.0, "logps/rejected": -367.20001220703125, "loss": 0.4922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.938281238079071, "rewards/margins": 0.942187488079071, "rewards/rejected": -1.8820312023162842, "step": 1455 }, { "epoch": 0.37590113285272914, "grad_norm": 406.0, "learning_rate": 3.120494335736354e-07, "logits/chosen": -2.515625, "logits/rejected": -2.690624952316284, "logps/chosen": -329.0, "logps/rejected": -356.3999938964844, "loss": 0.6016, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.534167468547821, "rewards/margins": 0.587554931640625, "rewards/rejected": -1.120703101158142, "step": 1460 }, { "epoch": 0.3771884654994851, "grad_norm": 342.0, "learning_rate": 3.114057672502575e-07, "logits/chosen": -2.65625, "logits/rejected": -2.640625, "logps/chosen": -334.0, "logps/rejected": -366.3999938964844, "loss": 0.4645, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4637695252895355, "rewards/margins": 1.031640648841858, "rewards/rejected": -1.495703101158142, "step": 1465 }, { "epoch": 0.378475798146241, "grad_norm": 292.0, "learning_rate": 3.1076210092687947e-07, "logits/chosen": -2.671875, "logits/rejected": -2.6500000953674316, "logps/chosen": -315.3999938964844, "logps/rejected": -304.6000061035156, "loss": 0.5324, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24667969346046448, "rewards/margins": 0.838671863079071, "rewards/rejected": -1.086523413658142, 
"step": 1470 }, { "epoch": 0.3797631307929969, "grad_norm": 592.0, "learning_rate": 3.101184346035015e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -2.8187499046325684, "logps/chosen": -342.3999938964844, "logps/rejected": -387.0, "loss": 0.518, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5988525152206421, "rewards/margins": 0.7466796636581421, "rewards/rejected": -1.3450195789337158, "step": 1475 }, { "epoch": 0.38105046343975285, "grad_norm": 334.0, "learning_rate": 3.094747682801236e-07, "logits/chosen": -2.6875, "logits/rejected": -2.022937059402466, "logps/chosen": -224.0500030517578, "logps/rejected": -298.29998779296875, "loss": 0.6156, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.4722656309604645, "rewards/margins": 0.4546875059604645, "rewards/rejected": -0.926257312297821, "step": 1480 }, { "epoch": 0.38233779608650875, "grad_norm": 366.0, "learning_rate": 3.088311019567456e-07, "logits/chosen": -2.596874952316284, "logits/rejected": -2.59375, "logps/chosen": -334.0, "logps/rejected": -367.20001220703125, "loss": 0.4092, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.43956297636032104, "rewards/margins": 1.181640625, "rewards/rejected": -1.6242187023162842, "step": 1485 }, { "epoch": 0.38362512873326465, "grad_norm": 438.0, "learning_rate": 3.0818743563336764e-07, "logits/chosen": -2.6875, "logits/rejected": -2.596874952316284, "logps/chosen": -329.20001220703125, "logps/rejected": -335.20001220703125, "loss": 0.4738, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.396728515625, "rewards/margins": 0.925000011920929, "rewards/rejected": -1.32421875, "step": 1490 }, { "epoch": 0.3849124613800206, "grad_norm": 374.0, "learning_rate": 3.0754376930998973e-07, "logits/chosen": -2.762500047683716, "logits/rejected": -2.75, "logps/chosen": -310.0, "logps/rejected": -311.79998779296875, "loss": 0.5715, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.3052734434604645, 
"rewards/margins": 0.619140625, "rewards/rejected": -0.9228515625, "step": 1495 }, { "epoch": 0.3861997940267765, "grad_norm": 356.0, "learning_rate": 3.069001029866117e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.671875, "logps/chosen": -306.79998779296875, "logps/rejected": -355.3999938964844, "loss": 0.5391, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.43651121854782104, "rewards/margins": 0.9208008050918579, "rewards/rejected": -1.3562500476837158, "step": 1500 }, { "epoch": 0.38748712667353247, "grad_norm": 480.0, "learning_rate": 3.0625643666323377e-07, "logits/chosen": -2.6937499046325684, "logits/rejected": -2.762500047683716, "logps/chosen": -336.0, "logps/rejected": -398.3999938964844, "loss": 0.5672, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.588623046875, "rewards/margins": 0.720507800579071, "rewards/rejected": -1.310937523841858, "step": 1505 }, { "epoch": 0.38877445932028837, "grad_norm": 612.0, "learning_rate": 3.056127703398558e-07, "logits/chosen": -2.671875, "logits/rejected": -2.5140624046325684, "logps/chosen": -338.20001220703125, "logps/rejected": -295.20001220703125, "loss": 0.6117, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5210937261581421, "rewards/margins": 0.42890626192092896, "rewards/rejected": -0.9507812261581421, "step": 1510 }, { "epoch": 0.39006179196704427, "grad_norm": 280.0, "learning_rate": 3.0496910401647785e-07, "logits/chosen": -2.8218750953674316, "logits/rejected": -2.5999999046325684, "logps/chosen": -329.6000061035156, "logps/rejected": -347.20001220703125, "loss": 0.6309, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.49482423067092896, "rewards/margins": 0.5938476324081421, "rewards/rejected": -1.08984375, "step": 1515 }, { "epoch": 0.3913491246138002, "grad_norm": 520.0, "learning_rate": 3.043254376930999e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.5, "logps/chosen": -333.0, "logps/rejected": 
-432.79998779296875, "loss": 0.4402, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8023437261581421, "rewards/margins": 1.189062476158142, "rewards/rejected": -1.9890625476837158, "step": 1520 }, { "epoch": 0.3926364572605561, "grad_norm": 434.0, "learning_rate": 3.036817713697219e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.671875, "logps/chosen": -264.79998779296875, "logps/rejected": -266.3999938964844, "loss": 0.5309, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.185302734375, "rewards/margins": 0.637890636920929, "rewards/rejected": -0.8226562738418579, "step": 1525 }, { "epoch": 0.393923789907312, "grad_norm": 548.0, "learning_rate": 3.03038105046344e-07, "logits/chosen": -2.487499952316284, "logits/rejected": -2.575000047683716, "logps/chosen": -280.20001220703125, "logps/rejected": -354.29998779296875, "loss": 0.5379, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.482421875, "rewards/margins": 0.8974853754043579, "rewards/rejected": -1.3796875476837158, "step": 1530 }, { "epoch": 0.395211122554068, "grad_norm": 322.0, "learning_rate": 3.0239443872296597e-07, "logits/chosen": -2.5843749046325684, "logits/rejected": -2.6468749046325684, "logps/chosen": -297.6000061035156, "logps/rejected": -337.3999938964844, "loss": 0.5316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6875, "rewards/margins": 0.814257800579071, "rewards/rejected": -1.5, "step": 1535 }, { "epoch": 0.3964984552008239, "grad_norm": 524.0, "learning_rate": 3.01750772399588e-07, "logits/chosen": -2.515625, "logits/rejected": -2.596874952316284, "logps/chosen": -346.6000061035156, "logps/rejected": -406.3999938964844, "loss": 0.5609, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.571240246295929, "rewards/margins": 0.7984374761581421, "rewards/rejected": -1.368749976158142, "step": 1540 }, { "epoch": 0.39778578784757984, "grad_norm": 274.0, "learning_rate": 3.011071060762101e-07, "logits/chosen": 
-2.3671875, "logits/rejected": -2.4546875953674316, "logps/chosen": -273.6000061035156, "logps/rejected": -356.3999938964844, "loss": 0.4395, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3473877012729645, "rewards/margins": 1.048828125, "rewards/rejected": -1.3984375, "step": 1545 }, { "epoch": 0.39907312049433574, "grad_norm": 704.0, "learning_rate": 3.004634397528321e-07, "logits/chosen": -2.5843749046325684, "logits/rejected": -2.6171875, "logps/chosen": -300.6000061035156, "logps/rejected": -288.70001220703125, "loss": 0.5832, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.608203113079071, "rewards/margins": 0.650585949420929, "rewards/rejected": -1.2589843273162842, "step": 1550 }, { "epoch": 0.40036045314109164, "grad_norm": 584.0, "learning_rate": 2.9981977342945414e-07, "logits/chosen": -2.612499952316284, "logits/rejected": -2.674999952316284, "logps/chosen": -328.6000061035156, "logps/rejected": -376.0, "loss": 0.5266, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6019531488418579, "rewards/margins": 0.810351550579071, "rewards/rejected": -1.4109375476837158, "step": 1555 }, { "epoch": 0.4016477857878476, "grad_norm": 636.0, "learning_rate": 2.9917610710607623e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -2.643749952316284, "logps/chosen": -373.6000061035156, "logps/rejected": -371.6000061035156, "loss": 0.4863, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3636230528354645, "rewards/margins": 0.875, "rewards/rejected": -1.23828125, "step": 1560 }, { "epoch": 0.4029351184346035, "grad_norm": 616.0, "learning_rate": 2.985324407826982e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.6156249046325684, "logps/chosen": -312.6000061035156, "logps/rejected": -313.3999938964844, "loss": 0.5816, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.3766113221645355, "rewards/margins": 0.644726574420929, "rewards/rejected": -1.020898461341858, "step": 1565 }, 
{ "epoch": 0.4042224510813594, "grad_norm": 444.0, "learning_rate": 2.9788877445932027e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.596874952316284, "logps/chosen": -334.0, "logps/rejected": -392.79998779296875, "loss": 0.5227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.032812476158142, "rewards/margins": 0.974609375, "rewards/rejected": -2.0078125, "step": 1570 }, { "epoch": 0.40550978372811536, "grad_norm": 544.0, "learning_rate": 2.972451081359423e-07, "logits/chosen": -2.339062452316284, "logits/rejected": -2.3218750953674316, "logps/chosen": -304.3999938964844, "logps/rejected": -397.20001220703125, "loss": 0.4391, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0343749523162842, "rewards/margins": 1.4890625476837158, "rewards/rejected": -2.528125047683716, "step": 1575 }, { "epoch": 0.40679711637487126, "grad_norm": 390.0, "learning_rate": 2.9660144181256435e-07, "logits/chosen": -2.746875047683716, "logits/rejected": -2.8062500953674316, "logps/chosen": -264.0, "logps/rejected": -232.0, "loss": 0.552, "rewards/accuracies": 0.5625, "rewards/chosen": -0.32587891817092896, "rewards/margins": 0.58984375, "rewards/rejected": -0.9175781011581421, "step": 1580 }, { "epoch": 0.4080844490216272, "grad_norm": 302.0, "learning_rate": 2.959577754891864e-07, "logits/chosen": -2.7750000953674316, "logits/rejected": -2.7437500953674316, "logps/chosen": -327.20001220703125, "logps/rejected": -308.3999938964844, "loss": 0.6047, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3077148497104645, "rewards/margins": 0.551562488079071, "rewards/rejected": -0.858593761920929, "step": 1585 }, { "epoch": 0.4093717816683831, "grad_norm": 390.0, "learning_rate": 2.9531410916580844e-07, "logits/chosen": -2.71875, "logits/rejected": -2.596874952316284, "logps/chosen": -283.20001220703125, "logps/rejected": -331.79998779296875, "loss": 0.4863, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3466796875, 
"rewards/margins": 0.8349609375, "rewards/rejected": -1.18359375, "step": 1590 }, { "epoch": 0.410659114315139, "grad_norm": 528.0, "learning_rate": 2.946704428424305e-07, "logits/chosen": -2.731250047683716, "logits/rejected": -2.534374952316284, "logps/chosen": -310.0, "logps/rejected": -320.20001220703125, "loss": 0.4871, "rewards/accuracies": 0.625, "rewards/chosen": -0.31147462129592896, "rewards/margins": 0.844921886920929, "rewards/rejected": -1.1554687023162842, "step": 1595 }, { "epoch": 0.411946446961895, "grad_norm": 476.0, "learning_rate": 2.940267765190525e-07, "logits/chosen": -2.625, "logits/rejected": -2.737499952316284, "logps/chosen": -277.20001220703125, "logps/rejected": -281.20001220703125, "loss": 0.5617, "rewards/accuracies": 0.5, "rewards/chosen": -0.2760009765625, "rewards/margins": 0.48261719942092896, "rewards/rejected": -0.7593749761581421, "step": 1600 }, { "epoch": 0.4132337796086509, "grad_norm": 418.0, "learning_rate": 2.933831101956745e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.5250000953674316, "logps/chosen": -297.0, "logps/rejected": -295.20001220703125, "loss": 0.5078, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2992187440395355, "rewards/margins": 0.8158203363418579, "rewards/rejected": -1.114233374595642, "step": 1605 }, { "epoch": 0.4145211122554068, "grad_norm": 243.0, "learning_rate": 2.927394438722966e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.6781249046325684, "logps/chosen": -280.0, "logps/rejected": -342.3999938964844, "loss": 0.4707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5245116949081421, "rewards/margins": 0.989453136920929, "rewards/rejected": -1.5125000476837158, "step": 1610 }, { "epoch": 0.41580844490216273, "grad_norm": 500.0, "learning_rate": 2.920957775489186e-07, "logits/chosen": -2.534374952316284, "logits/rejected": -2.5250000953674316, "logps/chosen": -331.20001220703125, "logps/rejected": -366.3999938964844, "loss": 
0.6094, "rewards/accuracies": 0.5625, "rewards/chosen": -0.694140613079071, "rewards/margins": 0.629101574420929, "rewards/rejected": -1.321874976158142, "step": 1615 }, { "epoch": 0.41709577754891863, "grad_norm": 336.0, "learning_rate": 2.9145211122554064e-07, "logits/chosen": -2.799999952316284, "logits/rejected": -2.609375, "logps/chosen": -322.0, "logps/rejected": -304.6000061035156, "loss": 0.6414, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.39167481660842896, "rewards/margins": 0.4271484315395355, "rewards/rejected": -0.818652331829071, "step": 1620 }, { "epoch": 0.41838311019567453, "grad_norm": 486.0, "learning_rate": 2.9080844490216274e-07, "logits/chosen": -2.700000047683716, "logits/rejected": -2.528125047683716, "logps/chosen": -265.79998779296875, "logps/rejected": -285.6000061035156, "loss": 0.6211, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.50439453125, "rewards/margins": 0.573437511920929, "rewards/rejected": -1.077734351158142, "step": 1625 }, { "epoch": 0.4196704428424305, "grad_norm": 418.0, "learning_rate": 2.9016477857878473e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.7562499046325684, "logps/chosen": -334.3999938964844, "logps/rejected": -329.3999938964844, "loss": 0.6121, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4639648497104645, "rewards/margins": 0.58203125, "rewards/rejected": -1.0441405773162842, "step": 1630 }, { "epoch": 0.4209577754891864, "grad_norm": 450.0, "learning_rate": 2.8952111225540677e-07, "logits/chosen": -2.1734375953674316, "logits/rejected": -2.168750047683716, "logps/chosen": -291.20001220703125, "logps/rejected": -375.79998779296875, "loss": 0.4219, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.65478515625, "rewards/margins": 1.3515625, "rewards/rejected": -2.005627393722534, "step": 1635 }, { "epoch": 0.42224510813594235, "grad_norm": 360.0, "learning_rate": 2.8887744593202886e-07, "logits/chosen": -2.918750047683716, 
"logits/rejected": -2.753124952316284, "logps/chosen": -337.3999938964844, "logps/rejected": -349.6000061035156, "loss": 0.5273, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15771484375, "rewards/margins": 0.649218738079071, "rewards/rejected": -0.8082031011581421, "step": 1640 }, { "epoch": 0.42353244078269825, "grad_norm": 498.0, "learning_rate": 2.8823377960865085e-07, "logits/chosen": -2.71875, "logits/rejected": -2.7406249046325684, "logps/chosen": -322.3999938964844, "logps/rejected": -343.6000061035156, "loss": 0.6262, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4129882752895355, "rewards/margins": 0.546704113483429, "rewards/rejected": -0.960156261920929, "step": 1645 }, { "epoch": 0.42481977342945415, "grad_norm": 302.0, "learning_rate": 2.875901132852729e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.871875047683716, "logps/chosen": -277.3999938964844, "logps/rejected": -327.3999938964844, "loss": 0.4941, "rewards/accuracies": 0.625, "rewards/chosen": -0.2584472596645355, "rewards/margins": 0.8306640386581421, "rewards/rejected": -1.0890624523162842, "step": 1650 }, { "epoch": 0.4261071060762101, "grad_norm": 564.0, "learning_rate": 2.8694644696189494e-07, "logits/chosen": -2.762500047683716, "logits/rejected": -2.7281250953674316, "logps/chosen": -274.20001220703125, "logps/rejected": -326.3999938964844, "loss": 0.5543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.326171875, "rewards/margins": 0.646289050579071, "rewards/rejected": -0.97265625, "step": 1655 }, { "epoch": 0.427394438722966, "grad_norm": 414.0, "learning_rate": 2.86302780638517e-07, "logits/chosen": -2.53125, "logits/rejected": -2.6156249046325684, "logps/chosen": -399.6000061035156, "logps/rejected": -450.0, "loss": 0.4133, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5849609375, "rewards/margins": 1.19140625, "rewards/rejected": -1.774999976158142, "step": 1660 }, { "epoch": 0.4286817713697219, 
"grad_norm": 244.0, "learning_rate": 2.85659114315139e-07, "logits/chosen": -2.4375, "logits/rejected": -2.4359374046325684, "logps/chosen": -289.0, "logps/rejected": -415.79998779296875, "loss": 0.516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3701171875, "rewards/margins": 0.823437511920929, "rewards/rejected": -1.195703148841858, "step": 1665 }, { "epoch": 0.42996910401647787, "grad_norm": 482.0, "learning_rate": 2.8501544799176107e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -2.59375, "logps/chosen": -323.79998779296875, "logps/rejected": -328.3999938964844, "loss": 0.5625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.24101562798023224, "rewards/margins": 0.595507800579071, "rewards/rejected": -0.837109386920929, "step": 1670 }, { "epoch": 0.43125643666323377, "grad_norm": 624.0, "learning_rate": 2.843717816683831e-07, "logits/chosen": -2.5625, "logits/rejected": -2.653125047683716, "logps/chosen": -244.1999969482422, "logps/rejected": -231.1999969482422, "loss": 0.6105, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.298583984375, "rewards/margins": 0.4479003846645355, "rewards/rejected": -0.7467285394668579, "step": 1675 }, { "epoch": 0.4325437693099897, "grad_norm": 612.0, "learning_rate": 2.837281153450051e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.6624999046325684, "logps/chosen": -321.79998779296875, "logps/rejected": -304.20001220703125, "loss": 0.5484, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.729296863079071, "rewards/margins": 0.8404296636581421, "rewards/rejected": -1.571435570716858, "step": 1680 }, { "epoch": 0.4338311019567456, "grad_norm": 648.0, "learning_rate": 2.830844490216272e-07, "logits/chosen": -2.65625, "logits/rejected": -2.7906250953674316, "logps/chosen": -374.0, "logps/rejected": -353.6000061035156, "loss": 0.5738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.48131102323532104, "rewards/margins": 
0.5121093988418579, "rewards/rejected": -0.9925781488418579, "step": 1685 }, { "epoch": 0.4351184346035015, "grad_norm": 494.0, "learning_rate": 2.8244078269824924e-07, "logits/chosen": -2.559375047683716, "logits/rejected": -2.575000047683716, "logps/chosen": -348.0, "logps/rejected": -394.0, "loss": 0.5102, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.772265613079071, "rewards/margins": 1.015625, "rewards/rejected": -1.790624976158142, "step": 1690 }, { "epoch": 0.4364057672502575, "grad_norm": 444.0, "learning_rate": 2.8179711637487123e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.4156250953674316, "logps/chosen": -308.6000061035156, "logps/rejected": -381.20001220703125, "loss": 0.434, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8333984613418579, "rewards/margins": 1.2000000476837158, "rewards/rejected": -2.0335936546325684, "step": 1695 }, { "epoch": 0.4376930998970134, "grad_norm": 225.0, "learning_rate": 2.8115345005149327e-07, "logits/chosen": -2.5953125953674316, "logits/rejected": -2.4625000953674316, "logps/chosen": -312.0, "logps/rejected": -315.20001220703125, "loss": 0.6117, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.19406738877296448, "rewards/margins": 0.423828125, "rewards/rejected": -0.6181640625, "step": 1700 }, { "epoch": 0.4389804325437693, "grad_norm": 358.0, "learning_rate": 2.8050978372811537e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.484375, "logps/chosen": -308.3999938964844, "logps/rejected": -334.79998779296875, "loss": 0.5523, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3523925840854645, "rewards/margins": 0.615429699420929, "rewards/rejected": -0.969531238079071, "step": 1705 }, { "epoch": 0.44026776519052524, "grad_norm": 704.0, "learning_rate": 2.7986611740473736e-07, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.4625000953674316, "logps/chosen": -360.0, "logps/rejected": -454.3999938964844, "loss": 
0.4453, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.830859363079071, "rewards/margins": 1.365625023841858, "rewards/rejected": -2.196093797683716, "step": 1710 }, { "epoch": 0.44155509783728114, "grad_norm": 504.0, "learning_rate": 2.792224510813594e-07, "logits/chosen": -2.768749952316284, "logits/rejected": -2.0, "logps/chosen": -267.79998779296875, "logps/rejected": -377.6000061035156, "loss": 0.4676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6625000238418579, "rewards/margins": 1.2062499523162842, "rewards/rejected": -1.868749976158142, "step": 1715 }, { "epoch": 0.4428424304840371, "grad_norm": 472.0, "learning_rate": 2.7857878475798144e-07, "logits/chosen": -2.737499952316284, "logits/rejected": -2.7281250953674316, "logps/chosen": -379.20001220703125, "logps/rejected": -377.20001220703125, "loss": 0.5449, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.606640636920929, "rewards/margins": 0.7144531011581421, "rewards/rejected": -1.3200194835662842, "step": 1720 }, { "epoch": 0.444129763130793, "grad_norm": 308.0, "learning_rate": 2.779351184346035e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.671875, "logps/chosen": -288.79998779296875, "logps/rejected": -286.6000061035156, "loss": 0.5098, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.34375, "rewards/margins": 0.697265625, "rewards/rejected": -1.039453148841858, "step": 1725 }, { "epoch": 0.4454170957775489, "grad_norm": 316.0, "learning_rate": 2.7729145211122553e-07, "logits/chosen": -2.7562499046325684, "logits/rejected": -2.731250047683716, "logps/chosen": -363.6000061035156, "logps/rejected": -388.79998779296875, "loss": 0.5727, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5375000238418579, "rewards/margins": 0.7339843511581421, "rewards/rejected": -1.2726562023162842, "step": 1730 }, { "epoch": 0.44670442842430486, "grad_norm": 249.0, "learning_rate": 2.7664778578784757e-07, "logits/chosen": 
-2.543750047683716, "logits/rejected": -2.2914061546325684, "logps/chosen": -266.20001220703125, "logps/rejected": -254.39999389648438, "loss": 0.5875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.1131591796875, "rewards/margins": 0.39472657442092896, "rewards/rejected": -0.5074218511581421, "step": 1735 }, { "epoch": 0.44799176107106076, "grad_norm": 556.0, "learning_rate": 2.760041194644696e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.4625000953674316, "logps/chosen": -281.3999938964844, "logps/rejected": -319.6000061035156, "loss": 0.4645, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.615234375, "rewards/margins": 1.181640625, "rewards/rejected": -1.796875, "step": 1740 }, { "epoch": 0.44927909371781666, "grad_norm": 464.0, "learning_rate": 2.7536045314109166e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.828125, "logps/chosen": -242.39999389648438, "logps/rejected": -229.60000610351562, "loss": 0.598, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.22138671576976776, "rewards/margins": 0.46049803495407104, "rewards/rejected": -0.682421863079071, "step": 1745 }, { "epoch": 0.4505664263645726, "grad_norm": 600.0, "learning_rate": 2.747167868177137e-07, "logits/chosen": -2.6937499046325684, "logits/rejected": -2.6812500953674316, "logps/chosen": -349.20001220703125, "logps/rejected": -398.79998779296875, "loss": 0.5797, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.903515636920929, "rewards/margins": 0.818164050579071, "rewards/rejected": -1.7255370616912842, "step": 1750 }, { "epoch": 0.4518537590113285, "grad_norm": 274.0, "learning_rate": 2.7407312049433574e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.8031249046325684, "logps/chosen": -354.0, "logps/rejected": -310.20001220703125, "loss": 0.5184, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2505859434604645, "rewards/margins": 0.7109375, "rewards/rejected": -0.9609375, 
"step": 1755 }, { "epoch": 0.45314109165808447, "grad_norm": 374.0, "learning_rate": 2.7342945417095773e-07, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.4281249046325684, "logps/chosen": -275.6000061035156, "logps/rejected": -308.6000061035156, "loss": 0.5828, "rewards/accuracies": 0.5625, "rewards/chosen": -0.579785168170929, "rewards/margins": 0.724609375, "rewards/rejected": -1.303125023841858, "step": 1760 }, { "epoch": 0.4544284243048404, "grad_norm": 428.0, "learning_rate": 2.727857878475798e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.6781249046325684, "logps/chosen": -332.0, "logps/rejected": -418.0, "loss": 0.5473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5425781011581421, "rewards/margins": 0.766308605670929, "rewards/rejected": -1.3097655773162842, "step": 1765 }, { "epoch": 0.4557157569515963, "grad_norm": 300.0, "learning_rate": 2.7214212152420187e-07, "logits/chosen": -2.375, "logits/rejected": -2.4281249046325684, "logps/chosen": -313.20001220703125, "logps/rejected": -358.3999938964844, "loss": 0.4758, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.39960938692092896, "rewards/margins": 0.940234363079071, "rewards/rejected": -1.3380858898162842, "step": 1770 }, { "epoch": 0.45700308959835223, "grad_norm": 450.0, "learning_rate": 2.7149845520082386e-07, "logits/chosen": -2.5015625953674316, "logits/rejected": -2.6875, "logps/chosen": -366.0, "logps/rejected": -342.0, "loss": 0.4711, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3873046934604645, "rewards/margins": 0.994140625, "rewards/rejected": -1.3831055164337158, "step": 1775 }, { "epoch": 0.45829042224510813, "grad_norm": 644.0, "learning_rate": 2.708547888774459e-07, "logits/chosen": -2.684375047683716, "logits/rejected": -2.7093749046325684, "logps/chosen": -337.20001220703125, "logps/rejected": -371.20001220703125, "loss": 0.5773, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 
-0.6231445074081421, "rewards/margins": 0.66796875, "rewards/rejected": -1.291406273841858, "step": 1780 }, { "epoch": 0.45957775489186403, "grad_norm": 450.0, "learning_rate": 2.70211122554068e-07, "logits/chosen": -2.7281250953674316, "logits/rejected": -2.6875, "logps/chosen": -315.3999938964844, "logps/rejected": -311.6000061035156, "loss": 0.5293, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5166015625, "rewards/margins": 0.8271484375, "rewards/rejected": -1.3449218273162842, "step": 1785 }, { "epoch": 0.46086508753862, "grad_norm": 178.0, "learning_rate": 2.6956745623069e-07, "logits/chosen": -2.534374952316284, "logits/rejected": -2.715625047683716, "logps/chosen": -278.6000061035156, "logps/rejected": -295.3999938964844, "loss": 0.5703, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3626464903354645, "rewards/margins": 0.8878418207168579, "rewards/rejected": -1.2488281726837158, "step": 1790 }, { "epoch": 0.4621524201853759, "grad_norm": 262.0, "learning_rate": 2.6892378990731203e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.674999952316284, "logps/chosen": -314.20001220703125, "logps/rejected": -343.6000061035156, "loss": 0.5262, "rewards/accuracies": 0.625, "rewards/chosen": -0.3267578184604645, "rewards/margins": 0.8929687738418579, "rewards/rejected": -1.2194335460662842, "step": 1795 }, { "epoch": 0.46343975283213185, "grad_norm": 688.0, "learning_rate": 2.6828012358393407e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.65625, "logps/chosen": -280.3999938964844, "logps/rejected": -370.0, "loss": 0.5156, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.34184569120407104, "rewards/margins": 0.9976562261581421, "rewards/rejected": -1.337499976158142, "step": 1800 }, { "epoch": 0.46472708547888775, "grad_norm": 404.0, "learning_rate": 2.676364572605561e-07, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.6656250953674316, "logps/chosen": -211.8000030517578, 
"logps/rejected": -238.89999389648438, "loss": 0.6211, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.297341912984848, "rewards/margins": 0.42402344942092896, "rewards/rejected": -0.722216784954071, "step": 1805 }, { "epoch": 0.46601441812564365, "grad_norm": 496.0, "learning_rate": 2.6699279093717816e-07, "logits/chosen": -2.20703125, "logits/rejected": -2.073046922683716, "logps/chosen": -254.1999969482422, "logps/rejected": -317.29998779296875, "loss": 0.5051, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.47832030057907104, "rewards/margins": 0.800097644329071, "rewards/rejected": -1.28125, "step": 1810 }, { "epoch": 0.4673017507723996, "grad_norm": 408.0, "learning_rate": 2.663491246138002e-07, "logits/chosen": -2.59375, "logits/rejected": -2.606250047683716, "logps/chosen": -275.0, "logps/rejected": -346.79998779296875, "loss": 0.482, "rewards/accuracies": 0.75, "rewards/chosen": -0.529101550579071, "rewards/margins": 0.9423828125, "rewards/rejected": -1.470312476158142, "step": 1815 }, { "epoch": 0.4685890834191555, "grad_norm": 282.0, "learning_rate": 2.6570545829042224e-07, "logits/chosen": -2.5718750953674316, "logits/rejected": -2.4781250953674316, "logps/chosen": -270.79998779296875, "logps/rejected": -331.20001220703125, "loss": 0.5266, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.27392578125, "rewards/margins": 0.7250000238418579, "rewards/rejected": -0.9984375238418579, "step": 1820 }, { "epoch": 0.4698764160659114, "grad_norm": 656.0, "learning_rate": 2.6506179196704423e-07, "logits/chosen": -2.5484375953674316, "logits/rejected": -2.5531249046325684, "logps/chosen": -368.3999938964844, "logps/rejected": -389.3999938964844, "loss": 0.5297, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6011718511581421, "rewards/margins": 1.0144531726837158, "rewards/rejected": -1.61328125, "step": 1825 }, { "epoch": 0.47116374871266736, "grad_norm": 314.0, "learning_rate": 2.6441812564366633e-07, 
"logits/chosen": -2.549999952316284, "logits/rejected": -2.659374952316284, "logps/chosen": -244.5, "logps/rejected": -312.0, "loss": 0.4977, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.540234386920929, "rewards/margins": 1.0499999523162842, "rewards/rejected": -1.588281273841858, "step": 1830 }, { "epoch": 0.47245108135942326, "grad_norm": 468.0, "learning_rate": 2.6377445932028837e-07, "logits/chosen": -2.496875047683716, "logits/rejected": -2.40625, "logps/chosen": -310.20001220703125, "logps/rejected": -350.0, "loss": 0.4854, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3841796815395355, "rewards/margins": 0.92578125, "rewards/rejected": -1.3117187023162842, "step": 1835 }, { "epoch": 0.4737384140061792, "grad_norm": 350.0, "learning_rate": 2.6313079299691036e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.6500000953674316, "logps/chosen": -327.20001220703125, "logps/rejected": -293.6000061035156, "loss": 0.4957, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11298827826976776, "rewards/margins": 0.826953113079071, "rewards/rejected": -0.939453125, "step": 1840 }, { "epoch": 0.4750257466529351, "grad_norm": 300.0, "learning_rate": 2.6248712667353246e-07, "logits/chosen": -2.65625, "logits/rejected": -2.7125000953674316, "logps/chosen": -336.3999938964844, "logps/rejected": -382.79998779296875, "loss": 0.3559, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4139160215854645, "rewards/margins": 1.384374976158142, "rewards/rejected": -1.7999999523162842, "step": 1845 }, { "epoch": 0.476313079299691, "grad_norm": 300.0, "learning_rate": 2.618434603501545e-07, "logits/chosen": -2.700000047683716, "logits/rejected": -2.746875047683716, "logps/chosen": -329.6000061035156, "logps/rejected": -372.79998779296875, "loss": 0.4656, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.12443847954273224, "rewards/margins": 1.0148437023162842, "rewards/rejected": -1.138671875, "step": 1850 }, { 
"epoch": 0.477600411946447, "grad_norm": 506.0, "learning_rate": 2.611997940267765e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.731250047683716, "logps/chosen": -249.5, "logps/rejected": -272.8999938964844, "loss": 0.5629, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.36713868379592896, "rewards/margins": 0.751269519329071, "rewards/rejected": -1.11767578125, "step": 1855 }, { "epoch": 0.4788877445932029, "grad_norm": 644.0, "learning_rate": 2.6055612770339853e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.5875000953674316, "logps/chosen": -203.3000030517578, "logps/rejected": -252.64999389648438, "loss": 0.5852, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.24394531548023224, "rewards/margins": 0.4900878965854645, "rewards/rejected": -0.7325683832168579, "step": 1860 }, { "epoch": 0.4801750772399588, "grad_norm": 532.0, "learning_rate": 2.599124613800206e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.628124952316284, "logps/chosen": -332.6000061035156, "logps/rejected": -379.6000061035156, "loss": 0.5594, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.00390625, "rewards/margins": 0.924609363079071, "rewards/rejected": -1.9249999523162842, "step": 1865 }, { "epoch": 0.48146240988671474, "grad_norm": 316.0, "learning_rate": 2.592687950566426e-07, "logits/chosen": -2.609375, "logits/rejected": -2.621875047683716, "logps/chosen": -338.3999938964844, "logps/rejected": -412.79998779296875, "loss": 0.3889, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7579590082168579, "rewards/margins": 1.2468750476837158, "rewards/rejected": -2.0023436546325684, "step": 1870 }, { "epoch": 0.48274974253347064, "grad_norm": 532.0, "learning_rate": 2.5862512873326466e-07, "logits/chosen": -2.59375, "logits/rejected": -2.768749952316284, "logps/chosen": -387.79998779296875, "logps/rejected": -349.6000061035156, "loss": 0.5305, "rewards/accuracies": 0.5625, "rewards/chosen": 
-0.3650573790073395, "rewards/margins": 0.6792968511581421, "rewards/rejected": -1.043096899986267, "step": 1875 }, { "epoch": 0.4840370751802266, "grad_norm": 360.0, "learning_rate": 2.579814624098867e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.453125, "logps/chosen": -192.39999389648438, "logps/rejected": -218.39999389648438, "loss": 0.5863, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4339843690395355, "rewards/margins": 0.578125, "rewards/rejected": -1.0126953125, "step": 1880 }, { "epoch": 0.4853244078269825, "grad_norm": 270.0, "learning_rate": 2.5733779608650874e-07, "logits/chosen": -2.543750047683716, "logits/rejected": -2.512500047683716, "logps/chosen": -346.20001220703125, "logps/rejected": -333.0, "loss": 0.5506, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.24648436903953552, "rewards/margins": 0.7342773675918579, "rewards/rejected": -0.9818359613418579, "step": 1885 }, { "epoch": 0.4866117404737384, "grad_norm": 676.0, "learning_rate": 2.566941297631308e-07, "logits/chosen": -2.421875, "logits/rejected": -2.4593749046325684, "logps/chosen": -304.6000061035156, "logps/rejected": -382.0, "loss": 0.4582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.90625, "rewards/margins": 1.2625000476837158, "rewards/rejected": -2.168750047683716, "step": 1890 }, { "epoch": 0.48789907312049435, "grad_norm": 268.0, "learning_rate": 2.5605046343975283e-07, "logits/chosen": -2.1703124046325684, "logits/rejected": -2.778125047683716, "logps/chosen": -190.8000030517578, "logps/rejected": -227.8000030517578, "loss": 0.5863, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.3798828125, "rewards/margins": 0.515917956829071, "rewards/rejected": -0.8964508175849915, "step": 1895 }, { "epoch": 0.48918640576725025, "grad_norm": 233.0, "learning_rate": 2.5540679711637487e-07, "logits/chosen": -2.7093749046325684, "logits/rejected": -2.706249952316284, "logps/chosen": -316.8999938964844, 
"logps/rejected": -318.29998779296875, "loss": 0.5766, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.798632800579071, "rewards/margins": 0.57177734375, "rewards/rejected": -1.3718750476837158, "step": 1900 }, { "epoch": 0.49047373841400616, "grad_norm": 564.0, "learning_rate": 2.5476313079299686e-07, "logits/chosen": -2.565624952316284, "logits/rejected": -2.518749952316284, "logps/chosen": -303.3999938964844, "logps/rejected": -319.20001220703125, "loss": 0.5941, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4140625, "rewards/margins": 0.6026366949081421, "rewards/rejected": -1.01611328125, "step": 1905 }, { "epoch": 0.4917610710607621, "grad_norm": 548.0, "learning_rate": 2.5411946446961896e-07, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.4375, "logps/chosen": -329.20001220703125, "logps/rejected": -405.20001220703125, "loss": 0.4316, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7890625, "rewards/margins": 1.212499976158142, "rewards/rejected": -1.998437523841858, "step": 1910 }, { "epoch": 0.493048403707518, "grad_norm": 458.0, "learning_rate": 2.53475798146241e-07, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.5875000953674316, "logps/chosen": -291.20001220703125, "logps/rejected": -385.6000061035156, "loss": 0.4486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5162597894668579, "rewards/margins": 1.196874976158142, "rewards/rejected": -1.712499976158142, "step": 1915 }, { "epoch": 0.49433573635427397, "grad_norm": 290.0, "learning_rate": 2.52832131822863e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.575000047683716, "logps/chosen": -330.3999938964844, "logps/rejected": -357.20001220703125, "loss": 0.4578, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.543408215045929, "rewards/margins": 1.0304687023162842, "rewards/rejected": -1.572656273841858, "step": 1920 }, { "epoch": 0.49562306900102987, "grad_norm": 840.0, "learning_rate": 
2.521884654994851e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.71875, "logps/chosen": -302.0, "logps/rejected": -319.6000061035156, "loss": 0.5309, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.740185558795929, "rewards/margins": 0.8921874761581421, "rewards/rejected": -1.6328125, "step": 1925 }, { "epoch": 0.49691040164778577, "grad_norm": 332.0, "learning_rate": 2.5154479917610713e-07, "logits/chosen": -2.440624952316284, "logits/rejected": -2.65625, "logps/chosen": -323.20001220703125, "logps/rejected": -323.6000061035156, "loss": 0.6566, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8861328363418579, "rewards/margins": 0.776562511920929, "rewards/rejected": -1.662500023841858, "step": 1930 }, { "epoch": 0.4981977342945417, "grad_norm": 93.0, "learning_rate": 2.509011328527291e-07, "logits/chosen": NaN, "logits/rejected": -2.659374952316284, "logps/chosen": -247.1999969482422, "logps/rejected": -235.02499389648438, "loss": 0.5684, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.36259764432907104, "rewards/margins": 0.658251941204071, "rewards/rejected": -1.019921898841858, "step": 1935 }, { "epoch": 0.49948506694129763, "grad_norm": 724.0, "learning_rate": 2.5025746652935116e-07, "logits/chosen": -2.5374999046325684, "logits/rejected": -2.65625, "logps/chosen": -280.20001220703125, "logps/rejected": -321.79998779296875, "loss": 0.5402, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3802734315395355, "rewards/margins": 0.7216796875, "rewards/rejected": -1.103515625, "step": 1940 }, { "epoch": 0.5007723995880535, "grad_norm": 178.0, "learning_rate": 2.496138002059732e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.5062499046325684, "logps/chosen": -263.0, "logps/rejected": -267.20001220703125, "loss": 0.5859, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4427734315395355, "rewards/margins": 0.560546875, "rewards/rejected": -1.0011718273162842, "step": 
1945 }, { "epoch": 0.5020597322348095, "grad_norm": 488.0, "learning_rate": 2.4897013388259525e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.643749952316284, "logps/chosen": -385.6000061035156, "logps/rejected": -398.3999938964844, "loss": 0.5145, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6246093511581421, "rewards/margins": 0.8296874761581421, "rewards/rejected": -1.451562523841858, "step": 1950 }, { "epoch": 0.5033470648815654, "grad_norm": 390.0, "learning_rate": 2.483264675592173e-07, "logits/chosen": -2.3968749046325684, "logits/rejected": -2.424999952316284, "logps/chosen": -291.0, "logps/rejected": -328.3999938964844, "loss": 0.6, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7562500238418579, "rewards/margins": 0.6871093511581421, "rewards/rejected": -1.442968726158142, "step": 1955 }, { "epoch": 0.5046343975283213, "grad_norm": 568.0, "learning_rate": 2.4768280123583933e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.5625, "logps/chosen": -326.79998779296875, "logps/rejected": -450.79998779296875, "loss": 0.4303, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8216308355331421, "rewards/margins": 1.4246094226837158, "rewards/rejected": -2.245312452316284, "step": 1960 }, { "epoch": 0.5059217301750772, "grad_norm": 600.0, "learning_rate": 2.470391349124614e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.6624999046325684, "logps/chosen": -271.3999938964844, "logps/rejected": -273.20001220703125, "loss": 0.7164, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.4625244140625, "rewards/margins": 0.23847655951976776, "rewards/rejected": -0.700976550579071, "step": 1965 }, { "epoch": 0.5072090628218332, "grad_norm": 428.0, "learning_rate": 2.463954685890834e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.643749952316284, "logps/chosen": -364.0, "logps/rejected": -349.6000061035156, "loss": 0.568, "rewards/accuracies": 0.5874999761581421, 
"rewards/chosen": -0.25605469942092896, "rewards/margins": 0.579882800579071, "rewards/rejected": -0.835156261920929, "step": 1970 }, { "epoch": 0.508496395468589, "grad_norm": 490.0, "learning_rate": 2.4575180226570546e-07, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.674999952316284, "logps/chosen": -328.79998779296875, "logps/rejected": -352.79998779296875, "loss": 0.5797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.49531251192092896, "rewards/margins": 0.637890636920929, "rewards/rejected": -1.1335937976837158, "step": 1975 }, { "epoch": 0.509783728115345, "grad_norm": 336.0, "learning_rate": 2.451081359423275e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.590625047683716, "logps/chosen": -319.79998779296875, "logps/rejected": -423.6000061035156, "loss": 0.5352, "rewards/accuracies": 0.75, "rewards/chosen": -0.681445300579071, "rewards/margins": 0.8824218511581421, "rewards/rejected": -1.564843773841858, "step": 1980 }, { "epoch": 0.511071060762101, "grad_norm": 568.0, "learning_rate": 2.4446446961894955e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.6875, "logps/chosen": -297.0, "logps/rejected": -377.3999938964844, "loss": 0.5348, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.819995105266571, "rewards/margins": 0.753125011920929, "rewards/rejected": -1.572265625, "step": 1985 }, { "epoch": 0.5123583934088568, "grad_norm": 510.0, "learning_rate": 2.4382080329557153e-07, "logits/chosen": -2.515625, "logits/rejected": -2.317187547683716, "logps/chosen": -341.0, "logps/rejected": -333.20001220703125, "loss": 0.4598, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.515332043170929, "rewards/margins": 0.9789062738418579, "rewards/rejected": -1.4929687976837158, "step": 1990 }, { "epoch": 0.5136457260556128, "grad_norm": 454.0, "learning_rate": 2.4317713697219363e-07, "logits/chosen": -2.7406249046325684, "logits/rejected": -2.8499999046325684, "logps/chosen": 
-283.79998779296875, "logps/rejected": -349.79998779296875, "loss": 0.643, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6470702886581421, "rewards/margins": 0.3970703184604645, "rewards/rejected": -1.04296875, "step": 1995 }, { "epoch": 0.5149330587023687, "grad_norm": 716.0, "learning_rate": 2.425334706488156e-07, "logits/chosen": -2.7750000953674316, "logits/rejected": -2.481250047683716, "logps/chosen": -282.79998779296875, "logps/rejected": -281.79998779296875, "loss": 0.5535, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.21826171875, "rewards/margins": 0.6778320074081421, "rewards/rejected": -0.8934570550918579, "step": 2000 }, { "epoch": 0.5162203913491246, "grad_norm": 668.0, "learning_rate": 2.4188980432543766e-07, "logits/chosen": -2.8343749046325684, "logits/rejected": -2.796875, "logps/chosen": -336.0, "logps/rejected": -346.0, "loss": 0.6234, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.599609375, "rewards/margins": 0.5661255121231079, "rewards/rejected": -1.1648437976837158, "step": 2005 }, { "epoch": 0.5175077239958805, "grad_norm": 292.0, "learning_rate": 2.412461380020597e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.621875047683716, "logps/chosen": -367.6000061035156, "logps/rejected": -374.0, "loss": 0.4941, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6953125, "rewards/margins": 0.8960937261581421, "rewards/rejected": -1.588281273841858, "step": 2010 }, { "epoch": 0.5187950566426365, "grad_norm": 672.0, "learning_rate": 2.4060247167868175e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.450000047683716, "logps/chosen": -269.79998779296875, "logps/rejected": -301.0, "loss": 0.6457, "rewards/accuracies": 0.5, "rewards/chosen": -0.47929686307907104, "rewards/margins": 0.541015625, "rewards/rejected": -1.020898461341858, "step": 2015 }, { "epoch": 0.5200823892893924, "grad_norm": 448.0, "learning_rate": 2.399588053553038e-07, "logits/chosen": 
-2.6187500953674316, "logits/rejected": -2.3671875, "logps/chosen": -249.8000030517578, "logps/rejected": -216.5, "loss": 0.6906, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.11014404147863388, "rewards/margins": 0.12255859375, "rewards/rejected": -0.23273925483226776, "step": 2020 }, { "epoch": 0.5213697219361483, "grad_norm": 536.0, "learning_rate": 2.3931513903192583e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.674999952316284, "logps/chosen": -323.0, "logps/rejected": -372.79998779296875, "loss": 0.3793, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.622021496295929, "rewards/margins": 1.2585937976837158, "rewards/rejected": -1.87890625, "step": 2025 }, { "epoch": 0.5226570545829042, "grad_norm": 406.0, "learning_rate": 2.386714727085479e-07, "logits/chosen": -2.721874952316284, "logits/rejected": -2.6656250953674316, "logps/chosen": -359.6000061035156, "logps/rejected": -391.79998779296875, "loss": 0.4406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5478515625, "rewards/margins": 1.025781273841858, "rewards/rejected": -1.5730469226837158, "step": 2030 }, { "epoch": 0.5239443872296602, "grad_norm": 556.0, "learning_rate": 2.3802780638516992e-07, "logits/chosen": -2.609375, "logits/rejected": -2.5531249046325684, "logps/chosen": -262.20001220703125, "logps/rejected": -260.6000061035156, "loss": 0.5559, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5130859613418579, "rewards/margins": 0.6276000738143921, "rewards/rejected": -1.1408202648162842, "step": 2035 }, { "epoch": 0.525231719876416, "grad_norm": 424.0, "learning_rate": 2.3738414006179194e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.301562547683716, "logps/chosen": -296.79998779296875, "logps/rejected": -342.20001220703125, "loss": 0.602, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.606249988079071, "rewards/margins": 0.6880859136581421, "rewards/rejected": -1.2917969226837158, "step": 
2040 }, { "epoch": 0.526519052523172, "grad_norm": 490.0, "learning_rate": 2.36740473738414e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.5843749046325684, "logps/chosen": -362.0, "logps/rejected": -373.20001220703125, "loss": 0.4543, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.537353515625, "rewards/margins": 0.975781261920929, "rewards/rejected": -1.5125000476837158, "step": 2045 }, { "epoch": 0.527806385169928, "grad_norm": 324.0, "learning_rate": 2.3609680741503605e-07, "logits/chosen": -2.6875, "logits/rejected": -2.325000047683716, "logps/chosen": -277.5, "logps/rejected": -280.79998779296875, "loss": 0.5215, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.36311644315719604, "rewards/margins": 0.69921875, "rewards/rejected": -1.062890648841858, "step": 2050 }, { "epoch": 0.5290937178166838, "grad_norm": 420.0, "learning_rate": 2.3545314109165806e-07, "logits/chosen": -2.5921874046325684, "logits/rejected": -2.5015625953674316, "logps/chosen": -299.20001220703125, "logps/rejected": -314.6000061035156, "loss": 0.5805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6024414300918579, "rewards/margins": 0.8373047113418579, "rewards/rejected": -1.4396483898162842, "step": 2055 }, { "epoch": 0.5303810504634398, "grad_norm": 536.0, "learning_rate": 2.348094747682801e-07, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.620312452316284, "logps/chosen": -304.6000061035156, "logps/rejected": -331.3999938964844, "loss": 0.4318, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.508593738079071, "rewards/margins": 1.1339843273162842, "rewards/rejected": -1.642968773841858, "step": 2060 }, { "epoch": 0.5316683831101957, "grad_norm": 420.0, "learning_rate": 2.3416580844490218e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.6624999046325684, "logps/chosen": -332.79998779296875, "logps/rejected": -371.20001220703125, "loss": 0.5535, "rewards/accuracies": 0.612500011920929, 
"rewards/chosen": -0.5326172113418579, "rewards/margins": 0.7259765863418579, "rewards/rejected": -1.2585937976837158, "step": 2065 }, { "epoch": 0.5329557157569516, "grad_norm": 620.0, "learning_rate": 2.335221421215242e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.706249952316284, "logps/chosen": -290.79998779296875, "logps/rejected": -351.79998779296875, "loss": 0.4621, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6615234613418579, "rewards/margins": 0.9609375, "rewards/rejected": -1.6228516101837158, "step": 2070 }, { "epoch": 0.5342430484037075, "grad_norm": 528.0, "learning_rate": 2.3287847579814623e-07, "logits/chosen": -2.8968749046325684, "logits/rejected": -2.6859374046325684, "logps/chosen": -240.10000610351562, "logps/rejected": -226.60000610351562, "loss": 0.6414, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.17319336533546448, "rewards/margins": 0.30615234375, "rewards/rejected": -0.4789062440395355, "step": 2075 }, { "epoch": 0.5355303810504635, "grad_norm": 436.0, "learning_rate": 2.3223480947476825e-07, "logits/chosen": -2.596874952316284, "logits/rejected": -2.465625047683716, "logps/chosen": -370.79998779296875, "logps/rejected": -327.0, "loss": 0.5641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.43867188692092896, "rewards/margins": 0.527539074420929, "rewards/rejected": -0.967968761920929, "step": 2080 }, { "epoch": 0.5368177136972193, "grad_norm": 632.0, "learning_rate": 2.315911431513903e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.453125, "logps/chosen": -338.0, "logps/rejected": -331.6000061035156, "loss": 0.6547, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.534375011920929, "rewards/margins": 0.3656249940395355, "rewards/rejected": -0.900390625, "step": 2085 }, { "epoch": 0.5381050463439753, "grad_norm": 304.0, "learning_rate": 2.3094747682801236e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.378124952316284, 
"logps/chosen": -266.20001220703125, "logps/rejected": -252.39999389648438, "loss": 0.5777, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.36127930879592896, "rewards/margins": 0.5462890863418579, "rewards/rejected": -0.908203125, "step": 2090 }, { "epoch": 0.5393923789907312, "grad_norm": 510.0, "learning_rate": 2.3030381050463438e-07, "logits/chosen": -2.684375047683716, "logits/rejected": -2.609375, "logps/chosen": -300.20001220703125, "logps/rejected": -347.6000061035156, "loss": 0.5648, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.44599610567092896, "rewards/margins": 0.7416015863418579, "rewards/rejected": -1.1906249523162842, "step": 2095 }, { "epoch": 0.5406797116374872, "grad_norm": 466.0, "learning_rate": 2.2966014418125642e-07, "logits/chosen": -2.721874952316284, "logits/rejected": -2.7437500953674316, "logps/chosen": -307.0, "logps/rejected": -322.6000061035156, "loss": 0.5906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.586132824420929, "rewards/margins": 0.578906238079071, "rewards/rejected": -1.1640625, "step": 2100 }, { "epoch": 0.541967044284243, "grad_norm": 588.0, "learning_rate": 2.2901647785787846e-07, "logits/chosen": -2.6500000953674316, "logits/rejected": -2.659374952316284, "logps/chosen": -356.0, "logps/rejected": -416.0, "loss": 0.4496, "rewards/accuracies": 0.75, "rewards/chosen": -0.7757812738418579, "rewards/margins": 1.1006348133087158, "rewards/rejected": -1.879296898841858, "step": 2105 }, { "epoch": 0.543254376930999, "grad_norm": 552.0, "learning_rate": 2.283728115345005e-07, "logits/chosen": -2.7093749046325684, "logits/rejected": -2.7437500953674316, "logps/chosen": -315.20001220703125, "logps/rejected": -287.79998779296875, "loss": 0.5836, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.26520997285842896, "rewards/margins": 0.588671863079071, "rewards/rejected": -0.8539062738418579, "step": 2110 }, { "epoch": 0.5445417095777549, "grad_norm": 612.0, "learning_rate": 
2.2772914521112255e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.5687499046325684, "logps/chosen": -331.20001220703125, "logps/rejected": -370.3999938964844, "loss": 0.4898, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36113280057907104, "rewards/margins": 0.9839843511581421, "rewards/rejected": -1.346093773841858, "step": 2115 }, { "epoch": 0.5458290422245108, "grad_norm": 189.0, "learning_rate": 2.2708547888774457e-07, "logits/chosen": -2.734375, "logits/rejected": -2.762500047683716, "logps/chosen": -306.20001220703125, "logps/rejected": -334.0, "loss": 0.5543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.571362316608429, "rewards/margins": 0.8062499761581421, "rewards/rejected": -1.3781249523162842, "step": 2120 }, { "epoch": 0.5471163748712667, "grad_norm": 588.0, "learning_rate": 2.264418125643666e-07, "logits/chosen": -2.75, "logits/rejected": -2.6968750953674316, "logps/chosen": -349.79998779296875, "logps/rejected": -385.20001220703125, "loss": 0.4695, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3628906309604645, "rewards/margins": 0.8770507574081421, "rewards/rejected": -1.2412109375, "step": 2125 }, { "epoch": 0.5484037075180227, "grad_norm": 624.0, "learning_rate": 2.2579814624098868e-07, "logits/chosen": -2.4000000953674316, "logits/rejected": -2.3499999046325684, "logps/chosen": -335.6000061035156, "logps/rejected": -305.20001220703125, "loss": 0.5891, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.46992188692092896, "rewards/margins": 0.6634765863418579, "rewards/rejected": -1.1335937976837158, "step": 2130 }, { "epoch": 0.5496910401647785, "grad_norm": 360.0, "learning_rate": 2.251544799176107e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.628124952316284, "logps/chosen": -351.20001220703125, "logps/rejected": -376.79998779296875, "loss": 0.5242, "rewards/accuracies": 0.6875, "rewards/chosen": -0.694775402545929, "rewards/margins": 0.916015625, 
"rewards/rejected": -1.609375, "step": 2135 }, { "epoch": 0.5509783728115345, "grad_norm": 322.0, "learning_rate": 2.2451081359423274e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.6031250953674316, "logps/chosen": -261.0, "logps/rejected": -351.6000061035156, "loss": 0.4707, "rewards/accuracies": 0.625, "rewards/chosen": -0.5244140625, "rewards/margins": 1.00830078125, "rewards/rejected": -1.530859351158142, "step": 2140 }, { "epoch": 0.5522657054582905, "grad_norm": 516.0, "learning_rate": 2.2386714727085478e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.703125, "logps/chosen": -350.20001220703125, "logps/rejected": -318.79998779296875, "loss": 0.5813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.373046875, "rewards/margins": 0.5, "rewards/rejected": -0.8726562261581421, "step": 2145 }, { "epoch": 0.5535530381050463, "grad_norm": 628.0, "learning_rate": 2.2322348094747682e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.734375, "logps/chosen": -300.3999938964844, "logps/rejected": -376.20001220703125, "loss": 0.484, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.41191405057907104, "rewards/margins": 1.07421875, "rewards/rejected": -1.4874999523162842, "step": 2150 }, { "epoch": 0.5548403707518023, "grad_norm": 688.0, "learning_rate": 2.2257981462409886e-07, "logits/chosen": -2.612499952316284, "logits/rejected": -2.5390625, "logps/chosen": -354.79998779296875, "logps/rejected": -383.3999938964844, "loss": 0.5832, "rewards/accuracies": 0.625, "rewards/chosen": -0.9765625, "rewards/margins": 0.786328136920929, "rewards/rejected": -1.7628905773162842, "step": 2155 }, { "epoch": 0.5561277033985582, "grad_norm": 576.0, "learning_rate": 2.219361483007209e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.5015625953674316, "logps/chosen": -324.0, "logps/rejected": -380.0, "loss": 0.502, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33232420682907104, "rewards/margins": 
0.875781238079071, "rewards/rejected": -1.205468773841858, "step": 2160 }, { "epoch": 0.5574150360453141, "grad_norm": 888.0, "learning_rate": 2.2129248197734292e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.53125, "logps/chosen": -347.79998779296875, "logps/rejected": -353.0, "loss": 0.6707, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.770922839641571, "rewards/margins": 0.7535156011581421, "rewards/rejected": -1.5261719226837158, "step": 2165 }, { "epoch": 0.55870236869207, "grad_norm": 684.0, "learning_rate": 2.20648815653965e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.65625, "logps/chosen": -285.6000061035156, "logps/rejected": -300.20001220703125, "loss": 0.5277, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4578613340854645, "rewards/margins": 0.742382824420929, "rewards/rejected": -1.199609398841858, "step": 2170 }, { "epoch": 0.559989701338826, "grad_norm": 398.0, "learning_rate": 2.20005149330587e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.487499952316284, "logps/chosen": -306.6000061035156, "logps/rejected": -333.20001220703125, "loss": 0.5191, "rewards/accuracies": 0.6875, "rewards/chosen": -0.746874988079071, "rewards/margins": 1.076562523841858, "rewards/rejected": -1.825781226158142, "step": 2175 }, { "epoch": 0.5612770339855818, "grad_norm": 464.0, "learning_rate": 2.1936148300720905e-07, "logits/chosen": -2.596874952316284, "logits/rejected": -2.628124952316284, "logps/chosen": -293.6000061035156, "logps/rejected": -364.0, "loss": 0.4535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.564453125, "rewards/margins": 1.181249976158142, "rewards/rejected": -1.744531273841858, "step": 2180 }, { "epoch": 0.5625643666323378, "grad_norm": 892.0, "learning_rate": 2.187178166838311e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.653125047683716, "logps/chosen": -265.79998779296875, "logps/rejected": -356.0, "loss": 0.5023, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5074218511581421, "rewards/margins": 1.0984375476837158, "rewards/rejected": -1.607031226158142, "step": 2185 }, { "epoch": 0.5638516992790937, "grad_norm": 920.0, "learning_rate": 2.1807415036045314e-07, "logits/chosen": -2.5609374046325684, "logits/rejected": -2.479687452316284, "logps/chosen": -322.79998779296875, "logps/rejected": -378.3999938964844, "loss": 0.4902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.48359376192092896, "rewards/margins": 0.9527343511581421, "rewards/rejected": -1.439062476158142, "step": 2190 }, { "epoch": 0.5651390319258497, "grad_norm": 420.0, "learning_rate": 2.1743048403707518e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": null, "logps/chosen": -285.6000061035156, "logps/rejected": -311.6000061035156, "loss": 0.548, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.49321287870407104, "rewards/margins": 0.7130371332168579, "rewards/rejected": -1.206640601158142, "step": 2195 }, { "epoch": 0.5664263645726055, "grad_norm": 482.0, "learning_rate": 2.1678681771369722e-07, "logits/chosen": -2.59375, "logits/rejected": -2.512500047683716, "logps/chosen": -321.6000061035156, "logps/rejected": -376.0, "loss": 0.5582, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.781054675579071, "rewards/margins": 0.7953125238418579, "rewards/rejected": -1.578125, "step": 2200 }, { "epoch": 0.5677136972193615, "grad_norm": 322.0, "learning_rate": 2.1614315139031924e-07, "logits/chosen": -2.762500047683716, "logits/rejected": -2.7593750953674316, "logps/chosen": -313.20001220703125, "logps/rejected": -347.6000061035156, "loss": 0.6445, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.23652343451976776, "rewards/margins": 0.42363280057907104, "rewards/rejected": -0.658984363079071, "step": 2205 }, { "epoch": 0.5690010298661174, "grad_norm": 756.0, "learning_rate": 2.154994850669413e-07, "logits/chosen": -2.5234375, "logits/rejected": 
-2.512500047683716, "logps/chosen": -316.3999938964844, "logps/rejected": -339.0, "loss": 0.5582, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6444336175918579, "rewards/margins": 0.8277343511581421, "rewards/rejected": -1.4724609851837158, "step": 2210 }, { "epoch": 0.5702883625128733, "grad_norm": 564.0, "learning_rate": 2.1485581874356332e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.628124952316284, "logps/chosen": -300.3999938964844, "logps/rejected": -359.20001220703125, "loss": 0.4785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.33867186307907104, "rewards/margins": 0.9453125, "rewards/rejected": -1.2843749523162842, "step": 2215 }, { "epoch": 0.5715756951596292, "grad_norm": 474.0, "learning_rate": 2.1421215242018537e-07, "logits/chosen": -2.3218750953674316, "logits/rejected": -2.4375, "logps/chosen": -277.0, "logps/rejected": -344.79998779296875, "loss": 0.5184, "rewards/accuracies": 0.6875, "rewards/chosen": -0.701733410358429, "rewards/margins": 0.970703125, "rewards/rejected": -1.675390601158142, "step": 2220 }, { "epoch": 0.5728630278063852, "grad_norm": 490.0, "learning_rate": 2.135684860968074e-07, "logits/chosen": -2.721874952316284, "logits/rejected": -2.546875, "logps/chosen": -272.79998779296875, "logps/rejected": -358.3999938964844, "loss": 0.5461, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3916259706020355, "rewards/margins": 0.7314453125, "rewards/rejected": -1.124414086341858, "step": 2225 }, { "epoch": 0.574150360453141, "grad_norm": 560.0, "learning_rate": 2.1292481977342942e-07, "logits/chosen": -2.5843749046325684, "logits/rejected": -2.487499952316284, "logps/chosen": -304.20001220703125, "logps/rejected": -331.20001220703125, "loss": 0.4852, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2662109434604645, "rewards/margins": 0.8968750238418579, "rewards/rejected": -1.162500023841858, "step": 2230 }, { "epoch": 0.575437693099897, "grad_norm": 572.0, 
"learning_rate": 2.122811534500515e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.621875047683716, "logps/chosen": -328.20001220703125, "logps/rejected": -315.79998779296875, "loss": 0.575, "rewards/accuracies": 0.625, "rewards/chosen": -0.3343872129917145, "rewards/margins": 0.514453113079071, "rewards/rejected": -0.848437488079071, "step": 2235 }, { "epoch": 0.576725025746653, "grad_norm": 524.0, "learning_rate": 2.1163748712667354e-07, "logits/chosen": -2.856250047683716, "logits/rejected": -2.6500000953674316, "logps/chosen": -251.3000030517578, "logps/rejected": -240.10000610351562, "loss": 0.6125, "rewards/accuracies": 0.4375, "rewards/chosen": -0.27104490995407104, "rewards/margins": 0.3282714784145355, "rewards/rejected": -0.6001952886581421, "step": 2240 }, { "epoch": 0.5780123583934088, "grad_norm": 564.0, "learning_rate": 2.1099382080329555e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.715625047683716, "logps/chosen": -311.79998779296875, "logps/rejected": -327.70001220703125, "loss": 0.5809, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5155273675918579, "rewards/margins": 0.8690429925918579, "rewards/rejected": -1.3820312023162842, "step": 2245 }, { "epoch": 0.5792996910401648, "grad_norm": 304.0, "learning_rate": 2.103501544799176e-07, "logits/chosen": -2.4609375, "logits/rejected": -2.7906250953674316, "logps/chosen": -288.6000061035156, "logps/rejected": -377.79998779296875, "loss": 0.5973, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4786926209926605, "rewards/margins": 0.517578125, "rewards/rejected": -0.99609375, "step": 2250 }, { "epoch": 0.5805870236869207, "grad_norm": 408.0, "learning_rate": 2.0970648815653964e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.606250047683716, "logps/chosen": -382.0, "logps/rejected": -352.20001220703125, "loss": 0.5297, "rewards/accuracies": 0.625, "rewards/chosen": -0.41523438692092896, "rewards/margins": 0.6910156011581421, 
"rewards/rejected": -1.105859398841858, "step": 2255 }, { "epoch": 0.5818743563336766, "grad_norm": 620.0, "learning_rate": 2.0906282183316168e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.534374952316284, "logps/chosen": -286.29998779296875, "logps/rejected": -314.79998779296875, "loss": 0.5281, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4403076171875, "rewards/margins": 0.8604491949081421, "rewards/rejected": -1.30078125, "step": 2260 }, { "epoch": 0.5831616889804325, "grad_norm": 568.0, "learning_rate": 2.0841915550978372e-07, "logits/chosen": -2.418750047683716, "logits/rejected": -2.715625047683716, "logps/chosen": -284.0, "logps/rejected": -275.29998779296875, "loss": 0.6344, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.3591552674770355, "rewards/margins": 0.34923094511032104, "rewards/rejected": -0.708789050579071, "step": 2265 }, { "epoch": 0.5844490216271885, "grad_norm": 516.0, "learning_rate": 2.0777548918640574e-07, "logits/chosen": -2.424999952316284, "logits/rejected": -2.5062499046325684, "logps/chosen": -295.0, "logps/rejected": -356.20001220703125, "loss": 0.4984, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.767773449420929, "rewards/margins": 1.095703125, "rewards/rejected": -1.860937476158142, "step": 2270 }, { "epoch": 0.5857363542739444, "grad_norm": 334.0, "learning_rate": 2.071318228630278e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.684375047683716, "logps/chosen": -325.20001220703125, "logps/rejected": -359.79998779296875, "loss": 0.5418, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2608398497104645, "rewards/margins": 0.6636718511581421, "rewards/rejected": -0.923828125, "step": 2275 }, { "epoch": 0.5870236869207003, "grad_norm": 676.0, "learning_rate": 2.0648815653964985e-07, "logits/chosen": -2.2203125953674316, "logits/rejected": -2.331249952316284, "logps/chosen": -302.79998779296875, "logps/rejected": -364.0, "loss": 0.468, 
"rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2537109851837158, "rewards/margins": 1.1375000476837158, "rewards/rejected": -2.39453125, "step": 2280 }, { "epoch": 0.5883110195674562, "grad_norm": 408.0, "learning_rate": 2.0584449021627187e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.6187500953674316, "logps/chosen": -321.79998779296875, "logps/rejected": -320.0, "loss": 0.5035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3365722596645355, "rewards/margins": 0.7587890625, "rewards/rejected": -1.0964844226837158, "step": 2285 }, { "epoch": 0.5895983522142122, "grad_norm": 624.0, "learning_rate": 2.052008238928939e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.715625047683716, "logps/chosen": -314.0, "logps/rejected": -367.0, "loss": 0.5355, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4595703184604645, "rewards/margins": 0.712109386920929, "rewards/rejected": -1.171289086341858, "step": 2290 }, { "epoch": 0.590885684860968, "grad_norm": 452.0, "learning_rate": 2.0455715756951595e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.6875, "logps/chosen": -282.79998779296875, "logps/rejected": -336.0, "loss": 0.5684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.47832030057907104, "rewards/margins": 0.675976574420929, "rewards/rejected": -1.155615210533142, "step": 2295 }, { "epoch": 0.592173017507724, "grad_norm": 628.0, "learning_rate": 2.03913491246138e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.6312499046325684, "logps/chosen": -309.6000061035156, "logps/rejected": -331.20001220703125, "loss": 0.5193, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.28632813692092896, "rewards/margins": 0.6546875238418579, "rewards/rejected": -0.9390624761581421, "step": 2300 }, { "epoch": 0.59346035015448, "grad_norm": 370.0, "learning_rate": 2.0326982492276004e-07, "logits/chosen": -2.7281250953674316, "logits/rejected": 
-2.621875047683716, "logps/chosen": -350.3999938964844, "logps/rejected": -393.20001220703125, "loss": 0.5145, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4366699159145355, "rewards/margins": 0.740771472454071, "rewards/rejected": -1.177148461341858, "step": 2305 }, { "epoch": 0.5947476828012358, "grad_norm": 512.0, "learning_rate": 2.0262615859938205e-07, "logits/chosen": -2.5718750953674316, "logits/rejected": -2.6312499046325684, "logps/chosen": -388.3999938964844, "logps/rejected": -462.79998779296875, "loss": 0.4121, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6226562261581421, "rewards/margins": 1.23046875, "rewards/rejected": -1.8562500476837158, "step": 2310 }, { "epoch": 0.5960350154479918, "grad_norm": 402.0, "learning_rate": 2.0198249227600412e-07, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.65625, "logps/chosen": -284.6000061035156, "logps/rejected": -283.3999938964844, "loss": 0.5836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.21845702826976776, "rewards/margins": 0.502734363079071, "rewards/rejected": -0.7203124761581421, "step": 2315 }, { "epoch": 0.5973223480947477, "grad_norm": 508.0, "learning_rate": 2.0133882595262617e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.237499952316284, "logps/chosen": -329.79998779296875, "logps/rejected": -368.0, "loss": 0.5426, "rewards/accuracies": 0.6875, "rewards/chosen": -0.70166015625, "rewards/margins": 0.7835937738418579, "rewards/rejected": -1.4851562976837158, "step": 2320 }, { "epoch": 0.5986096807415036, "grad_norm": 560.0, "learning_rate": 2.0069515962924818e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.575000047683716, "logps/chosen": -255.1999969482422, "logps/rejected": -355.79998779296875, "loss": 0.4926, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5751953125, "rewards/margins": 0.9947265386581421, "rewards/rejected": -1.570703148841858, "step": 2325 }, { "epoch": 
0.5998970133882595, "grad_norm": 652.0, "learning_rate": 2.0005149330587023e-07, "logits/chosen": -2.6781249046325684, "logits/rejected": -2.53125, "logps/chosen": -200.39999389648438, "logps/rejected": -202.1999969482422, "loss": 0.634, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.06672362983226776, "rewards/margins": 0.36591798067092896, "rewards/rejected": -0.4326171875, "step": 2330 }, { "epoch": 0.6011843460350155, "grad_norm": 362.0, "learning_rate": 1.9940782698249227e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.59375, "logps/chosen": -309.20001220703125, "logps/rejected": -412.0, "loss": 0.4547, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4061645567417145, "rewards/margins": 1.2234375476837158, "rewards/rejected": -1.62890625, "step": 2335 }, { "epoch": 0.6024716786817713, "grad_norm": 524.0, "learning_rate": 1.987641606591143e-07, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.4937500953674316, "logps/chosen": -320.79998779296875, "logps/rejected": -366.0, "loss": 0.5805, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.35966795682907104, "rewards/margins": 0.520703136920929, "rewards/rejected": -0.8804687261581421, "step": 2340 }, { "epoch": 0.6037590113285273, "grad_norm": 370.0, "learning_rate": 1.9812049433573635e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.5062499046325684, "logps/chosen": -300.79998779296875, "logps/rejected": -327.3999938964844, "loss": 0.49, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4873046875, "rewards/margins": 0.8724120855331421, "rewards/rejected": -1.3591797351837158, "step": 2345 }, { "epoch": 0.6050463439752832, "grad_norm": 540.0, "learning_rate": 1.9747682801235837e-07, "logits/chosen": -2.4781250953674316, "logits/rejected": -2.1078124046325684, "logps/chosen": -253.1999969482422, "logps/rejected": -322.3999938964844, "loss": 0.5465, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 
-0.6568359136581421, "rewards/margins": 0.9867187738418579, "rewards/rejected": -1.6447265148162842, "step": 2350 }, { "epoch": 0.6063336766220392, "grad_norm": 446.0, "learning_rate": 1.9683316168898044e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.578125, "logps/chosen": -317.20001220703125, "logps/rejected": -339.20001220703125, "loss": 0.5125, "rewards/accuracies": 0.625, "rewards/chosen": -0.3324218690395355, "rewards/margins": 0.9359375238418579, "rewards/rejected": -1.267968773841858, "step": 2355 }, { "epoch": 0.607621009268795, "grad_norm": 884.0, "learning_rate": 1.9618949536560248e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.5562500953674316, "logps/chosen": -328.0, "logps/rejected": -429.20001220703125, "loss": 0.4437, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.953125, "rewards/margins": 1.3046875, "rewards/rejected": -2.253124952316284, "step": 2360 }, { "epoch": 0.608908341915551, "grad_norm": 448.0, "learning_rate": 1.955458290422245e-07, "logits/chosen": -2.75, "logits/rejected": -2.815624952316284, "logps/chosen": -279.3999938964844, "logps/rejected": -290.5, "loss": 0.5539, "rewards/accuracies": 0.5, "rewards/chosen": -0.39277344942092896, "rewards/margins": 0.5552734136581421, "rewards/rejected": -0.947265625, "step": 2365 }, { "epoch": 0.6101956745623069, "grad_norm": 294.0, "learning_rate": 1.9490216271884654e-07, "logits/chosen": -2.4000000953674316, "logits/rejected": -2.1640625, "logps/chosen": -288.20001220703125, "logps/rejected": -336.20001220703125, "loss": 0.4928, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.607226550579071, "rewards/margins": 1.0107421875, "rewards/rejected": -1.6183593273162842, "step": 2370 }, { "epoch": 0.6114830072090628, "grad_norm": 616.0, "learning_rate": 1.9425849639546856e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.581249952316284, "logps/chosen": -256.0, "logps/rejected": -310.6000061035156, "loss": 0.6383, 
"rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.47998046875, "rewards/margins": 0.660351574420929, "rewards/rejected": -1.143945336341858, "step": 2375 }, { "epoch": 0.6127703398558187, "grad_norm": 394.0, "learning_rate": 1.9361483007209063e-07, "logits/chosen": -2.762500047683716, "logits/rejected": -2.7906250953674316, "logps/chosen": -310.6000061035156, "logps/rejected": -343.6000061035156, "loss": 0.4652, "rewards/accuracies": 0.75, "rewards/chosen": -0.22709961235523224, "rewards/margins": 0.91015625, "rewards/rejected": -1.137304663658142, "step": 2380 }, { "epoch": 0.6140576725025747, "grad_norm": 256.0, "learning_rate": 1.9297116374871267e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.3843750953674316, "logps/chosen": -274.8999938964844, "logps/rejected": -252.0, "loss": 0.5496, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3921875059604645, "rewards/margins": 0.5853973627090454, "rewards/rejected": -0.977734386920929, "step": 2385 }, { "epoch": 0.6153450051493305, "grad_norm": 374.0, "learning_rate": 1.9232749742533468e-07, "logits/chosen": -2.5, "logits/rejected": -2.778125047683716, "logps/chosen": -239.0, "logps/rejected": -268.1000061035156, "loss": 0.5578, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.24223633110523224, "rewards/margins": 0.42109376192092896, "rewards/rejected": -0.662890613079071, "step": 2390 }, { "epoch": 0.6166323377960865, "grad_norm": 2256.0, "learning_rate": 1.9168383110195673e-07, "logits/chosen": -2.487499952316284, "logits/rejected": -2.53125, "logps/chosen": -368.0, "logps/rejected": -410.3999938964844, "loss": 0.6293, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.68115234375, "rewards/margins": 0.8472656011581421, "rewards/rejected": -1.525390625, "step": 2395 }, { "epoch": 0.6179196704428425, "grad_norm": 644.0, "learning_rate": 1.910401647785788e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.284374952316284, "logps/chosen": -288.0, 
"logps/rejected": -407.20001220703125, "loss": 0.4547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7826172113418579, "rewards/margins": 1.403906226158142, "rewards/rejected": -2.1898436546325684, "step": 2400 }, { "epoch": 0.6192070030895983, "grad_norm": 512.0, "learning_rate": 1.903964984552008e-07, "logits/chosen": -2.5625, "logits/rejected": -2.643749952316284, "logps/chosen": -301.79998779296875, "logps/rejected": -270.20001220703125, "loss": 0.6281, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.527050793170929, "rewards/margins": 0.4427734315395355, "rewards/rejected": -0.9697265625, "step": 2405 }, { "epoch": 0.6204943357363543, "grad_norm": 604.0, "learning_rate": 1.8975283213182286e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.8031249046325684, "logps/chosen": -387.6000061035156, "logps/rejected": -392.0, "loss": 0.4301, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.49580079317092896, "rewards/margins": 1.189843773841858, "rewards/rejected": -1.6843750476837158, "step": 2410 }, { "epoch": 0.6217816683831102, "grad_norm": 820.0, "learning_rate": 1.8910916580844487e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.78125, "logps/chosen": -328.3999938964844, "logps/rejected": -293.79998779296875, "loss": 0.7281, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.3107666075229645, "rewards/margins": 0.23707886040210724, "rewards/rejected": -0.5472656488418579, "step": 2415 }, { "epoch": 0.6230690010298661, "grad_norm": 408.0, "learning_rate": 1.8846549948506694e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.637500047683716, "logps/chosen": -311.6000061035156, "logps/rejected": -346.3999938964844, "loss": 0.4926, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6163574457168579, "rewards/margins": 0.8648437261581421, "rewards/rejected": -1.4816405773162842, "step": 2420 }, { "epoch": 0.624356333676622, "grad_norm": 462.0, "learning_rate": 
1.8782183316168898e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.5562500953674316, "logps/chosen": -330.3999938964844, "logps/rejected": -388.3999938964844, "loss": 0.4223, "rewards/accuracies": 0.75, "rewards/chosen": -0.563281238079071, "rewards/margins": 1.146093726158142, "rewards/rejected": -1.7109375, "step": 2425 }, { "epoch": 0.625643666323378, "grad_norm": 632.0, "learning_rate": 1.87178166838311e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.659374952316284, "logps/chosen": -329.6000061035156, "logps/rejected": -355.3999938964844, "loss": 0.5898, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5369628667831421, "rewards/margins": 0.570507824420929, "rewards/rejected": -1.1062500476837158, "step": 2430 }, { "epoch": 0.6269309989701339, "grad_norm": 350.0, "learning_rate": 1.8653450051493304e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.703125, "logps/chosen": -329.79998779296875, "logps/rejected": -343.3999938964844, "loss": 0.6375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.44633787870407104, "rewards/margins": 0.45246583223342896, "rewards/rejected": -0.8970702886581421, "step": 2435 }, { "epoch": 0.6282183316168898, "grad_norm": 544.0, "learning_rate": 1.858908341915551e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.6500000953674316, "logps/chosen": -336.6000061035156, "logps/rejected": -398.6000061035156, "loss": 0.6461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8838866949081421, "rewards/margins": 0.658154308795929, "rewards/rejected": -1.54296875, "step": 2440 }, { "epoch": 0.6295056642636457, "grad_norm": 362.0, "learning_rate": 1.8524716786817713e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.659374952316284, "logps/chosen": -247.39999389648438, "logps/rejected": -265.79998779296875, "loss": 0.6344, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.3335937559604645, "rewards/margins": 
0.29936522245407104, "rewards/rejected": -0.633837878704071, "step": 2445 }, { "epoch": 0.6307929969104017, "grad_norm": 414.0, "learning_rate": 1.8460350154479917e-07, "logits/chosen": -2.7093749046325684, "logits/rejected": -2.643749952316284, "logps/chosen": -320.6000061035156, "logps/rejected": -328.20001220703125, "loss": 0.5609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3935302793979645, "rewards/margins": 0.7708984613418579, "rewards/rejected": -1.163671851158142, "step": 2450 }, { "epoch": 0.6320803295571575, "grad_norm": 474.0, "learning_rate": 1.8395983522142119e-07, "logits/chosen": -2.7281250953674316, "logits/rejected": -2.684375047683716, "logps/chosen": -347.6000061035156, "logps/rejected": -410.3999938964844, "loss": 0.5422, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.610546886920929, "rewards/margins": 0.9717041254043579, "rewards/rejected": -1.5828125476837158, "step": 2455 }, { "epoch": 0.6333676622039135, "grad_norm": 528.0, "learning_rate": 1.8331616889804326e-07, "logits/chosen": -2.559375047683716, "logits/rejected": -2.581249952316284, "logps/chosen": -348.79998779296875, "logps/rejected": -415.6000061035156, "loss": 0.4504, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8125, "rewards/margins": 1.1203124523162842, "rewards/rejected": -1.932031273841858, "step": 2460 }, { "epoch": 0.6346549948506695, "grad_norm": 476.0, "learning_rate": 1.826725025746653e-07, "logits/chosen": -2.768749952316284, "logits/rejected": -2.7281250953674316, "logps/chosen": -366.3999938964844, "logps/rejected": -415.20001220703125, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4140625, "rewards/margins": 0.913281261920929, "rewards/rejected": -1.328125, "step": 2465 }, { "epoch": 0.6359423274974253, "grad_norm": 380.0, "learning_rate": 1.8202883625128731e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.659374952316284, "logps/chosen": -257.3999938964844, 
"logps/rejected": -313.79998779296875, "loss": 0.5109, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08572997897863388, "rewards/margins": 0.710742175579071, "rewards/rejected": -0.797070324420929, "step": 2470 }, { "epoch": 0.6372296601441813, "grad_norm": 490.0, "learning_rate": 1.8138516992790936e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.5718750953674316, "logps/chosen": -342.3999938964844, "logps/rejected": -386.79998779296875, "loss": 0.5188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.795703113079071, "rewards/margins": 1.000830054283142, "rewards/rejected": -1.794531226158142, "step": 2475 }, { "epoch": 0.6385169927909372, "grad_norm": 360.0, "learning_rate": 1.8074150360453143e-07, "logits/chosen": -2.734375, "logits/rejected": -2.75, "logps/chosen": -301.20001220703125, "logps/rejected": -384.0, "loss": 0.4434, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.41938477754592896, "rewards/margins": 1.067968726158142, "rewards/rejected": -1.489843726158142, "step": 2480 }, { "epoch": 0.6398043254376931, "grad_norm": 258.0, "learning_rate": 1.8009783728115344e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -2.575000047683716, "logps/chosen": -340.6000061035156, "logps/rejected": -393.20001220703125, "loss": 0.5113, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9437500238418579, "rewards/margins": 0.847851574420929, "rewards/rejected": -1.796240210533142, "step": 2485 }, { "epoch": 0.641091658084449, "grad_norm": 420.0, "learning_rate": 1.7945417095777549e-07, "logits/chosen": -2.515625, "logits/rejected": -2.387500047683716, "logps/chosen": -262.6000061035156, "logps/rejected": -353.79998779296875, "loss": 0.4555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.612109363079071, "rewards/margins": 1.1570312976837158, "rewards/rejected": -1.76953125, "step": 2490 }, { "epoch": 0.642378990731205, "grad_norm": 516.0, "learning_rate": 1.788105046343975e-07, 
"logits/chosen": -2.53125, "logits/rejected": -2.481250047683716, "logps/chosen": -253.6999969482422, "logps/rejected": -269.3999938964844, "loss": 0.6082, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.41337889432907104, "rewards/margins": 0.566699206829071, "rewards/rejected": -0.9789062738418579, "step": 2495 }, { "epoch": 0.6436663233779608, "grad_norm": 486.0, "learning_rate": 1.7816683831101954e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.59375, "logps/chosen": -368.0, "logps/rejected": -408.6000061035156, "loss": 0.468, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8382323980331421, "rewards/margins": 0.987500011920929, "rewards/rejected": -1.8273437023162842, "step": 2500 }, { "epoch": 0.6449536560247168, "grad_norm": 1128.0, "learning_rate": 1.7752317198764161e-07, "logits/chosen": -2.7249999046325684, "logits/rejected": -2.7593750953674316, "logps/chosen": -305.79998779296875, "logps/rejected": -292.0, "loss": 0.6242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.40556639432907104, "rewards/margins": 0.46992188692092896, "rewards/rejected": -0.8765624761581421, "step": 2505 }, { "epoch": 0.6462409886714727, "grad_norm": 430.0, "learning_rate": 1.7687950566426363e-07, "logits/chosen": -2.4375, "logits/rejected": -2.5843749046325684, "logps/chosen": -275.0, "logps/rejected": -265.3500061035156, "loss": 0.6086, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.107421875, "rewards/margins": 0.3509887754917145, "rewards/rejected": -0.45917969942092896, "step": 2510 }, { "epoch": 0.6475283213182287, "grad_norm": 788.0, "learning_rate": 1.7623583934088567e-07, "logits/chosen": -2.65625, "logits/rejected": -2.7562499046325684, "logps/chosen": -373.20001220703125, "logps/rejected": -325.20001220703125, "loss": 0.6113, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.33671873807907104, "rewards/margins": 0.4564453065395355, "rewards/rejected": -0.792187511920929, "step": 
2515 }, { "epoch": 0.6488156539649845, "grad_norm": 388.0, "learning_rate": 1.7559217301750771e-07, "logits/chosen": -2.765625, "logits/rejected": -2.7249999046325684, "logps/chosen": -349.3999938964844, "logps/rejected": -368.3999938964844, "loss": 0.4977, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.28544920682907104, "rewards/margins": 0.7953125238418579, "rewards/rejected": -1.080468773841858, "step": 2520 }, { "epoch": 0.6501029866117405, "grad_norm": 540.0, "learning_rate": 1.7494850669412976e-07, "logits/chosen": -2.565624952316284, "logits/rejected": -2.7093749046325684, "logps/chosen": -367.6000061035156, "logps/rejected": -407.20001220703125, "loss": 0.4652, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.564648449420929, "rewards/margins": 0.991406261920929, "rewards/rejected": -1.5554687976837158, "step": 2525 }, { "epoch": 0.6513903192584964, "grad_norm": 680.0, "learning_rate": 1.743048403707518e-07, "logits/chosen": -2.609375, "logits/rejected": -2.5843749046325684, "logps/chosen": -340.20001220703125, "logps/rejected": -387.79998779296875, "loss": 0.4354, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.44184571504592896, "rewards/margins": 1.240625023841858, "rewards/rejected": -1.683203101158142, "step": 2530 }, { "epoch": 0.6526776519052523, "grad_norm": 556.0, "learning_rate": 1.7366117404737382e-07, "logits/chosen": -2.609375, "logits/rejected": -2.731250047683716, "logps/chosen": -328.3999938964844, "logps/rejected": -230.0, "loss": 0.5648, "rewards/accuracies": 0.5625, "rewards/chosen": -0.16191406548023224, "rewards/margins": 0.5634765625, "rewards/rejected": -0.725390613079071, "step": 2535 }, { "epoch": 0.6539649845520082, "grad_norm": 532.0, "learning_rate": 1.7301750772399586e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.5718750953674316, "logps/chosen": -341.79998779296875, "logps/rejected": -399.3999938964844, "loss": 0.4762, "rewards/accuracies": 0.7124999761581421, 
"rewards/chosen": -1.0548827648162842, "rewards/margins": 1.177343726158142, "rewards/rejected": -2.2359375953674316, "step": 2540 }, { "epoch": 0.6552523171987642, "grad_norm": 516.0, "learning_rate": 1.7237384140061793e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.578125, "logps/chosen": -270.3999938964844, "logps/rejected": -324.3999938964844, "loss": 0.5684, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8359375, "rewards/margins": 0.9652343988418579, "rewards/rejected": -1.8003418445587158, "step": 2545 }, { "epoch": 0.65653964984552, "grad_norm": 768.0, "learning_rate": 1.7173017507723994e-07, "logits/chosen": -2.543750047683716, "logits/rejected": -2.621875047683716, "logps/chosen": -303.6000061035156, "logps/rejected": -378.0, "loss": 0.5137, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.699414074420929, "rewards/margins": 1.2488281726837158, "rewards/rejected": -1.9490234851837158, "step": 2550 }, { "epoch": 0.657826982492276, "grad_norm": 380.0, "learning_rate": 1.71086508753862e-07, "logits/chosen": -2.4375, "logits/rejected": -2.299999952316284, "logps/chosen": -302.0, "logps/rejected": -312.3999938964844, "loss": 0.4871, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.44160157442092896, "rewards/margins": 1.0808594226837158, "rewards/rejected": -1.5234375, "step": 2555 }, { "epoch": 0.659114315139032, "grad_norm": 320.0, "learning_rate": 1.7044284243048403e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.668750047683716, "logps/chosen": -261.79998779296875, "logps/rejected": -298.29998779296875, "loss": 0.6211, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.44648438692092896, "rewards/margins": 0.5501953363418579, "rewards/rejected": -0.99444580078125, "step": 2560 }, { "epoch": 0.6604016477857878, "grad_norm": 482.0, "learning_rate": 1.6979917610710607e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.6187500953674316, "logps/chosen": 
-243.10000610351562, "logps/rejected": -336.79998779296875, "loss": 0.4918, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.321533203125, "rewards/margins": 0.8912109136581421, "rewards/rejected": -1.2125976085662842, "step": 2565 }, { "epoch": 0.6616889804325438, "grad_norm": 394.0, "learning_rate": 1.6915550978372812e-07, "logits/chosen": -2.796875, "logits/rejected": -2.8125, "logps/chosen": -382.0, "logps/rejected": -334.3999938964844, "loss": 0.5699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.258544921875, "rewards/margins": 0.521484375, "rewards/rejected": -0.779296875, "step": 2570 }, { "epoch": 0.6629763130792997, "grad_norm": 356.0, "learning_rate": 1.6851184346035013e-07, "logits/chosen": -2.7718749046325684, "logits/rejected": -2.721874952316284, "logps/chosen": -350.3999938964844, "logps/rejected": -305.79998779296875, "loss": 0.5391, "rewards/accuracies": 0.625, "rewards/chosen": -0.31416016817092896, "rewards/margins": 0.727734386920929, "rewards/rejected": -1.041406273841858, "step": 2575 }, { "epoch": 0.6642636457260556, "grad_norm": 424.0, "learning_rate": 1.6786817713697217e-07, "logits/chosen": -2.5843749046325684, "logits/rejected": -2.609375, "logps/chosen": -382.0, "logps/rejected": -498.3999938964844, "loss": 0.4025, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8578125238418579, "rewards/margins": 1.254296898841858, "rewards/rejected": -2.1109375953674316, "step": 2580 }, { "epoch": 0.6655509783728115, "grad_norm": 225.0, "learning_rate": 1.6722451081359424e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.7125000953674316, "logps/chosen": -272.0, "logps/rejected": -370.20001220703125, "loss": 0.4793, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4853515625, "rewards/margins": 0.981640636920929, "rewards/rejected": -1.4666016101837158, "step": 2585 }, { "epoch": 0.6668383110195675, "grad_norm": 540.0, "learning_rate": 1.6658084449021626e-07, "logits/chosen": 
-2.721874952316284, "logits/rejected": -2.6937499046325684, "logps/chosen": -258.20001220703125, "logps/rejected": -256.20001220703125, "loss": 0.5488, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.23075561225414276, "rewards/margins": 0.6781250238418579, "rewards/rejected": -0.9097656011581421, "step": 2590 }, { "epoch": 0.6681256436663234, "grad_norm": 616.0, "learning_rate": 1.659371781668383e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.6781249046325684, "logps/chosen": -290.79998779296875, "logps/rejected": -360.79998779296875, "loss": 0.5043, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.42241209745407104, "rewards/margins": 0.993945300579071, "rewards/rejected": -1.4152343273162842, "step": 2595 }, { "epoch": 0.6694129763130793, "grad_norm": 328.0, "learning_rate": 1.6529351184346034e-07, "logits/chosen": -2.684375047683716, "logits/rejected": -2.690624952316284, "logps/chosen": -339.0, "logps/rejected": -369.6000061035156, "loss": 0.4289, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46455079317092896, "rewards/margins": 1.1953125, "rewards/rejected": -1.6570312976837158, "step": 2600 }, { "epoch": 0.6707003089598352, "grad_norm": 386.0, "learning_rate": 1.646498455200824e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.581249952316284, "logps/chosen": -279.79998779296875, "logps/rejected": -324.0, "loss": 0.4953, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4580078125, "rewards/margins": 0.7964843511581421, "rewards/rejected": -1.256250023841858, "step": 2605 }, { "epoch": 0.6719876416065912, "grad_norm": 386.0, "learning_rate": 1.6400617919670443e-07, "logits/chosen": -2.643749952316284, "logits/rejected": NaN, "logps/chosen": -287.6000061035156, "logps/rejected": -292.0, "loss": 0.4809, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.23133544623851776, "rewards/margins": 0.842041015625, "rewards/rejected": -1.07373046875, "step": 2610 }, { "epoch": 
0.673274974253347, "grad_norm": 544.0, "learning_rate": 1.6336251287332645e-07, "logits/chosen": -2.4156250953674316, "logits/rejected": -2.575000047683716, "logps/chosen": -362.3999938964844, "logps/rejected": -395.20001220703125, "loss": 0.5844, "rewards/accuracies": 0.625, "rewards/chosen": -0.928906261920929, "rewards/margins": 0.752734363079071, "rewards/rejected": -1.683203101158142, "step": 2615 }, { "epoch": 0.674562306900103, "grad_norm": 480.0, "learning_rate": 1.627188465499485e-07, "logits/chosen": -2.546875, "logits/rejected": -2.637500047683716, "logps/chosen": -290.79998779296875, "logps/rejected": -380.79998779296875, "loss": 0.4844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.29963380098342896, "rewards/margins": 0.917675793170929, "rewards/rejected": -1.2156250476837158, "step": 2620 }, { "epoch": 0.675849639546859, "grad_norm": 270.0, "learning_rate": 1.6207518022657056e-07, "logits/chosen": -2.65625, "logits/rejected": -2.2464842796325684, "logps/chosen": -243.0124969482422, "logps/rejected": -217.60000610351562, "loss": 0.6195, "rewards/accuracies": 0.4375, "rewards/chosen": -0.17729492485523224, "rewards/margins": 0.3058105409145355, "rewards/rejected": -0.4839843809604645, "step": 2625 }, { "epoch": 0.6771369721936148, "grad_norm": 418.0, "learning_rate": 1.6143151390319257e-07, "logits/chosen": -2.465625047683716, "logits/rejected": -2.5062499046325684, "logps/chosen": -294.6000061035156, "logps/rejected": -362.0, "loss": 0.452, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8399902582168579, "rewards/margins": 1.106689453125, "rewards/rejected": -1.9435546398162842, "step": 2630 }, { "epoch": 0.6784243048403708, "grad_norm": 366.0, "learning_rate": 1.6078784757981462e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.496875047683716, "logps/chosen": -261.0, "logps/rejected": -244.0, "loss": 0.5289, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3173828125, "rewards/margins": 0.7718750238418579, 
"rewards/rejected": -1.0886719226837158, "step": 2635 }, { "epoch": 0.6797116374871267, "grad_norm": 364.0, "learning_rate": 1.6014418125643666e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.765625, "logps/chosen": -289.3999938964844, "logps/rejected": -339.20001220703125, "loss": 0.4699, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16416016221046448, "rewards/margins": 1.0574219226837158, "rewards/rejected": -1.223046898841858, "step": 2640 }, { "epoch": 0.6809989701338826, "grad_norm": 159.0, "learning_rate": 1.5950051493305868e-07, "logits/chosen": -2.546875, "logits/rejected": -2.643749952316284, "logps/chosen": -206.14999389648438, "logps/rejected": -226.64999389648438, "loss": 0.5715, "rewards/accuracies": 0.375, "rewards/chosen": -0.06494140625, "rewards/margins": 0.5435546636581421, "rewards/rejected": -0.6087890863418579, "step": 2645 }, { "epoch": 0.6822863027806385, "grad_norm": 466.0, "learning_rate": 1.5885684860968075e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.6343750953674316, "logps/chosen": -324.3999938964844, "logps/rejected": -370.79998779296875, "loss": 0.4688, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7738281488418579, "rewards/margins": 1.033593773841858, "rewards/rejected": -1.807031273841858, "step": 2650 }, { "epoch": 0.6835736354273945, "grad_norm": 370.0, "learning_rate": 1.5821318228630276e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.575000047683716, "logps/chosen": -320.0, "logps/rejected": -413.6000061035156, "loss": 0.418, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6363281011581421, "rewards/margins": 1.338281273841858, "rewards/rejected": -1.9734375476837158, "step": 2655 }, { "epoch": 0.6848609680741503, "grad_norm": 624.0, "learning_rate": 1.575695159629248e-07, "logits/chosen": -2.534374952316284, "logits/rejected": -2.471874952316284, "logps/chosen": -395.6000061035156, "logps/rejected": -370.79998779296875, 
"loss": 0.5609, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.135156273841858, "rewards/margins": 0.989062488079071, "rewards/rejected": -2.128124952316284, "step": 2660 }, { "epoch": 0.6861483007209063, "grad_norm": 556.0, "learning_rate": 1.5692584963954685e-07, "logits/chosen": -2.700000047683716, "logits/rejected": -2.703125, "logps/chosen": -262.3999938964844, "logps/rejected": -300.79998779296875, "loss": 0.523, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3162597715854645, "rewards/margins": 0.767578125, "rewards/rejected": -1.083593726158142, "step": 2665 }, { "epoch": 0.6874356333676622, "grad_norm": 520.0, "learning_rate": 1.562821833161689e-07, "logits/chosen": -2.609375, "logits/rejected": -2.7437500953674316, "logps/chosen": -308.79998779296875, "logps/rejected": -343.6000061035156, "loss": 0.5793, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.606396496295929, "rewards/margins": 0.623242199420929, "rewards/rejected": -1.227929711341858, "step": 2670 }, { "epoch": 0.6887229660144182, "grad_norm": 564.0, "learning_rate": 1.5563851699279093e-07, "logits/chosen": -2.715625047683716, "logits/rejected": -2.690624952316284, "logps/chosen": -285.20001220703125, "logps/rejected": -325.79998779296875, "loss": 0.5332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3843750059604645, "rewards/margins": 0.8482421636581421, "rewards/rejected": -1.232031226158142, "step": 2675 }, { "epoch": 0.690010298661174, "grad_norm": 544.0, "learning_rate": 1.5499485066941297e-07, "logits/chosen": -2.515625, "logits/rejected": -2.53125, "logps/chosen": -353.20001220703125, "logps/rejected": -462.0, "loss": 0.4105, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8893066644668579, "rewards/margins": 1.4921875, "rewards/rejected": -2.385937452316284, "step": 2680 }, { "epoch": 0.69129763130793, "grad_norm": 392.0, "learning_rate": 1.54351184346035e-07, "logits/chosen": -2.590625047683716, 
"logits/rejected": -2.5328125953674316, "logps/chosen": -290.0, "logps/rejected": -361.6000061035156, "loss": 0.5012, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4346679747104645, "rewards/margins": 0.9593750238418579, "rewards/rejected": -1.3927733898162842, "step": 2685 }, { "epoch": 0.6925849639546859, "grad_norm": 430.0, "learning_rate": 1.5370751802265706e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.4937500953674316, "logps/chosen": -323.3999938964844, "logps/rejected": -380.0, "loss": 0.4203, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.47935181856155396, "rewards/margins": 1.208593726158142, "rewards/rejected": -1.685156226158142, "step": 2690 }, { "epoch": 0.6938722966014418, "grad_norm": 680.0, "learning_rate": 1.5306385169927908e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.643749952316284, "logps/chosen": -335.6000061035156, "logps/rejected": -311.79998779296875, "loss": 0.5781, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.510546863079071, "rewards/margins": 0.720703125, "rewards/rejected": -1.2292969226837158, "step": 2695 }, { "epoch": 0.6951596292481977, "grad_norm": 376.0, "learning_rate": 1.5242018537590112e-07, "logits/chosen": -2.5625, "logits/rejected": -2.021484375, "logps/chosen": -286.79998779296875, "logps/rejected": -291.3999938964844, "loss": 0.5875, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.22780761122703552, "rewards/margins": 0.427490234375, "rewards/rejected": -0.65625, "step": 2700 }, { "epoch": 0.6964469618949537, "grad_norm": 386.0, "learning_rate": 1.5177651905252316e-07, "logits/chosen": -2.5859375, "logits/rejected": -2.7718749046325684, "logps/chosen": -319.6000061035156, "logps/rejected": -306.79998779296875, "loss": 0.5531, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3067871034145355, "rewards/margins": 0.58154296875, "rewards/rejected": -0.8892577886581421, "step": 2705 }, { "epoch": 0.6977342945417095, 
"grad_norm": 512.0, "learning_rate": 1.511328527291452e-07, "logits/chosen": -2.5531249046325684, "logits/rejected": -2.609375, "logps/chosen": -312.79998779296875, "logps/rejected": -372.0, "loss": 0.4789, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5572265386581421, "rewards/margins": 0.935546875, "rewards/rejected": -1.493749976158142, "step": 2710 }, { "epoch": 0.6990216271884655, "grad_norm": 432.0, "learning_rate": 1.5048918640576725e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.4390625953674316, "logps/chosen": -318.79998779296875, "logps/rejected": -406.79998779296875, "loss": 0.4461, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.840624988079071, "rewards/margins": 1.209375023841858, "rewards/rejected": -2.051562547683716, "step": 2715 }, { "epoch": 0.7003089598352215, "grad_norm": 592.0, "learning_rate": 1.498455200823893e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.65625, "logps/chosen": -345.6000061035156, "logps/rejected": -370.20001220703125, "loss": 0.5168, "rewards/accuracies": 0.625, "rewards/chosen": -0.4556640684604645, "rewards/margins": 0.8873046636581421, "rewards/rejected": -1.340234398841858, "step": 2720 }, { "epoch": 0.7015962924819773, "grad_norm": 388.0, "learning_rate": 1.492018537590113e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.6812500953674316, "logps/chosen": -323.6000061035156, "logps/rejected": -329.0, "loss": 0.4562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.40302735567092896, "rewards/margins": 1.0203125476837158, "rewards/rejected": -1.4246094226837158, "step": 2725 }, { "epoch": 0.7028836251287333, "grad_norm": 384.0, "learning_rate": 1.4855818743563338e-07, "logits/chosen": -2.6500000953674316, "logits/rejected": -2.8125, "logps/chosen": -302.6000061035156, "logps/rejected": -330.0, "loss": 0.5387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.18574218451976776, "rewards/margins": 
0.678515613079071, "rewards/rejected": -0.864062488079071, "step": 2730 }, { "epoch": 0.7041709577754892, "grad_norm": 632.0, "learning_rate": 1.4791452111225542e-07, "logits/chosen": -2.612499952316284, "logits/rejected": -2.6468749046325684, "logps/chosen": -304.20001220703125, "logps/rejected": -361.20001220703125, "loss": 0.541, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6514648199081421, "rewards/margins": 0.878613293170929, "rewards/rejected": -1.5300781726837158, "step": 2735 }, { "epoch": 0.7054582904222451, "grad_norm": 724.0, "learning_rate": 1.4727085478887743e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.7406249046325684, "logps/chosen": -299.20001220703125, "logps/rejected": -308.6000061035156, "loss": 0.5813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5006958246231079, "rewards/margins": 0.699999988079071, "rewards/rejected": -1.2023437023162842, "step": 2740 }, { "epoch": 0.706745623069001, "grad_norm": 592.0, "learning_rate": 1.4662718846549948e-07, "logits/chosen": -2.75, "logits/rejected": -2.609375, "logps/chosen": -308.20001220703125, "logps/rejected": -350.0, "loss": 0.607, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6744629144668579, "rewards/margins": 0.616406261920929, "rewards/rejected": -1.2902343273162842, "step": 2745 }, { "epoch": 0.708032955715757, "grad_norm": 366.0, "learning_rate": 1.4598352214212152e-07, "logits/chosen": -2.7093749046325684, "logits/rejected": -2.456249952316284, "logps/chosen": -319.20001220703125, "logps/rejected": -333.79998779296875, "loss": 0.593, "rewards/accuracies": 0.625, "rewards/chosen": -0.2964843809604645, "rewards/margins": 0.39570313692092896, "rewards/rejected": -0.691210925579071, "step": 2750 }, { "epoch": 0.7093202883625128, "grad_norm": 344.0, "learning_rate": 1.4533985581874356e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.856250047683716, "logps/chosen": -291.79998779296875, "logps/rejected": -300.0, 
"loss": 0.557, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.16796875, "rewards/margins": 0.522656261920929, "rewards/rejected": -0.690234363079071, "step": 2755 }, { "epoch": 0.7106076210092688, "grad_norm": 394.0, "learning_rate": 1.446961894953656e-07, "logits/chosen": -2.465625047683716, "logits/rejected": -2.546875, "logps/chosen": -259.6000061035156, "logps/rejected": -330.6000061035156, "loss": 0.5293, "rewards/accuracies": 0.6875, "rewards/chosen": -0.537109375, "rewards/margins": 0.8970702886581421, "rewards/rejected": -1.435156226158142, "step": 2760 }, { "epoch": 0.7118949536560247, "grad_norm": 286.0, "learning_rate": 1.4405252317198762e-07, "logits/chosen": -2.4671874046325684, "logits/rejected": -2.2632813453674316, "logps/chosen": -225.6999969482422, "logps/rejected": -280.6000061035156, "loss": 0.5633, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.3643554747104645, "rewards/margins": 0.572460949420929, "rewards/rejected": -0.9359375238418579, "step": 2765 }, { "epoch": 0.7131822863027807, "grad_norm": 318.0, "learning_rate": 1.434088568486097e-07, "logits/chosen": -2.609375, "logits/rejected": -2.5625, "logps/chosen": -320.20001220703125, "logps/rejected": -379.20001220703125, "loss": 0.4684, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3599609434604645, "rewards/margins": 0.8902343511581421, "rewards/rejected": -1.248437523841858, "step": 2770 }, { "epoch": 0.7144696189495365, "grad_norm": 488.0, "learning_rate": 1.4276519052523173e-07, "logits/chosen": -2.5218749046325684, "logits/rejected": -2.825000047683716, "logps/chosen": -247.60000610351562, "logps/rejected": -266.6000061035156, "loss": 0.5988, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.44746094942092896, "rewards/margins": 0.528076171875, "rewards/rejected": -0.9761718511581421, "step": 2775 }, { "epoch": 0.7157569515962925, "grad_norm": 306.0, "learning_rate": 1.4212152420185375e-07, "logits/chosen": -2.690624952316284, 
"logits/rejected": -2.65625, "logps/chosen": -334.3999938964844, "logps/rejected": -345.20001220703125, "loss": 0.5129, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5379883050918579, "rewards/margins": 0.798828125, "rewards/rejected": -1.3367187976837158, "step": 2780 }, { "epoch": 0.7170442842430484, "grad_norm": 572.0, "learning_rate": 1.414778578784758e-07, "logits/chosen": -2.559375047683716, "logits/rejected": -2.293750047683716, "logps/chosen": -256.3999938964844, "logps/rejected": -319.3999938964844, "loss": 0.577, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.47216796875, "rewards/margins": 0.5557616949081421, "rewards/rejected": -1.029687523841858, "step": 2785 }, { "epoch": 0.7183316168898043, "grad_norm": 212.0, "learning_rate": 1.408341915550978e-07, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.659374952316284, "logps/chosen": -327.6000061035156, "logps/rejected": -380.79998779296875, "loss": 0.4545, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4457031190395355, "rewards/margins": 1.1160156726837158, "rewards/rejected": -1.5597655773162842, "step": 2790 }, { "epoch": 0.7196189495365602, "grad_norm": 528.0, "learning_rate": 1.4019052523171988e-07, "logits/chosen": -2.487499952316284, "logits/rejected": -2.4375, "logps/chosen": -312.79998779296875, "logps/rejected": -402.3999938964844, "loss": 0.4766, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.672656238079071, "rewards/margins": 1.19921875, "rewards/rejected": -1.87109375, "step": 2795 }, { "epoch": 0.7209062821833162, "grad_norm": 212.0, "learning_rate": 1.3954685890834192e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.721874952316284, "logps/chosen": -234.8000030517578, "logps/rejected": -304.3999938964844, "loss": 0.527, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3563476502895355, "rewards/margins": 0.7398437261581421, "rewards/rejected": -1.096093773841858, "step": 2800 }, { 
"epoch": 0.722193614830072, "grad_norm": 356.0, "learning_rate": 1.3890319258496394e-07, "logits/chosen": -2.7281250953674316, "logits/rejected": -2.721874952316284, "logps/chosen": -291.20001220703125, "logps/rejected": -316.20001220703125, "loss": 0.6047, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3203125, "rewards/margins": 0.40919190645217896, "rewards/rejected": -0.729296863079071, "step": 2805 }, { "epoch": 0.723480947476828, "grad_norm": 236.0, "learning_rate": 1.3825952626158598e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.7562499046325684, "logps/chosen": -284.1000061035156, "logps/rejected": -310.0, "loss": 0.5914, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.48750001192092896, "rewards/margins": 0.7123047113418579, "rewards/rejected": -1.198583960533142, "step": 2810 }, { "epoch": 0.724768280123584, "grad_norm": 174.0, "learning_rate": 1.3761585993820805e-07, "logits/chosen": -2.737499952316284, "logits/rejected": -2.6624999046325684, "logps/chosen": -314.0, "logps/rejected": -327.3999938964844, "loss": 0.5797, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3680175840854645, "rewards/margins": 0.6812988519668579, "rewards/rejected": -1.049414038658142, "step": 2815 }, { "epoch": 0.7260556127703398, "grad_norm": 394.0, "learning_rate": 1.3697219361483006e-07, "logits/chosen": -2.25, "logits/rejected": -2.1578125953674316, "logps/chosen": -297.6000061035156, "logps/rejected": -352.6000061035156, "loss": 0.4906, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.71484375, "rewards/margins": 1.146093726158142, "rewards/rejected": -1.8601562976837158, "step": 2820 }, { "epoch": 0.7273429454170958, "grad_norm": 438.0, "learning_rate": 1.363285272914521e-07, "logits/chosen": -2.484375, "logits/rejected": -2.465625047683716, "logps/chosen": -345.20001220703125, "logps/rejected": -414.0, "loss": 0.4023, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.131250023841858, 
"rewards/margins": 1.42578125, "rewards/rejected": -2.559375047683716, "step": 2825 }, { "epoch": 0.7286302780638517, "grad_norm": 296.0, "learning_rate": 1.3568486096807412e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.418750047683716, "logps/chosen": -310.6000061035156, "logps/rejected": -307.20001220703125, "loss": 0.5047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.378173828125, "rewards/margins": 0.9657226800918579, "rewards/rejected": -1.347265601158142, "step": 2830 }, { "epoch": 0.7299176107106076, "grad_norm": 488.0, "learning_rate": 1.350411946446962e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.621875047683716, "logps/chosen": -339.3999938964844, "logps/rejected": -370.0, "loss": 0.4641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5435546636581421, "rewards/margins": 0.947265625, "rewards/rejected": -1.490625023841858, "step": 2835 }, { "epoch": 0.7312049433573635, "grad_norm": 414.0, "learning_rate": 1.3439752832131823e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.653125047683716, "logps/chosen": -397.20001220703125, "logps/rejected": -405.20001220703125, "loss": 0.4457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4736328125, "rewards/margins": 1.07421875, "rewards/rejected": -1.549218773841858, "step": 2840 }, { "epoch": 0.7324922760041195, "grad_norm": 996.0, "learning_rate": 1.3375386199794025e-07, "logits/chosen": -2.53125, "logits/rejected": -2.734375, "logps/chosen": -279.3999938964844, "logps/rejected": -300.6000061035156, "loss": 0.5613, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.676953136920929, "rewards/margins": 0.883593738079071, "rewards/rejected": -1.5578124523162842, "step": 2845 }, { "epoch": 0.7337796086508754, "grad_norm": 432.0, "learning_rate": 1.331101956745623e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.5374999046325684, "logps/chosen": -275.3999938964844, "logps/rejected": -320.3999938964844, 
"loss": 0.5363, "rewards/accuracies": 0.625, "rewards/chosen": -0.40629881620407104, "rewards/margins": 0.8226562738418579, "rewards/rejected": -1.228124976158142, "step": 2850 }, { "epoch": 0.7350669412976313, "grad_norm": 438.0, "learning_rate": 1.3246652935118436e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.71875, "logps/chosen": -315.6000061035156, "logps/rejected": -342.0, "loss": 0.416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.537792980670929, "rewards/margins": 1.042578101158142, "rewards/rejected": -1.5773437023162842, "step": 2855 }, { "epoch": 0.7363542739443872, "grad_norm": 252.0, "learning_rate": 1.3182286302780638e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.612499952316284, "logps/chosen": -280.70001220703125, "logps/rejected": -284.20001220703125, "loss": 0.5773, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6435546875, "rewards/margins": 0.6591796875, "rewards/rejected": -1.302343726158142, "step": 2860 }, { "epoch": 0.7376416065911432, "grad_norm": 740.0, "learning_rate": 1.3117919670442842e-07, "logits/chosen": -2.4593749046325684, "logits/rejected": -2.5562500953674316, "logps/chosen": -233.1999969482422, "logps/rejected": -274.20001220703125, "loss": 0.5508, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4351562559604645, "rewards/margins": 0.646289050579071, "rewards/rejected": -1.0812499523162842, "step": 2865 }, { "epoch": 0.738928939237899, "grad_norm": 492.0, "learning_rate": 1.3053553038105044e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.596874952316284, "logps/chosen": -316.79998779296875, "logps/rejected": -370.3999938964844, "loss": 0.4871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.525195300579071, "rewards/margins": 0.8705078363418579, "rewards/rejected": -1.4004638195037842, "step": 2870 }, { "epoch": 0.740216271884655, "grad_norm": 368.0, "learning_rate": 1.298918640576725e-07, "logits/chosen": 
-2.565624952316284, "logits/rejected": -2.3843750953674316, "logps/chosen": -396.3999938964844, "logps/rejected": -442.0, "loss": 0.4656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.836718738079071, "rewards/margins": 1.4421875476837158, "rewards/rejected": -2.2828125953674316, "step": 2875 }, { "epoch": 0.741503604531411, "grad_norm": 516.0, "learning_rate": 1.2924819773429455e-07, "logits/chosen": -2.440624952316284, "logits/rejected": -2.3296875953674316, "logps/chosen": -263.0, "logps/rejected": -262.6000061035156, "loss": 0.5785, "rewards/accuracies": 0.5, "rewards/chosen": -0.5234375, "rewards/margins": 0.574902355670929, "rewards/rejected": -1.098046898841858, "step": 2880 }, { "epoch": 0.7427909371781668, "grad_norm": 692.0, "learning_rate": 1.2860453141091657e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.628124952316284, "logps/chosen": -273.20001220703125, "logps/rejected": -287.20001220703125, "loss": 0.6813, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.511523425579071, "rewards/margins": 0.2779296934604645, "rewards/rejected": -0.788281261920929, "step": 2885 }, { "epoch": 0.7440782698249228, "grad_norm": 446.0, "learning_rate": 1.279608650875386e-07, "logits/chosen": -2.778125047683716, "logits/rejected": -2.7562499046325684, "logps/chosen": -299.79998779296875, "logps/rejected": -348.3999938964844, "loss": 0.5238, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.37675780057907104, "rewards/margins": 0.766406238079071, "rewards/rejected": -1.142578125, "step": 2890 }, { "epoch": 0.7453656024716787, "grad_norm": 564.0, "learning_rate": 1.2731719876416068e-07, "logits/chosen": -2.542187452316284, "logits/rejected": -2.6343750953674316, "logps/chosen": -335.20001220703125, "logps/rejected": -342.6000061035156, "loss": 0.5096, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2781127989292145, "rewards/margins": 0.9222656488418579, "rewards/rejected": -1.1984374523162842, "step": 2895 }, { 
"epoch": 0.7466529351184346, "grad_norm": 424.0, "learning_rate": 1.266735324407827e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.659374952316284, "logps/chosen": -330.3999938964844, "logps/rejected": -460.3999938964844, "loss": 0.4246, "rewards/accuracies": 0.75, "rewards/chosen": -0.655078113079071, "rewards/margins": 1.139062523841858, "rewards/rejected": -1.794531226158142, "step": 2900 }, { "epoch": 0.7479402677651905, "grad_norm": 756.0, "learning_rate": 1.2602986611740474e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.418750047683716, "logps/chosen": -305.20001220703125, "logps/rejected": -321.0, "loss": 0.4836, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4568847715854645, "rewards/margins": 1.072265625, "rewards/rejected": -1.5304687023162842, "step": 2905 }, { "epoch": 0.7492276004119465, "grad_norm": 328.0, "learning_rate": 1.2538619979402675e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.356250047683716, "logps/chosen": -231.5, "logps/rejected": -209.39999389648438, "loss": 0.5551, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.09160156548023224, "rewards/margins": 0.505664050579071, "rewards/rejected": -0.41455078125, "step": 2910 }, { "epoch": 0.7505149330587023, "grad_norm": 564.0, "learning_rate": 1.247425334706488e-07, "logits/chosen": -2.640625, "logits/rejected": -2.512500047683716, "logps/chosen": -265.0, "logps/rejected": -328.20001220703125, "loss": 0.4758, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.40312498807907104, "rewards/margins": 0.889843761920929, "rewards/rejected": -1.2917969226837158, "step": 2915 }, { "epoch": 0.7518022657054583, "grad_norm": 406.0, "learning_rate": 1.2409886714727084e-07, "logits/chosen": -2.703125, "logits/rejected": -2.6968750953674316, "logps/chosen": -335.79998779296875, "logps/rejected": -369.6000061035156, "loss": 0.468, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09404297173023224, 
"rewards/margins": 1.1023437976837158, "rewards/rejected": -1.1984374523162842, "step": 2920 }, { "epoch": 0.7530895983522142, "grad_norm": 572.0, "learning_rate": 1.2345520082389288e-07, "logits/chosen": -2.625, "logits/rejected": -2.5999999046325684, "logps/chosen": -317.6000061035156, "logps/rejected": -339.0, "loss": 0.5305, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.504931628704071, "rewards/margins": 0.74560546875, "rewards/rejected": -1.2517211437225342, "step": 2925 }, { "epoch": 0.7543769309989702, "grad_norm": 306.0, "learning_rate": 1.2281153450051492e-07, "logits/chosen": -2.671875, "logits/rejected": -2.6156249046325684, "logps/chosen": -297.3999938964844, "logps/rejected": -308.0, "loss": 0.6082, "rewards/accuracies": 0.5, "rewards/chosen": -0.25537109375, "rewards/margins": 0.560351550579071, "rewards/rejected": -0.8137451410293579, "step": 2930 }, { "epoch": 0.755664263645726, "grad_norm": 924.0, "learning_rate": 1.2216786817713697e-07, "logits/chosen": -2.7874999046325684, "logits/rejected": -2.796875, "logps/chosen": -286.20001220703125, "logps/rejected": -288.0, "loss": 0.5207, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3642578125, "rewards/margins": 0.752734363079071, "rewards/rejected": -1.1179687976837158, "step": 2935 }, { "epoch": 0.756951596292482, "grad_norm": 296.0, "learning_rate": 1.21524201853759e-07, "logits/chosen": -2.8125, "logits/rejected": -2.809375047683716, "logps/chosen": -350.3999938964844, "logps/rejected": -343.3999938964844, "loss": 0.557, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.29827880859375, "rewards/margins": 0.660937488079071, "rewards/rejected": -0.9609375, "step": 2940 }, { "epoch": 0.7582389289392379, "grad_norm": 524.0, "learning_rate": 1.2088053553038105e-07, "logits/chosen": -2.690624952316284, "logits/rejected": -2.559375047683716, "logps/chosen": -301.20001220703125, "logps/rejected": -311.3999938964844, "loss": 0.6523, "rewards/accuracies": 
0.5249999761581421, "rewards/chosen": -0.4061523377895355, "rewards/margins": 0.38789063692092896, "rewards/rejected": -0.7935546636581421, "step": 2945 }, { "epoch": 0.7595262615859938, "grad_norm": 248.0, "learning_rate": 1.202368692070031e-07, "logits/chosen": -2.59375, "logits/rejected": -2.5875000953674316, "logps/chosen": -297.6000061035156, "logps/rejected": -355.6000061035156, "loss": 0.3555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4888916015625, "rewards/margins": 1.326562523841858, "rewards/rejected": -1.814062476158142, "step": 2950 }, { "epoch": 0.7608135942327497, "grad_norm": 428.0, "learning_rate": 1.195932028836251e-07, "logits/chosen": -2.578125, "logits/rejected": -2.6468749046325684, "logps/chosen": -305.20001220703125, "logps/rejected": -324.20001220703125, "loss": 0.5746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4066406190395355, "rewards/margins": 0.7092040777206421, "rewards/rejected": -1.1168701648712158, "step": 2955 }, { "epoch": 0.7621009268795057, "grad_norm": 494.0, "learning_rate": 1.1894953656024715e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.5062499046325684, "logps/chosen": -306.20001220703125, "logps/rejected": -376.3999938964844, "loss": 0.5074, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8060547113418579, "rewards/margins": 0.756640613079071, "rewards/rejected": -1.5656249523162842, "step": 2960 }, { "epoch": 0.7633882595262615, "grad_norm": 292.0, "learning_rate": 1.1830587023686921e-07, "logits/chosen": -2.934375047683716, "logits/rejected": -2.7562499046325684, "logps/chosen": -289.0, "logps/rejected": -265.0, "loss": 0.5813, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.039306640625, "rewards/margins": 0.500781238079071, "rewards/rejected": -0.541210949420929, "step": 2965 }, { "epoch": 0.7646755921730175, "grad_norm": 382.0, "learning_rate": 1.1766220391349124e-07, "logits/chosen": -2.253124952316284, "logits/rejected": 
-2.4312500953674316, "logps/chosen": -328.0, "logps/rejected": -310.20001220703125, "loss": 0.5148, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6382812261581421, "rewards/margins": 0.9564453363418579, "rewards/rejected": -1.59716796875, "step": 2970 }, { "epoch": 0.7659629248197735, "grad_norm": 536.0, "learning_rate": 1.1701853759011328e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.6343750953674316, "logps/chosen": -341.3999938964844, "logps/rejected": -367.3999938964844, "loss": 0.5785, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6107422113418579, "rewards/margins": 0.8179687261581421, "rewards/rejected": -1.4289062023162842, "step": 2975 }, { "epoch": 0.7672502574665293, "grad_norm": 500.0, "learning_rate": 1.1637487126673531e-07, "logits/chosen": -2.5374999046325684, "logits/rejected": -2.549999952316284, "logps/chosen": -312.79998779296875, "logps/rejected": -317.3999938964844, "loss": 0.4504, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.355764776468277, "rewards/margins": 1.035546898841858, "rewards/rejected": -1.390625, "step": 2980 }, { "epoch": 0.7685375901132853, "grad_norm": 247.0, "learning_rate": 1.1573120494335737e-07, "logits/chosen": -2.5062499046325684, "logits/rejected": -2.6468749046325684, "logps/chosen": -278.20001220703125, "logps/rejected": -276.6000061035156, "loss": 0.6391, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.43476563692092896, "rewards/margins": 0.5615234375, "rewards/rejected": -0.995898425579071, "step": 2985 }, { "epoch": 0.7698249227600412, "grad_norm": 314.0, "learning_rate": 1.150875386199794e-07, "logits/chosen": -2.8187499046325684, "logits/rejected": -2.7093749046325684, "logps/chosen": -291.6000061035156, "logps/rejected": -231.8000030517578, "loss": 0.5754, "rewards/accuracies": 0.5, "rewards/chosen": -0.068328857421875, "rewards/margins": 0.3980468809604645, "rewards/rejected": -0.46564942598342896, "step": 2990 }, { "epoch": 
0.7711122554067971, "grad_norm": 380.0, "learning_rate": 1.1444387229660144e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -2.653125047683716, "logps/chosen": -287.6000061035156, "logps/rejected": -311.3999938964844, "loss": 0.5551, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.21855469048023224, "rewards/margins": 0.6683593988418579, "rewards/rejected": -0.887499988079071, "step": 2995 }, { "epoch": 0.772399588053553, "grad_norm": 608.0, "learning_rate": 1.1380020597322347e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.606250047683716, "logps/chosen": -351.6000061035156, "logps/rejected": -355.20001220703125, "loss": 0.5346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.115966796875, "rewards/margins": 0.656445324420929, "rewards/rejected": -0.772265613079071, "step": 3000 }, { "epoch": 0.773686920700309, "grad_norm": 580.0, "learning_rate": 1.1315653964984552e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.621875047683716, "logps/chosen": -333.20001220703125, "logps/rejected": -338.3999938964844, "loss": 0.5797, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3773437440395355, "rewards/margins": 0.791796863079071, "rewards/rejected": -1.1682617664337158, "step": 3005 }, { "epoch": 0.7749742533470649, "grad_norm": 580.0, "learning_rate": 1.1251287332646755e-07, "logits/chosen": -2.253124952316284, "logits/rejected": -2.215625047683716, "logps/chosen": -299.20001220703125, "logps/rejected": -341.79998779296875, "loss": 0.5172, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6974853277206421, "rewards/margins": 0.970263659954071, "rewards/rejected": -1.668432593345642, "step": 3010 }, { "epoch": 0.7762615859938208, "grad_norm": 808.0, "learning_rate": 1.118692070030896e-07, "logits/chosen": -2.671875, "logits/rejected": -2.528125047683716, "logps/chosen": -309.20001220703125, "logps/rejected": -341.6000061035156, "loss": 0.5387, "rewards/accuracies": 
0.5874999761581421, "rewards/chosen": -0.5111328363418579, "rewards/margins": 0.742382824420929, "rewards/rejected": -1.2519042491912842, "step": 3015 }, { "epoch": 0.7775489186405767, "grad_norm": 302.0, "learning_rate": 1.1122554067971163e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.637500047683716, "logps/chosen": -317.20001220703125, "logps/rejected": -300.79998779296875, "loss": 0.5094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6429687738418579, "rewards/margins": 0.995312511920929, "rewards/rejected": -1.6355469226837158, "step": 3020 }, { "epoch": 0.7788362512873327, "grad_norm": 588.0, "learning_rate": 1.1058187435633368e-07, "logits/chosen": -2.53125, "logits/rejected": -2.643749952316284, "logps/chosen": -328.3999938964844, "logps/rejected": -329.79998779296875, "loss": 0.4813, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4691406190395355, "rewards/margins": 0.925000011920929, "rewards/rejected": -1.391992211341858, "step": 3025 }, { "epoch": 0.7801235839340885, "grad_norm": 500.0, "learning_rate": 1.0993820803295571e-07, "logits/chosen": -2.7093749046325684, "logits/rejected": -2.549999952316284, "logps/chosen": -320.20001220703125, "logps/rejected": -378.3999938964844, "loss": 0.424, "rewards/accuracies": 0.75, "rewards/chosen": -0.4867187440395355, "rewards/margins": 1.213281273841858, "rewards/rejected": -1.7023437023162842, "step": 3030 }, { "epoch": 0.7814109165808445, "grad_norm": 732.0, "learning_rate": 1.0929454170957775e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.5687499046325684, "logps/chosen": -295.75, "logps/rejected": -313.5, "loss": 0.5207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6732177734375, "rewards/margins": 1.0499999523162842, "rewards/rejected": -1.722753882408142, "step": 3035 }, { "epoch": 0.7826982492276005, "grad_norm": 484.0, "learning_rate": 1.0865087538619978e-07, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.515625, 
"logps/chosen": -252.1999969482422, "logps/rejected": -360.0, "loss": 0.457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.59765625, "rewards/margins": 1.1953125, "rewards/rejected": -1.796875, "step": 3040 }, { "epoch": 0.7839855818743563, "grad_norm": 244.0, "learning_rate": 1.0800720906282184e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.674999952316284, "logps/chosen": -299.79998779296875, "logps/rejected": -358.0, "loss": 0.5715, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5848144292831421, "rewards/margins": 0.652880847454071, "rewards/rejected": -1.2394530773162842, "step": 3045 }, { "epoch": 0.7852729145211123, "grad_norm": 4768.0, "learning_rate": 1.0736354273944387e-07, "logits/chosen": -2.706249952316284, "logits/rejected": -2.721874952316284, "logps/chosen": -240.8000030517578, "logps/rejected": -256.5, "loss": 0.6871, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.318359375, "rewards/margins": 0.35429686307907104, "rewards/rejected": -0.673828125, "step": 3050 }, { "epoch": 0.7865602471678682, "grad_norm": 312.0, "learning_rate": 1.067198764160659e-07, "logits/chosen": -2.737499952316284, "logits/rejected": -2.6171875, "logps/chosen": -332.0, "logps/rejected": -391.20001220703125, "loss": 0.584, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.21601562201976776, "rewards/margins": 0.6768554449081421, "rewards/rejected": -0.8910156488418579, "step": 3055 }, { "epoch": 0.787847579814624, "grad_norm": 344.0, "learning_rate": 1.0607621009268794e-07, "logits/chosen": -2.721874952316284, "logits/rejected": -2.640625, "logps/chosen": -324.0, "logps/rejected": -372.0, "loss": 0.4773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.358642578125, "rewards/margins": 1.1496093273162842, "rewards/rejected": -1.5070312023162842, "step": 3060 }, { "epoch": 0.78913491246138, "grad_norm": 482.0, "learning_rate": 1.0543254376930998e-07, "logits/chosen": -2.6187500953674316, 
"logits/rejected": -2.512500047683716, "logps/chosen": -280.0, "logps/rejected": -336.3999938964844, "loss": 0.5008, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.491943359375, "rewards/margins": 1.0625, "rewards/rejected": -1.5554687976837158, "step": 3065 }, { "epoch": 0.790422245108136, "grad_norm": 330.0, "learning_rate": 1.0478887744593203e-07, "logits/chosen": -2.7593750953674316, "logits/rejected": -2.671875, "logps/chosen": -295.79998779296875, "logps/rejected": -246.1999969482422, "loss": 0.6484, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.12312011420726776, "rewards/margins": 0.29462891817092896, "rewards/rejected": -0.41874998807907104, "step": 3070 }, { "epoch": 0.7917095777548918, "grad_norm": 376.0, "learning_rate": 1.0414521112255406e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.5140624046325684, "logps/chosen": -276.20001220703125, "logps/rejected": -305.79998779296875, "loss": 0.4994, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3458007872104645, "rewards/margins": 0.955273449420929, "rewards/rejected": -1.301171898841858, "step": 3075 }, { "epoch": 0.7929969104016478, "grad_norm": 540.0, "learning_rate": 1.035015447991761e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.4359374046325684, "logps/chosen": -247.5, "logps/rejected": -321.3999938964844, "loss": 0.5273, "rewards/accuracies": 0.625, "rewards/chosen": -0.19201049208641052, "rewards/margins": 0.7662109136581421, "rewards/rejected": -0.956835925579071, "step": 3080 }, { "epoch": 0.7942842430484037, "grad_norm": 324.0, "learning_rate": 1.0285787847579814e-07, "logits/chosen": -2.765625, "logits/rejected": -2.8218750953674316, "logps/chosen": -323.20001220703125, "logps/rejected": -346.3999938964844, "loss": 0.4957, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.376953125, "rewards/margins": 0.75, "rewards/rejected": -1.127832055091858, "step": 3085 }, { "epoch": 0.7955715756951597, 
"grad_norm": 462.0, "learning_rate": 1.0221421215242018e-07, "logits/chosen": -2.59375, "logits/rejected": -2.5687499046325684, "logps/chosen": -376.0, "logps/rejected": -441.20001220703125, "loss": 0.4629, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0929687023162842, "rewards/margins": 1.310156226158142, "rewards/rejected": -2.4046874046325684, "step": 3090 }, { "epoch": 0.7968589083419155, "grad_norm": 564.0, "learning_rate": 1.0157054582904221e-07, "logits/chosen": -2.546875, "logits/rejected": -2.229687452316284, "logps/chosen": -195.89999389648438, "logps/rejected": -215.33749389648438, "loss": 0.6258, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.39042967557907104, "rewards/margins": 0.3419433534145355, "rewards/rejected": -0.7343505620956421, "step": 3095 }, { "epoch": 0.7981462409886715, "grad_norm": 442.0, "learning_rate": 1.0092687950566426e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.706249952316284, "logps/chosen": -269.20001220703125, "logps/rejected": -319.79998779296875, "loss": 0.4104, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4318603575229645, "rewards/margins": 1.1794922351837158, "rewards/rejected": -1.6128418445587158, "step": 3100 }, { "epoch": 0.7994335736354274, "grad_norm": 596.0, "learning_rate": 1.002832131822863e-07, "logits/chosen": -2.3890624046325684, "logits/rejected": -2.2171874046325684, "logps/chosen": -277.79998779296875, "logps/rejected": -294.0, "loss": 0.5859, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.25773924589157104, "rewards/margins": 0.49980467557907104, "rewards/rejected": -0.7578125, "step": 3105 }, { "epoch": 0.8007209062821833, "grad_norm": 460.0, "learning_rate": 9.963954685890834e-08, "logits/chosen": -2.703125, "logits/rejected": -2.6812500953674316, "logps/chosen": -305.20001220703125, "logps/rejected": -362.3999938964844, "loss": 0.4887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.47998046875, 
"rewards/margins": 0.953906238079071, "rewards/rejected": -1.435937523841858, "step": 3110 }, { "epoch": 0.8020082389289392, "grad_norm": 434.0, "learning_rate": 9.899588053553037e-08, "logits/chosen": -2.385937452316284, "logits/rejected": -2.4390625953674316, "logps/chosen": -255.1999969482422, "logps/rejected": -290.6000061035156, "loss": 0.5152, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4632812440395355, "rewards/margins": 0.8323730230331421, "rewards/rejected": -1.296289086341858, "step": 3115 }, { "epoch": 0.8032955715756952, "grad_norm": 584.0, "learning_rate": 9.835221421215241e-08, "logits/chosen": -2.481250047683716, "logits/rejected": -2.401562452316284, "logps/chosen": -275.3999938964844, "logps/rejected": -332.79998779296875, "loss": 0.5652, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.706982433795929, "rewards/margins": 0.69921875, "rewards/rejected": -1.4074218273162842, "step": 3120 }, { "epoch": 0.804582904222451, "grad_norm": 468.0, "learning_rate": 9.770854788877446e-08, "logits/chosen": -2.518749952316284, "logits/rejected": -2.3843750953674316, "logps/chosen": -313.3999938964844, "logps/rejected": -379.0, "loss": 0.4574, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6871093511581421, "rewards/margins": 1.111328125, "rewards/rejected": -1.7957031726837158, "step": 3125 }, { "epoch": 0.805870236869207, "grad_norm": 478.0, "learning_rate": 9.70648815653965e-08, "logits/chosen": -2.421875, "logits/rejected": -2.465625047683716, "logps/chosen": -356.3999938964844, "logps/rejected": -383.79998779296875, "loss": 0.5086, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5225585699081421, "rewards/margins": 0.9521484375, "rewards/rejected": -1.472265601158142, "step": 3130 }, { "epoch": 0.807157569515963, "grad_norm": 592.0, "learning_rate": 9.642121524201853e-08, "logits/chosen": -2.512500047683716, "logits/rejected": -2.543750047683716, "logps/chosen": -348.3999938964844, 
"logps/rejected": -394.79998779296875, "loss": 0.5035, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.798828125, "rewards/margins": 0.907421886920929, "rewards/rejected": -1.706640601158142, "step": 3135 }, { "epoch": 0.8084449021627188, "grad_norm": 440.0, "learning_rate": 9.577754891864057e-08, "logits/chosen": -2.4124999046325684, "logits/rejected": -2.4703125953674316, "logps/chosen": -300.79998779296875, "logps/rejected": -372.6000061035156, "loss": 0.4711, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.598388671875, "rewards/margins": 1.077539086341858, "rewards/rejected": -1.673437476158142, "step": 3140 }, { "epoch": 0.8097322348094748, "grad_norm": 428.0, "learning_rate": 9.513388259526261e-08, "logits/chosen": -2.4781250953674316, "logits/rejected": -2.4749999046325684, "logps/chosen": -250.8000030517578, "logps/rejected": -273.3999938964844, "loss": 0.5793, "rewards/accuracies": 0.625, "rewards/chosen": -0.5591796636581421, "rewards/margins": 0.7269531488418579, "rewards/rejected": -1.286523461341858, "step": 3145 }, { "epoch": 0.8110195674562307, "grad_norm": 680.0, "learning_rate": 9.449021627188466e-08, "logits/chosen": -2.489062547683716, "logits/rejected": -2.409374952316284, "logps/chosen": -336.0, "logps/rejected": -288.79998779296875, "loss": 0.5586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.529614269733429, "rewards/margins": 0.6175781488418579, "rewards/rejected": -1.148046851158142, "step": 3150 }, { "epoch": 0.8123069001029866, "grad_norm": 288.0, "learning_rate": 9.384654994850669e-08, "logits/chosen": -2.465625047683716, "logits/rejected": -2.53125, "logps/chosen": -266.6000061035156, "logps/rejected": -309.20001220703125, "loss": 0.5352, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.47929686307907104, "rewards/margins": 0.8765624761581421, "rewards/rejected": -1.353906273841858, "step": 3155 }, { "epoch": 0.8135942327497425, "grad_norm": 334.0, "learning_rate": 
9.320288362512873e-08, "logits/chosen": -2.8062500953674316, "logits/rejected": -2.7718749046325684, "logps/chosen": -275.3999938964844, "logps/rejected": -386.79998779296875, "loss": 0.4254, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3759765625, "rewards/margins": 1.1960937976837158, "rewards/rejected": -1.5710937976837158, "step": 3160 }, { "epoch": 0.8148815653964985, "grad_norm": 552.0, "learning_rate": 9.255921730175077e-08, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.565624952316284, "logps/chosen": -306.79998779296875, "logps/rejected": -345.79998779296875, "loss": 0.5285, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.81640625, "rewards/margins": 0.7699218988418579, "rewards/rejected": -1.5859375, "step": 3165 }, { "epoch": 0.8161688980432544, "grad_norm": 572.0, "learning_rate": 9.191555097837281e-08, "logits/chosen": -2.737499952316284, "logits/rejected": -2.684375047683716, "logps/chosen": -300.6000061035156, "logps/rejected": -278.20001220703125, "loss": 0.5695, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4857421815395355, "rewards/margins": 0.6104491949081421, "rewards/rejected": -1.097265601158142, "step": 3170 }, { "epoch": 0.8174562306900103, "grad_norm": 428.0, "learning_rate": 9.127188465499484e-08, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.559375047683716, "logps/chosen": -334.79998779296875, "logps/rejected": -295.79998779296875, "loss": 0.4988, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.21533203125, "rewards/margins": 0.811718761920929, "rewards/rejected": -1.0281250476837158, "step": 3175 }, { "epoch": 0.8187435633367662, "grad_norm": 700.0, "learning_rate": 9.062821833161689e-08, "logits/chosen": -2.637500047683716, "logits/rejected": -2.5843749046325684, "logps/chosen": -313.3999938964844, "logps/rejected": -333.6000061035156, "loss": 0.6262, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.41484373807907104, 
"rewards/margins": 0.4610351622104645, "rewards/rejected": -0.876953125, "step": 3180 }, { "epoch": 0.8200308959835222, "grad_norm": 716.0, "learning_rate": 8.998455200823893e-08, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.645312547683716, "logps/chosen": -295.0, "logps/rejected": -275.1000061035156, "loss": 0.55, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.46464842557907104, "rewards/margins": 0.6865234375, "rewards/rejected": -1.150390625, "step": 3185 }, { "epoch": 0.821318228630278, "grad_norm": 492.0, "learning_rate": 8.934088568486097e-08, "logits/chosen": -2.7562499046325684, "logits/rejected": -2.840625047683716, "logps/chosen": -313.6000061035156, "logps/rejected": -301.3999938964844, "loss": 0.6742, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45917969942092896, "rewards/margins": 0.3165039122104645, "rewards/rejected": -0.7769531011581421, "step": 3190 }, { "epoch": 0.822605561277034, "grad_norm": 772.0, "learning_rate": 8.8697219361483e-08, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.737499952316284, "logps/chosen": -312.0, "logps/rejected": -320.1000061035156, "loss": 0.5969, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4517578184604645, "rewards/margins": 0.634448230266571, "rewards/rejected": -1.083642601966858, "step": 3195 }, { "epoch": 0.82389289392379, "grad_norm": 490.0, "learning_rate": 8.805355303810503e-08, "logits/chosen": -2.5843749046325684, "logits/rejected": -2.7093749046325684, "logps/chosen": -323.0, "logps/rejected": -347.29998779296875, "loss": 0.5441, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2583984434604645, "rewards/margins": 0.6957031488418579, "rewards/rejected": -0.955078125, "step": 3200 }, { "epoch": 0.8251802265705458, "grad_norm": 502.0, "learning_rate": 8.740988671472709e-08, "logits/chosen": -2.575000047683716, "logits/rejected": -2.53125, "logps/chosen": -275.3999938964844, "logps/rejected": -325.6000061035156, "loss": 0.5996, 
"rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.591796875, "rewards/margins": 0.7676757574081421, "rewards/rejected": -1.358984351158142, "step": 3205 }, { "epoch": 0.8264675592173018, "grad_norm": 536.0, "learning_rate": 8.676622039134912e-08, "logits/chosen": -2.8218750953674316, "logits/rejected": -2.8031249046325684, "logps/chosen": -385.79998779296875, "logps/rejected": -388.79998779296875, "loss": 0.4984, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.45683592557907104, "rewards/margins": 0.8558593988418579, "rewards/rejected": -1.3125, "step": 3210 }, { "epoch": 0.8277548918640577, "grad_norm": 632.0, "learning_rate": 8.612255406797116e-08, "logits/chosen": -2.5718750953674316, "logits/rejected": -2.426562547683716, "logps/chosen": -325.20001220703125, "logps/rejected": -327.3999938964844, "loss": 0.5998, "rewards/accuracies": 0.75, "rewards/chosen": -0.517260730266571, "rewards/margins": 0.816210925579071, "rewards/rejected": -1.333593726158142, "step": 3215 }, { "epoch": 0.8290422245108136, "grad_norm": 584.0, "learning_rate": 8.547888774459319e-08, "logits/chosen": -2.534374952316284, "logits/rejected": -2.450000047683716, "logps/chosen": -262.20001220703125, "logps/rejected": -246.8000030517578, "loss": 0.6008, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.38749998807907104, "rewards/margins": 0.3941406309604645, "rewards/rejected": -0.78125, "step": 3220 }, { "epoch": 0.8303295571575695, "grad_norm": 532.0, "learning_rate": 8.483522142121524e-08, "logits/chosen": -2.518749952316284, "logits/rejected": -2.465625047683716, "logps/chosen": -330.0, "logps/rejected": -378.3999938964844, "loss": 0.4707, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.711718738079071, "rewards/margins": 1.232421875, "rewards/rejected": -1.9460937976837158, "step": 3225 }, { "epoch": 0.8316168898043255, "grad_norm": 434.0, "learning_rate": 8.419155509783727e-08, "logits/chosen": -2.6187500953674316, "logits/rejected": 
-2.706249952316284, "logps/chosen": -325.3999938964844, "logps/rejected": -360.3999938964844, "loss": 0.452, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2567382752895355, "rewards/margins": 1.0285155773162842, "rewards/rejected": -1.2843749523162842, "step": 3230 }, { "epoch": 0.8329042224510813, "grad_norm": 410.0, "learning_rate": 8.354788877445932e-08, "logits/chosen": -2.621875047683716, "logits/rejected": -2.4593749046325684, "logps/chosen": -319.79998779296875, "logps/rejected": -374.3999938964844, "loss": 0.4875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6468750238418579, "rewards/margins": 1.1554687023162842, "rewards/rejected": -1.8039062023162842, "step": 3235 }, { "epoch": 0.8341915550978373, "grad_norm": 320.0, "learning_rate": 8.290422245108136e-08, "logits/chosen": -2.7593750953674316, "logits/rejected": -2.762500047683716, "logps/chosen": -274.5, "logps/rejected": -346.0, "loss": 0.4801, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.369873046875, "rewards/margins": 0.974609375, "rewards/rejected": -1.3429687023162842, "step": 3240 }, { "epoch": 0.8354788877445932, "grad_norm": 552.0, "learning_rate": 8.22605561277034e-08, "logits/chosen": -2.643749952316284, "logits/rejected": -2.606250047683716, "logps/chosen": -281.0, "logps/rejected": -341.6000061035156, "loss": 0.523, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.38164061307907104, "rewards/margins": 0.8089843988418579, "rewards/rejected": -1.1892578601837158, "step": 3245 }, { "epoch": 0.8367662203913491, "grad_norm": 340.0, "learning_rate": 8.161688980432543e-08, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.5875000953674316, "logps/chosen": -332.3999938964844, "logps/rejected": -342.6000061035156, "loss": 0.457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.746777355670929, "rewards/margins": 1.142724633216858, "rewards/rejected": -1.889257788658142, "step": 3250 }, { "epoch": 0.838053553038105, 
"grad_norm": 844.0, "learning_rate": 8.097322348094747e-08, "logits/chosen": -2.796875, "logits/rejected": -2.1527342796325684, "logps/chosen": -253.625, "logps/rejected": -271.20001220703125, "loss": 0.5449, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4339599609375, "rewards/margins": 0.609082043170929, "rewards/rejected": -1.04315185546875, "step": 3255 }, { "epoch": 0.839340885684861, "grad_norm": 314.0, "learning_rate": 8.032955715756952e-08, "logits/chosen": -2.700000047683716, "logits/rejected": -2.778125047683716, "logps/chosen": -322.3999938964844, "logps/rejected": -404.79998779296875, "loss": 0.5422, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.52734375, "rewards/margins": 0.789843738079071, "rewards/rejected": -1.316015601158142, "step": 3260 }, { "epoch": 0.8406282183316169, "grad_norm": 680.0, "learning_rate": 7.968589083419156e-08, "logits/chosen": -2.659374952316284, "logits/rejected": -2.606250047683716, "logps/chosen": -344.79998779296875, "logps/rejected": -382.79998779296875, "loss": 0.5, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8359375, "rewards/margins": 1.1042969226837158, "rewards/rejected": -1.942968726158142, "step": 3265 }, { "epoch": 0.8419155509783728, "grad_norm": 344.0, "learning_rate": 7.904222451081359e-08, "logits/chosen": -2.481250047683716, "logits/rejected": -2.6656250953674316, "logps/chosen": -341.6000061035156, "logps/rejected": -330.6000061035156, "loss": 0.6262, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.43388670682907104, "rewards/margins": 0.6234375238418579, "rewards/rejected": -1.0558593273162842, "step": 3270 }, { "epoch": 0.8432028836251287, "grad_norm": 338.0, "learning_rate": 7.839855818743563e-08, "logits/chosen": -2.59375, "logits/rejected": -2.6328125, "logps/chosen": -296.0, "logps/rejected": -303.3999938964844, "loss": 0.466, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2500976622104645, "rewards/margins": 
0.9297851324081421, "rewards/rejected": -1.1784179210662842, "step": 3275 }, { "epoch": 0.8444902162718847, "grad_norm": 720.0, "learning_rate": 7.775489186405767e-08, "logits/chosen": -2.440624952316284, "logits/rejected": -2.46875, "logps/chosen": -308.20001220703125, "logps/rejected": -303.6000061035156, "loss": 0.6039, "rewards/accuracies": 0.5625, "rewards/chosen": -0.44550782442092896, "rewards/margins": 0.61328125, "rewards/rejected": -1.0603516101837158, "step": 3280 }, { "epoch": 0.8457775489186405, "grad_norm": 352.0, "learning_rate": 7.711122554067972e-08, "logits/chosen": -2.640625, "logits/rejected": -2.5843749046325684, "logps/chosen": -300.6000061035156, "logps/rejected": -337.20001220703125, "loss": 0.6383, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.728759765625, "rewards/margins": 0.4326171875, "rewards/rejected": -1.16015625, "step": 3285 }, { "epoch": 0.8470648815653965, "grad_norm": 430.0, "learning_rate": 7.646755921730175e-08, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.6500000953674316, "logps/chosen": -340.0, "logps/rejected": -401.20001220703125, "loss": 0.5398, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4906249940395355, "rewards/margins": 0.752734363079071, "rewards/rejected": -1.242578148841858, "step": 3290 }, { "epoch": 0.8483522142121525, "grad_norm": 412.0, "learning_rate": 7.582389289392379e-08, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.690624952316284, "logps/chosen": -357.6000061035156, "logps/rejected": -459.20001220703125, "loss": 0.4469, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6089843511581421, "rewards/margins": 1.443750023841858, "rewards/rejected": -2.051562547683716, "step": 3295 }, { "epoch": 0.8496395468589083, "grad_norm": 692.0, "learning_rate": 7.518022657054583e-08, "logits/chosen": -2.512500047683716, "logits/rejected": -2.432812452316284, "logps/chosen": -301.6000061035156, "logps/rejected": -321.0, "loss": 0.5258, 
"rewards/accuracies": 0.625, "rewards/chosen": -0.33037108182907104, "rewards/margins": 0.65234375, "rewards/rejected": -0.9828125238418579, "step": 3300 }, { "epoch": 0.8509268795056643, "grad_norm": 229.0, "learning_rate": 7.453656024716787e-08, "logits/chosen": -2.9156250953674316, "logits/rejected": -2.871875047683716, "logps/chosen": -329.3999938964844, "logps/rejected": -366.0, "loss": 0.6109, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.16280516982078552, "rewards/margins": 0.48515623807907104, "rewards/rejected": -0.6494140625, "step": 3305 }, { "epoch": 0.8522142121524202, "grad_norm": 490.0, "learning_rate": 7.38928939237899e-08, "logits/chosen": -2.762500047683716, "logits/rejected": -2.7906250953674316, "logps/chosen": -376.3999938964844, "logps/rejected": -342.79998779296875, "loss": 0.5414, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.31464844942092896, "rewards/margins": 0.642382800579071, "rewards/rejected": -0.9560546875, "step": 3310 }, { "epoch": 0.8535015447991761, "grad_norm": 442.0, "learning_rate": 7.324922760041195e-08, "logits/chosen": -2.721874952316284, "logits/rejected": -2.78125, "logps/chosen": -318.20001220703125, "logps/rejected": -380.79998779296875, "loss": 0.5109, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.25761717557907104, "rewards/margins": 0.787890613079071, "rewards/rejected": -1.044531226158142, "step": 3315 }, { "epoch": 0.854788877445932, "grad_norm": 560.0, "learning_rate": 7.260556127703399e-08, "logits/chosen": -2.4749999046325684, "logits/rejected": -2.471874952316284, "logps/chosen": -308.8999938964844, "logps/rejected": -384.0, "loss": 0.4031, "rewards/accuracies": 0.75, "rewards/chosen": -0.8011718988418579, "rewards/margins": 1.4304687976837158, "rewards/rejected": -2.231250047683716, "step": 3320 }, { "epoch": 0.856076210092688, "grad_norm": 264.0, "learning_rate": 7.196189495365603e-08, "logits/chosen": -2.3968749046325684, "logits/rejected": 
-2.4000000953674316, "logps/chosen": -325.20001220703125, "logps/rejected": -447.20001220703125, "loss": 0.3531, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9593750238418579, "rewards/margins": 1.662500023841858, "rewards/rejected": -2.6156249046325684, "step": 3325 }, { "epoch": 0.8573635427394438, "grad_norm": 652.0, "learning_rate": 7.131822863027806e-08, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.5093750953674316, "logps/chosen": -365.6000061035156, "logps/rejected": -418.20001220703125, "loss": 0.4734, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9398437738418579, "rewards/margins": 1.2664062976837158, "rewards/rejected": -2.2085938453674316, "step": 3330 }, { "epoch": 0.8586508753861998, "grad_norm": 816.0, "learning_rate": 7.067456230690009e-08, "logits/chosen": -2.53125, "logits/rejected": -2.59375, "logps/chosen": -255.0, "logps/rejected": -292.3999938964844, "loss": 0.6297, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.72601318359375, "rewards/margins": 0.576123058795929, "rewards/rejected": -1.3046875, "step": 3335 }, { "epoch": 0.8599382080329557, "grad_norm": 310.0, "learning_rate": 7.003089598352215e-08, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.721874952316284, "logps/chosen": -297.79998779296875, "logps/rejected": -300.0, "loss": 0.5723, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3033203184604645, "rewards/margins": 0.5337890386581421, "rewards/rejected": -0.835156261920929, "step": 3340 }, { "epoch": 0.8612255406797117, "grad_norm": 524.0, "learning_rate": 6.938722966014417e-08, "logits/chosen": -2.5625, "logits/rejected": -2.690624952316284, "logps/chosen": -313.6000061035156, "logps/rejected": -329.6000061035156, "loss": 0.4492, "rewards/accuracies": 0.6875, "rewards/chosen": -0.60546875, "rewards/margins": 1.070703148841858, "rewards/rejected": -1.6755859851837158, "step": 3345 }, { "epoch": 0.8625128733264675, "grad_norm": 494.0, 
"learning_rate": 6.874356333676622e-08, "logits/chosen": -2.734375, "logits/rejected": -2.5999999046325684, "logps/chosen": -354.0, "logps/rejected": -353.0, "loss": 0.5992, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.35161131620407104, "rewards/margins": 0.5796874761581421, "rewards/rejected": -0.9320312738418579, "step": 3350 }, { "epoch": 0.8638002059732235, "grad_norm": 496.0, "learning_rate": 6.809989701338825e-08, "logits/chosen": -2.528125047683716, "logits/rejected": -2.526562452316284, "logps/chosen": -342.3999938964844, "logps/rejected": -410.79998779296875, "loss": 0.4324, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5323241949081421, "rewards/margins": 1.2374999523162842, "rewards/rejected": -1.7726562023162842, "step": 3355 }, { "epoch": 0.8650875386199794, "grad_norm": 572.0, "learning_rate": 6.74562306900103e-08, "logits/chosen": NaN, "logits/rejected": -2.262500047683716, "logps/chosen": -293.79998779296875, "logps/rejected": -312.5, "loss": 0.5734, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.3812499940395355, "rewards/margins": 0.7123047113418579, "rewards/rejected": -1.0935547351837158, "step": 3360 }, { "epoch": 0.8663748712667353, "grad_norm": 330.0, "learning_rate": 6.681256436663233e-08, "logits/chosen": -2.590625047683716, "logits/rejected": -2.4281249046325684, "logps/chosen": -314.20001220703125, "logps/rejected": -358.79998779296875, "loss": 0.3666, "rewards/accuracies": 0.8125, "rewards/chosen": -0.41132813692092896, "rewards/margins": 1.4484374523162842, "rewards/rejected": -1.859375, "step": 3365 }, { "epoch": 0.8676622039134912, "grad_norm": 454.0, "learning_rate": 6.616889804325438e-08, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.746875047683716, "logps/chosen": -322.20001220703125, "logps/rejected": -262.5, "loss": 0.5984, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.25859373807907104, "rewards/margins": 0.3597656190395355, "rewards/rejected": 
-0.619458019733429, "step": 3370 }, { "epoch": 0.8689495365602472, "grad_norm": 412.0, "learning_rate": 6.55252317198764e-08, "logits/chosen": -2.684375047683716, "logits/rejected": -2.456249952316284, "logps/chosen": -303.3999938964844, "logps/rejected": -320.0, "loss": 0.5176, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4159179627895355, "rewards/margins": 0.7320312261581421, "rewards/rejected": -1.149999976158142, "step": 3375 }, { "epoch": 0.870236869207003, "grad_norm": 438.0, "learning_rate": 6.488156539649846e-08, "logits/chosen": -2.703125, "logits/rejected": -2.762500047683716, "logps/chosen": -323.6000061035156, "logps/rejected": -290.6000061035156, "loss": 0.5957, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.37835693359375, "rewards/margins": 0.43212890625, "rewards/rejected": -0.810546875, "step": 3380 }, { "epoch": 0.871524201853759, "grad_norm": 402.0, "learning_rate": 6.423789907312049e-08, "logits/chosen": -2.590625047683716, "logits/rejected": -2.450000047683716, "logps/chosen": -310.6000061035156, "logps/rejected": -367.20001220703125, "loss": 0.4828, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.643750011920929, "rewards/margins": 1.053125023841858, "rewards/rejected": -1.697656273841858, "step": 3385 }, { "epoch": 0.872811534500515, "grad_norm": 684.0, "learning_rate": 6.359423274974253e-08, "logits/chosen": -2.2874999046325684, "logits/rejected": -2.4937500953674316, "logps/chosen": -309.3999938964844, "logps/rejected": -353.3999938964844, "loss": 0.443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7281249761581421, "rewards/margins": 1.1203124523162842, "rewards/rejected": -1.8484375476837158, "step": 3390 }, { "epoch": 0.8740988671472708, "grad_norm": 446.0, "learning_rate": 6.295056642636456e-08, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.682812452316284, "logps/chosen": -386.79998779296875, "logps/rejected": -361.20001220703125, "loss": 0.5758, "rewards/accuracies": 
0.625, "rewards/chosen": -0.16542968153953552, "rewards/margins": 0.5439453125, "rewards/rejected": -0.7095702886581421, "step": 3395 }, { "epoch": 0.8753861997940268, "grad_norm": 668.0, "learning_rate": 6.23069001029866e-08, "logits/chosen": -2.512500047683716, "logits/rejected": -1.9656250476837158, "logps/chosen": -285.79998779296875, "logps/rejected": -326.3999938964844, "loss": 0.5051, "rewards/accuracies": 0.625, "rewards/chosen": -0.607226550579071, "rewards/margins": 0.981640636920929, "rewards/rejected": -1.5880858898162842, "step": 3400 }, { "epoch": 0.8766735324407827, "grad_norm": 422.0, "learning_rate": 6.166323377960865e-08, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.59375, "logps/chosen": -365.0, "logps/rejected": -376.79998779296875, "loss": 0.5684, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4847168028354645, "rewards/margins": 0.641406238079071, "rewards/rejected": -1.1257812976837158, "step": 3405 }, { "epoch": 0.8779608650875386, "grad_norm": 404.0, "learning_rate": 6.101956745623069e-08, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.6937499046325684, "logps/chosen": -337.0, "logps/rejected": -364.6000061035156, "loss": 0.4977, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.74169921875, "rewards/margins": 1.1496093273162842, "rewards/rejected": -1.893164038658142, "step": 3410 }, { "epoch": 0.8792481977342945, "grad_norm": 300.0, "learning_rate": 6.037590113285273e-08, "logits/chosen": -2.6937499046325684, "logits/rejected": -2.387500047683716, "logps/chosen": -237.8000030517578, "logps/rejected": -279.79998779296875, "loss": 0.607, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.33818358182907104, "rewards/margins": 0.47187501192092896, "rewards/rejected": -0.8101562261581421, "step": 3415 }, { "epoch": 0.8805355303810505, "grad_norm": 480.0, "learning_rate": 5.973223480947476e-08, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.125, "logps/chosen": 
-253.39999389648438, "logps/rejected": -263.6000061035156, "loss": 0.5586, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.01186523400247097, "rewards/margins": 0.4791015684604645, "rewards/rejected": -0.4671386778354645, "step": 3420 }, { "epoch": 0.8818228630278064, "grad_norm": 360.0, "learning_rate": 5.9088568486096805e-08, "logits/chosen": -2.609375, "logits/rejected": -2.495312452316284, "logps/chosen": -355.6000061035156, "logps/rejected": -312.6000061035156, "loss": 0.5645, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4515624940395355, "rewards/margins": 0.56585693359375, "rewards/rejected": -1.017578125, "step": 3425 }, { "epoch": 0.8831101956745623, "grad_norm": 456.0, "learning_rate": 5.844490216271884e-08, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.390625, "logps/chosen": -353.6000061035156, "logps/rejected": -407.6000061035156, "loss": 0.4283, "rewards/accuracies": 0.75, "rewards/chosen": -0.596875011920929, "rewards/margins": 1.265234351158142, "rewards/rejected": -1.859765648841858, "step": 3430 }, { "epoch": 0.8843975283213182, "grad_norm": 290.0, "learning_rate": 5.7801235839340884e-08, "logits/chosen": -2.2906250953674316, "logits/rejected": -2.534374952316284, "logps/chosen": -219.8000030517578, "logps/rejected": -204.5500030517578, "loss": 0.6094, "rewards/accuracies": 0.375, "rewards/chosen": 0.05645751953125, "rewards/margins": 0.3206543028354645, "rewards/rejected": -0.2640624940395355, "step": 3435 }, { "epoch": 0.8856848609680742, "grad_norm": 370.0, "learning_rate": 5.715756951596292e-08, "logits/chosen": -2.565624952316284, "logits/rejected": -2.6812500953674316, "logps/chosen": -264.1000061035156, "logps/rejected": -333.1000061035156, "loss": 0.4611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22299805283546448, "rewards/margins": 1.0242187976837158, "rewards/rejected": -1.244421362876892, "step": 3440 }, { "epoch": 0.88697219361483, "grad_norm": 560.0, "learning_rate": 
5.651390319258496e-08, "logits/chosen": -2.4156250953674316, "logits/rejected": -2.6312499046325684, "logps/chosen": -264.79998779296875, "logps/rejected": -371.0, "loss": 0.4926, "rewards/accuracies": 0.625, "rewards/chosen": -0.37626951932907104, "rewards/margins": 1.001562476158142, "rewards/rejected": -1.3796875476837158, "step": 3445 }, { "epoch": 0.888259526261586, "grad_norm": 502.0, "learning_rate": 5.5870236869207e-08, "logits/chosen": -2.578125, "logits/rejected": -2.753124952316284, "logps/chosen": -279.20001220703125, "logps/rejected": -237.0, "loss": 0.5219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1708984375, "rewards/margins": 0.680468738079071, "rewards/rejected": -0.8515625, "step": 3450 }, { "epoch": 0.889546858908342, "grad_norm": 668.0, "learning_rate": 5.522657054582904e-08, "logits/chosen": -2.53125, "logits/rejected": -2.5843749046325684, "logps/chosen": -299.20001220703125, "logps/rejected": -353.20001220703125, "loss": 0.5312, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5960937738418579, "rewards/margins": 0.903515636920929, "rewards/rejected": -1.4992187023162842, "step": 3455 }, { "epoch": 0.8908341915550978, "grad_norm": 572.0, "learning_rate": 5.458290422245108e-08, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.640625, "logps/chosen": -362.79998779296875, "logps/rejected": -358.0, "loss": 0.5668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.78125, "rewards/margins": 0.67578125, "rewards/rejected": -1.4562499523162842, "step": 3460 }, { "epoch": 0.8921215242018538, "grad_norm": 165.0, "learning_rate": 5.393923789907312e-08, "logits/chosen": -2.746875047683716, "logits/rejected": -2.612499952316284, "logps/chosen": -230.6999969482422, "logps/rejected": -299.70001220703125, "loss": 0.5719, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.380126953125, "rewards/margins": 0.651171863079071, "rewards/rejected": -1.033593773841858, "step": 3465 }, { "epoch": 
0.8934088568486097, "grad_norm": 390.0, "learning_rate": 5.3295571575695156e-08, "logits/chosen": -2.503124952316284, "logits/rejected": -2.424999952316284, "logps/chosen": -273.79998779296875, "logps/rejected": -341.79998779296875, "loss": 0.4508, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5600219964981079, "rewards/margins": 1.233984351158142, "rewards/rejected": -1.795312523841858, "step": 3470 }, { "epoch": 0.8946961894953656, "grad_norm": 434.0, "learning_rate": 5.26519052523172e-08, "logits/chosen": -2.609375, "logits/rejected": -2.5250000953674316, "logps/chosen": -337.0, "logps/rejected": -362.3999938964844, "loss": 0.5215, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3216796815395355, "rewards/margins": 0.727734386920929, "rewards/rejected": -1.0476562976837158, "step": 3475 }, { "epoch": 0.8959835221421215, "grad_norm": 552.0, "learning_rate": 5.2008238928939235e-08, "logits/chosen": -2.653125047683716, "logits/rejected": -2.5062499046325684, "logps/chosen": -323.6000061035156, "logps/rejected": -353.6000061035156, "loss": 0.5652, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.47099608182907104, "rewards/margins": 0.646484375, "rewards/rejected": -1.116796851158142, "step": 3480 }, { "epoch": 0.8972708547888775, "grad_norm": 524.0, "learning_rate": 5.136457260556128e-08, "logits/chosen": -2.421875, "logits/rejected": -2.520312547683716, "logps/chosen": -282.3999938964844, "logps/rejected": -328.79998779296875, "loss": 0.4863, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.31982421875, "rewards/margins": 1.0285155773162842, "rewards/rejected": -1.34765625, "step": 3485 }, { "epoch": 0.8985581874356333, "grad_norm": 576.0, "learning_rate": 5.0720906282183313e-08, "logits/chosen": -2.3125, "logits/rejected": -2.4000000953674316, "logps/chosen": -325.20001220703125, "logps/rejected": -407.6000061035156, "loss": 0.4645, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9027343988418579, 
"rewards/margins": 1.27734375, "rewards/rejected": -2.1851563453674316, "step": 3490 }, { "epoch": 0.8998455200823893, "grad_norm": 446.0, "learning_rate": 5.0077239958805356e-08, "logits/chosen": -2.8187499046325684, "logits/rejected": -2.825000047683716, "logps/chosen": -364.79998779296875, "logps/rejected": -336.0, "loss": 0.5207, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13825683295726776, "rewards/margins": 0.8492187261581421, "rewards/rejected": -0.986132800579071, "step": 3495 }, { "epoch": 0.9011328527291452, "grad_norm": 556.0, "learning_rate": 4.943357363542739e-08, "logits/chosen": -2.5374999046325684, "logits/rejected": -2.421875, "logps/chosen": -312.0, "logps/rejected": -339.20001220703125, "loss": 0.5859, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4359374940395355, "rewards/margins": 0.88671875, "rewards/rejected": -1.321874976158142, "step": 3500 }, { "epoch": 0.9024201853759012, "grad_norm": 414.0, "learning_rate": 4.8789907312049435e-08, "logits/chosen": -2.731250047683716, "logits/rejected": -2.668750047683716, "logps/chosen": -352.79998779296875, "logps/rejected": -397.6000061035156, "loss": 0.4359, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3645996153354645, "rewards/margins": 1.1667969226837158, "rewards/rejected": -1.5304687023162842, "step": 3505 }, { "epoch": 0.903707518022657, "grad_norm": 484.0, "learning_rate": 4.814624098867147e-08, "logits/chosen": -2.534374952316284, "logits/rejected": -2.737499952316284, "logps/chosen": -285.29998779296875, "logps/rejected": -318.20001220703125, "loss": 0.5633, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.3798828125, "rewards/margins": 0.7743164300918579, "rewards/rejected": -1.1555664539337158, "step": 3510 }, { "epoch": 0.904994850669413, "grad_norm": 504.0, "learning_rate": 4.7502574665293514e-08, "logits/chosen": -2.612499952316284, "logits/rejected": -2.7093749046325684, "logps/chosen": -369.6000061035156, 
"logps/rejected": -413.6000061035156, "loss": 0.5461, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.905468761920929, "rewards/margins": 1.080078125, "rewards/rejected": -1.9890625476837158, "step": 3515 }, { "epoch": 0.9062821833161689, "grad_norm": 644.0, "learning_rate": 4.685890834191555e-08, "logits/chosen": -2.5062499046325684, "logits/rejected": -2.426562547683716, "logps/chosen": -307.20001220703125, "logps/rejected": -408.0, "loss": 0.3293, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6915038824081421, "rewards/margins": 1.6925780773162842, "rewards/rejected": -2.3832030296325684, "step": 3520 }, { "epoch": 0.9075695159629248, "grad_norm": 1080.0, "learning_rate": 4.621524201853759e-08, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.659374952316284, "logps/chosen": -338.6000061035156, "logps/rejected": -400.0, "loss": 0.525, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.794726550579071, "rewards/margins": 0.857714831829071, "rewards/rejected": -1.6570312976837158, "step": 3525 }, { "epoch": 0.9088568486096807, "grad_norm": 406.0, "learning_rate": 4.557157569515963e-08, "logits/chosen": -2.671875, "logits/rejected": -2.700000047683716, "logps/chosen": -339.20001220703125, "logps/rejected": -375.3999938964844, "loss": 0.4844, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5152343511581421, "rewards/margins": 1.0001952648162842, "rewards/rejected": -1.5164062976837158, "step": 3530 }, { "epoch": 0.9101441812564367, "grad_norm": 628.0, "learning_rate": 4.492790937178167e-08, "logits/chosen": -2.5, "logits/rejected": -2.4937500953674316, "logps/chosen": -281.3999938964844, "logps/rejected": -368.79998779296875, "loss": 0.457, "rewards/accuracies": 0.6875, "rewards/chosen": -0.956250011920929, "rewards/margins": 1.2097656726837158, "rewards/rejected": -2.169140577316284, "step": 3535 }, { "epoch": 0.9114315139031925, "grad_norm": 268.0, "learning_rate": 4.42842430484037e-08, "logits/chosen": 
-2.53125, "logits/rejected": -2.5406250953674316, "logps/chosen": -308.3999938964844, "logps/rejected": -336.79998779296875, "loss": 0.5168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.33984375, "rewards/margins": 0.814892590045929, "rewards/rejected": -1.1570312976837158, "step": 3540 }, { "epoch": 0.9127188465499485, "grad_norm": 760.0, "learning_rate": 4.364057672502574e-08, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.721874952316284, "logps/chosen": -374.79998779296875, "logps/rejected": -420.79998779296875, "loss": 0.5172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.555468738079071, "rewards/margins": 0.9007812738418579, "rewards/rejected": -1.455468773841858, "step": 3545 }, { "epoch": 0.9140061791967045, "grad_norm": 426.0, "learning_rate": 4.299691040164778e-08, "logits/chosen": -2.6875, "logits/rejected": -2.706249952316284, "logps/chosen": -342.0, "logps/rejected": -384.0, "loss": 0.5969, "rewards/accuracies": 0.625, "rewards/chosen": -0.5386718511581421, "rewards/margins": 0.5425781011581421, "rewards/rejected": -1.0828125476837158, "step": 3550 }, { "epoch": 0.9152935118434603, "grad_norm": 512.0, "learning_rate": 4.235324407826982e-08, "logits/chosen": -2.559375047683716, "logits/rejected": -2.612499952316284, "logps/chosen": -314.3999938964844, "logps/rejected": -378.3999938964844, "loss": 0.5086, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6885010004043579, "rewards/margins": 0.792187511920929, "rewards/rejected": -1.478906273841858, "step": 3555 }, { "epoch": 0.9165808444902163, "grad_norm": 318.0, "learning_rate": 4.170957775489186e-08, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.565624952316284, "logps/chosen": -345.0, "logps/rejected": -368.6000061035156, "loss": 0.4961, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7761474847793579, "rewards/margins": 1.024999976158142, "rewards/rejected": -1.802734375, "step": 3560 }, { "epoch": 0.9178681771369722, 
"grad_norm": 195.0, "learning_rate": 4.10659114315139e-08, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.428906202316284, "logps/chosen": -289.0, "logps/rejected": -316.6000061035156, "loss": 0.5742, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.20498046278953552, "rewards/margins": 0.4892578125, "rewards/rejected": -0.694531261920929, "step": 3565 }, { "epoch": 0.9191555097837281, "grad_norm": 720.0, "learning_rate": 4.042224510813594e-08, "logits/chosen": -2.403125047683716, "logits/rejected": -2.1015625, "logps/chosen": -294.8999938964844, "logps/rejected": -308.3500061035156, "loss": 0.5848, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5630859136581421, "rewards/margins": 0.624218761920929, "rewards/rejected": -1.187524437904358, "step": 3570 }, { "epoch": 0.920442842430484, "grad_norm": 636.0, "learning_rate": 3.977857878475798e-08, "logits/chosen": -2.543750047683716, "logits/rejected": -2.487499952316284, "logps/chosen": -333.20001220703125, "logps/rejected": -305.79998779296875, "loss": 0.5496, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.745800793170929, "rewards/margins": 0.73095703125, "rewards/rejected": -1.4783203601837158, "step": 3575 }, { "epoch": 0.92173017507724, "grad_norm": 864.0, "learning_rate": 3.9134912461380015e-08, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.6781249046325684, "logps/chosen": -293.3999938964844, "logps/rejected": -359.6000061035156, "loss": 0.4988, "rewards/accuracies": 0.6875, "rewards/chosen": -0.43437498807907104, "rewards/margins": 0.7623046636581421, "rewards/rejected": -1.1953125, "step": 3580 }, { "epoch": 0.9230175077239959, "grad_norm": 472.0, "learning_rate": 3.849124613800206e-08, "logits/chosen": -2.668750047683716, "logits/rejected": -2.78125, "logps/chosen": -342.3999938964844, "logps/rejected": -312.6000061035156, "loss": 0.5145, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.28632813692092896, "rewards/margins": 
0.746874988079071, "rewards/rejected": -1.032812476158142, "step": 3585 }, { "epoch": 0.9243048403707518, "grad_norm": 270.0, "learning_rate": 3.7847579814624094e-08, "logits/chosen": -2.7750000953674316, "logits/rejected": -2.5218749046325684, "logps/chosen": -267.0, "logps/rejected": -256.70001220703125, "loss": 0.5859, "rewards/accuracies": 0.5, "rewards/chosen": -0.21523436903953552, "rewards/margins": 0.511914074420929, "rewards/rejected": -0.727246105670929, "step": 3590 }, { "epoch": 0.9255921730175077, "grad_norm": 328.0, "learning_rate": 3.720391349124614e-08, "logits/chosen": -2.609375, "logits/rejected": -2.3812499046325684, "logps/chosen": -272.5, "logps/rejected": -336.20001220703125, "loss": 0.4762, "rewards/accuracies": 0.625, "rewards/chosen": -0.5213867425918579, "rewards/margins": 0.917675793170929, "rewards/rejected": -1.4388306140899658, "step": 3595 }, { "epoch": 0.9268795056642637, "grad_norm": 328.0, "learning_rate": 3.656024716786817e-08, "logits/chosen": -2.734375, "logits/rejected": -2.6078124046325684, "logps/chosen": -256.20001220703125, "logps/rejected": -300.6000061035156, "loss": 0.4941, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18585205078125, "rewards/margins": 0.879687488079071, "rewards/rejected": -1.0652344226837158, "step": 3600 }, { "epoch": 0.9281668383110195, "grad_norm": 496.0, "learning_rate": 3.5916580844490216e-08, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.311718702316284, "logps/chosen": -286.20001220703125, "logps/rejected": -340.0, "loss": 0.4977, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4920715391635895, "rewards/margins": 0.8843749761581421, "rewards/rejected": -1.3781249523162842, "step": 3605 }, { "epoch": 0.9294541709577755, "grad_norm": 568.0, "learning_rate": 3.527291452111225e-08, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.546875, "logps/chosen": -322.0, "logps/rejected": -372.3999938964844, "loss": 0.4193, "rewards/accuracies": 0.75, 
"rewards/chosen": -0.8296874761581421, "rewards/margins": 1.219140648841858, "rewards/rejected": -2.049999952316284, "step": 3610 }, { "epoch": 0.9307415036045315, "grad_norm": 1088.0, "learning_rate": 3.4629248197734294e-08, "logits/chosen": -2.590625047683716, "logits/rejected": -2.6265625953674316, "logps/chosen": -373.6000061035156, "logps/rejected": -393.20001220703125, "loss": 0.4504, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4664550721645355, "rewards/margins": 1.01171875, "rewards/rejected": -1.4773437976837158, "step": 3615 }, { "epoch": 0.9320288362512873, "grad_norm": 223.0, "learning_rate": 3.398558187435633e-08, "logits/chosen": -2.762500047683716, "logits/rejected": -2.6781249046325684, "logps/chosen": -273.3999938964844, "logps/rejected": -277.5, "loss": 0.5281, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.38232421875, "rewards/margins": 0.779492199420929, "rewards/rejected": -1.1612701416015625, "step": 3620 }, { "epoch": 0.9333161688980433, "grad_norm": 808.0, "learning_rate": 3.334191555097837e-08, "logits/chosen": -2.575000047683716, "logits/rejected": -2.578125, "logps/chosen": -300.79998779296875, "logps/rejected": -300.20001220703125, "loss": 0.5379, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4442382752895355, "rewards/margins": 0.73828125, "rewards/rejected": -1.1808593273162842, "step": 3625 }, { "epoch": 0.9346035015447992, "grad_norm": 272.0, "learning_rate": 3.269824922760041e-08, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.487499952316284, "logps/chosen": -315.20001220703125, "logps/rejected": -391.6000061035156, "loss": 0.4953, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4637695252895355, "rewards/margins": 0.8089843988418579, "rewards/rejected": -1.2726562023162842, "step": 3630 }, { "epoch": 0.935890834191555, "grad_norm": 496.0, "learning_rate": 3.205458290422245e-08, "logits/chosen": -2.578125, "logits/rejected": -2.542187452316284, 
"logps/chosen": -310.6000061035156, "logps/rejected": -321.6000061035156, "loss": 0.6602, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6539062261581421, "rewards/margins": 0.5435791015625, "rewards/rejected": -1.195703148841858, "step": 3635 }, { "epoch": 0.937178166838311, "grad_norm": 270.0, "learning_rate": 3.141091658084449e-08, "logits/chosen": -2.7249999046325684, "logits/rejected": -2.78125, "logps/chosen": -315.79998779296875, "logps/rejected": -294.20001220703125, "loss": 0.6453, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.2835937440395355, "rewards/margins": 0.3606933653354645, "rewards/rejected": -0.6435302495956421, "step": 3640 }, { "epoch": 0.938465499485067, "grad_norm": 450.0, "learning_rate": 3.076725025746653e-08, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.659374952316284, "logps/chosen": -391.20001220703125, "logps/rejected": -361.20001220703125, "loss": 0.5797, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.669921875, "rewards/margins": 0.6898437738418579, "rewards/rejected": -1.359277367591858, "step": 3645 }, { "epoch": 0.9397528321318228, "grad_norm": 788.0, "learning_rate": 3.0123583934088567e-08, "logits/chosen": -2.2874999046325684, "logits/rejected": -2.659374952316284, "logps/chosen": -338.20001220703125, "logps/rejected": -328.0, "loss": 0.5598, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.33002930879592896, "rewards/margins": 0.602832019329071, "rewards/rejected": -0.9320312738418579, "step": 3650 }, { "epoch": 0.9410401647785788, "grad_norm": 556.0, "learning_rate": 2.9479917610710606e-08, "logits/chosen": -2.7249999046325684, "logits/rejected": -2.793750047683716, "logps/chosen": -330.79998779296875, "logps/rejected": -369.20001220703125, "loss": 0.4762, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2914062440395355, "rewards/margins": 1.025390625, "rewards/rejected": -1.3156249523162842, "step": 3655 }, { "epoch": 
0.9423274974253347, "grad_norm": 600.0, "learning_rate": 2.8836251287332645e-08, "logits/chosen": -2.7718749046325684, "logits/rejected": -2.753124952316284, "logps/chosen": -323.6000061035156, "logps/rejected": -345.6000061035156, "loss": 0.4723, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2566894590854645, "rewards/margins": 0.8671875, "rewards/rejected": -1.123437523841858, "step": 3660 }, { "epoch": 0.9436148300720907, "grad_norm": 372.0, "learning_rate": 2.8192584963954685e-08, "logits/chosen": -2.7406249046325684, "logits/rejected": -2.84375, "logps/chosen": -294.3999938964844, "logps/rejected": -341.6000061035156, "loss": 0.5127, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3822265565395355, "rewards/margins": 0.7890625, "rewards/rejected": -1.169531226158142, "step": 3665 }, { "epoch": 0.9449021627188465, "grad_norm": 700.0, "learning_rate": 2.7548918640576724e-08, "logits/chosen": -2.512500047683716, "logits/rejected": -2.612499952316284, "logps/chosen": -281.6000061035156, "logps/rejected": -261.0, "loss": 0.5871, "rewards/accuracies": 0.625, "rewards/chosen": -0.388916015625, "rewards/margins": 0.5322265625, "rewards/rejected": -0.918749988079071, "step": 3670 }, { "epoch": 0.9461894953656025, "grad_norm": 466.0, "learning_rate": 2.6905252317198764e-08, "logits/chosen": -2.628124952316284, "logits/rejected": -2.7281250953674316, "logps/chosen": -250.60000610351562, "logps/rejected": -210.89999389648438, "loss": 0.6004, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.15556640923023224, "rewards/margins": 0.4253906309604645, "rewards/rejected": -0.581835925579071, "step": 3675 }, { "epoch": 0.9474768280123584, "grad_norm": 588.0, "learning_rate": 2.6261585993820803e-08, "logits/chosen": -2.518749952316284, "logits/rejected": -2.417187452316284, "logps/chosen": -271.1000061035156, "logps/rejected": -291.0, "loss": 0.557, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4496215879917145, 
"rewards/margins": 0.6724609136581421, "rewards/rejected": -1.1222655773162842, "step": 3680 }, { "epoch": 0.9487641606591143, "grad_norm": 604.0, "learning_rate": 2.5617919670442842e-08, "logits/chosen": -2.4781250953674316, "logits/rejected": -2.354687452316284, "logps/chosen": -266.0, "logps/rejected": -333.3999938964844, "loss": 0.5578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3202148377895355, "rewards/margins": 0.764086902141571, "rewards/rejected": -1.08447265625, "step": 3685 }, { "epoch": 0.9500514933058702, "grad_norm": 912.0, "learning_rate": 2.497425334706488e-08, "logits/chosen": -2.4437499046325684, "logits/rejected": -2.5843749046325684, "logps/chosen": -325.0, "logps/rejected": -335.6000061035156, "loss": 0.552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.562695324420929, "rewards/margins": 0.864453136920929, "rewards/rejected": -1.4269530773162842, "step": 3690 }, { "epoch": 0.9513388259526262, "grad_norm": 752.0, "learning_rate": 2.433058702368692e-08, "logits/chosen": -2.4781250953674316, "logits/rejected": -2.4312500953674316, "logps/chosen": -333.20001220703125, "logps/rejected": -375.20001220703125, "loss": 0.5367, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9742187261581421, "rewards/margins": 0.975781261920929, "rewards/rejected": -1.9484374523162842, "step": 3695 }, { "epoch": 0.952626158599382, "grad_norm": 656.0, "learning_rate": 2.368692070030896e-08, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.6624999046325684, "logps/chosen": -305.5, "logps/rejected": -314.3999938964844, "loss": 0.5316, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.16616210341453552, "rewards/margins": 0.6703125238418579, "rewards/rejected": -0.8355468511581421, "step": 3700 }, { "epoch": 0.953913491246138, "grad_norm": 338.0, "learning_rate": 2.3043254376931e-08, "logits/chosen": -2.6875, "logits/rejected": -2.6937499046325684, "logps/chosen": -343.20001220703125, "logps/rejected": 
-287.20001220703125, "loss": 0.6164, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.43107908964157104, "rewards/margins": 0.53955078125, "rewards/rejected": -0.9710937738418579, "step": 3705 }, { "epoch": 0.955200823892894, "grad_norm": 414.0, "learning_rate": 2.239958805355304e-08, "logits/chosen": -2.512500047683716, "logits/rejected": -2.734375, "logps/chosen": -265.6000061035156, "logps/rejected": -262.6000061035156, "loss": 0.5871, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.545458972454071, "rewards/margins": 0.4951171875, "rewards/rejected": -1.040429711341858, "step": 3710 }, { "epoch": 0.9564881565396498, "grad_norm": 280.0, "learning_rate": 2.1755921730175075e-08, "logits/chosen": -2.496875047683716, "logits/rejected": -2.6343750953674316, "logps/chosen": -272.8999938964844, "logps/rejected": -254.3000030517578, "loss": 0.6148, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.4238037168979645, "rewards/margins": 0.4013671875, "rewards/rejected": -0.823437511920929, "step": 3715 }, { "epoch": 0.9577754891864058, "grad_norm": 478.0, "learning_rate": 2.1112255406797115e-08, "logits/chosen": -2.53125, "logits/rejected": -2.590625047683716, "logps/chosen": -322.3999938964844, "logps/rejected": -322.20001220703125, "loss": 0.5445, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.649609386920929, "rewards/margins": 0.921582043170929, "rewards/rejected": -1.572167992591858, "step": 3720 }, { "epoch": 0.9590628218331617, "grad_norm": 520.0, "learning_rate": 2.0468589083419154e-08, "logits/chosen": -2.528125047683716, "logits/rejected": -2.2828125953674316, "logps/chosen": -295.6000061035156, "logps/rejected": -352.6000061035156, "loss": 0.6059, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1005859375, "rewards/margins": 0.8108886480331421, "rewards/rejected": -1.91015625, "step": 3725 }, { "epoch": 0.9603501544799176, "grad_norm": 420.0, "learning_rate": 1.9824922760041193e-08, 
"logits/chosen": -2.625, "logits/rejected": -2.575000047683716, "logps/chosen": -327.79998779296875, "logps/rejected": -309.20001220703125, "loss": 0.5785, "rewards/accuracies": 0.625, "rewards/chosen": -0.5228515863418579, "rewards/margins": 0.6156250238418579, "rewards/rejected": -1.1394531726837158, "step": 3730 }, { "epoch": 0.9616374871266735, "grad_norm": 426.0, "learning_rate": 1.9181256436663233e-08, "logits/chosen": -2.8218750953674316, "logits/rejected": -2.778125047683716, "logps/chosen": -313.3999938964844, "logps/rejected": -324.0, "loss": 0.625, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3230957090854645, "rewards/margins": 0.421875, "rewards/rejected": -0.7445312738418579, "step": 3735 }, { "epoch": 0.9629248197734295, "grad_norm": 502.0, "learning_rate": 1.8537590113285272e-08, "logits/chosen": -2.4078125953674316, "logits/rejected": -2.440624952316284, "logps/chosen": -221.10000610351562, "logps/rejected": -280.79998779296875, "loss": 0.4953, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.392822265625, "rewards/margins": 0.8008788824081421, "rewards/rejected": -1.193750023841858, "step": 3740 }, { "epoch": 0.9642121524201854, "grad_norm": 788.0, "learning_rate": 1.789392378990731e-08, "logits/chosen": -2.581249952316284, "logits/rejected": -2.4781250953674316, "logps/chosen": -285.3999938964844, "logps/rejected": -327.20001220703125, "loss": 0.493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.24843749403953552, "rewards/margins": 1.0442383289337158, "rewards/rejected": -1.294042944908142, "step": 3745 }, { "epoch": 0.9654994850669413, "grad_norm": 422.0, "learning_rate": 1.725025746652935e-08, "logits/chosen": -2.589062452316284, "logits/rejected": -2.9000000953674316, "logps/chosen": -201.1999969482422, "logps/rejected": -236.39999389648438, "loss": 0.6645, "rewards/accuracies": 0.375, "rewards/chosen": -0.14729003608226776, "rewards/margins": 0.20244140923023224, "rewards/rejected": 
-0.34980469942092896, "step": 3750 }, { "epoch": 0.9667868177136972, "grad_norm": 440.0, "learning_rate": 1.660659114315139e-08, "logits/chosen": -2.331249952316284, "logits/rejected": -2.440624952316284, "logps/chosen": -276.79998779296875, "logps/rejected": -356.0, "loss": 0.4008, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.6761718988418579, "rewards/margins": 1.2734375, "rewards/rejected": -1.9484374523162842, "step": 3755 }, { "epoch": 0.9680741503604532, "grad_norm": 219.0, "learning_rate": 1.596292481977343e-08, "logits/chosen": -2.40625, "logits/rejected": -2.378124952316284, "logps/chosen": -304.79998779296875, "logps/rejected": -365.0, "loss": 0.4016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.838574230670929, "rewards/margins": 1.376562476158142, "rewards/rejected": -2.210156202316284, "step": 3760 }, { "epoch": 0.969361483007209, "grad_norm": 1296.0, "learning_rate": 1.531925849639547e-08, "logits/chosen": -2.526562452316284, "logits/rejected": -2.3843750953674316, "logps/chosen": -293.0, "logps/rejected": -294.79998779296875, "loss": 0.6014, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.348876953125, "rewards/margins": 0.612597644329071, "rewards/rejected": -0.9638671875, "step": 3765 }, { "epoch": 0.970648815653965, "grad_norm": 720.0, "learning_rate": 1.4675592173017507e-08, "logits/chosen": -2.528125047683716, "logits/rejected": -2.145312547683716, "logps/chosen": -275.8999938964844, "logps/rejected": -334.79998779296875, "loss": 0.5406, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.543652355670929, "rewards/margins": 0.9398437738418579, "rewards/rejected": -1.480371117591858, "step": 3770 }, { "epoch": 0.971936148300721, "grad_norm": 318.0, "learning_rate": 1.4031925849639546e-08, "logits/chosen": -2.6781249046325684, "logits/rejected": -2.75, "logps/chosen": -328.0, "logps/rejected": -321.0, "loss": 0.5672, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 
-0.04306640475988388, "rewards/margins": 0.5422424077987671, "rewards/rejected": -0.585498034954071, "step": 3775 }, { "epoch": 0.9732234809474768, "grad_norm": 406.0, "learning_rate": 1.3388259526261585e-08, "logits/chosen": -2.628124952316284, "logits/rejected": -2.5406250953674316, "logps/chosen": -290.0, "logps/rejected": -339.3999938964844, "loss": 0.4598, "rewards/accuracies": 0.6875, "rewards/chosen": -0.262939453125, "rewards/margins": 0.8179687261581421, "rewards/rejected": -1.080468773841858, "step": 3780 }, { "epoch": 0.9745108135942327, "grad_norm": 520.0, "learning_rate": 1.2744593202883625e-08, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.731250047683716, "logps/chosen": -317.0, "logps/rejected": -292.0, "loss": 0.6195, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.2789062559604645, "rewards/margins": 0.5194336175918579, "rewards/rejected": -0.7986816167831421, "step": 3785 }, { "epoch": 0.9757981462409887, "grad_norm": 400.0, "learning_rate": 1.2100926879505664e-08, "logits/chosen": -2.596874952316284, "logits/rejected": -2.753124952316284, "logps/chosen": -252.60000610351562, "logps/rejected": -291.3500061035156, "loss": 0.5254, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2658447325229645, "rewards/margins": 0.7083984613418579, "rewards/rejected": -0.9750000238418579, "step": 3790 }, { "epoch": 0.9770854788877446, "grad_norm": 940.0, "learning_rate": 1.1457260556127703e-08, "logits/chosen": -2.528125047683716, "logits/rejected": -2.700000047683716, "logps/chosen": -271.3999938964844, "logps/rejected": -351.20001220703125, "loss": 0.4813, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4615722596645355, "rewards/margins": 0.93359375, "rewards/rejected": -1.3984375, "step": 3795 }, { "epoch": 0.9783728115345005, "grad_norm": 528.0, "learning_rate": 1.0813594232749741e-08, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.5250000953674316, "logps/chosen": -338.6000061035156, 
"logps/rejected": -379.20001220703125, "loss": 0.4922, "rewards/accuracies": 0.6875, "rewards/chosen": -0.71484375, "rewards/margins": 1.0888671875, "rewards/rejected": -1.8039062023162842, "step": 3800 }, { "epoch": 0.9796601441812565, "grad_norm": 414.0, "learning_rate": 1.016992790937178e-08, "logits/chosen": -2.546875, "logits/rejected": -2.503124952316284, "logps/chosen": -259.20001220703125, "logps/rejected": -379.3999938964844, "loss": 0.4059, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.78125, "rewards/margins": 1.4406249523162842, "rewards/rejected": -2.2249999046325684, "step": 3805 }, { "epoch": 0.9809474768280123, "grad_norm": 756.0, "learning_rate": 9.52626158599382e-09, "logits/chosen": -2.59375, "logits/rejected": -2.6312499046325684, "logps/chosen": -281.6000061035156, "logps/rejected": -283.79998779296875, "loss": 0.6961, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.642773449420929, "rewards/margins": 0.4638671875, "rewards/rejected": -1.107421875, "step": 3810 }, { "epoch": 0.9822348094747683, "grad_norm": 612.0, "learning_rate": 8.88259526261586e-09, "logits/chosen": -2.715625047683716, "logits/rejected": -2.799999952316284, "logps/chosen": -258.79998779296875, "logps/rejected": -299.3999938964844, "loss": 0.607, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48554688692092896, "rewards/margins": 0.534716784954071, "rewards/rejected": -1.0190918445587158, "step": 3815 }, { "epoch": 0.9835221421215242, "grad_norm": 672.0, "learning_rate": 8.238928939237899e-09, "logits/chosen": -2.581249952316284, "logits/rejected": -2.549999952316284, "logps/chosen": -296.79998779296875, "logps/rejected": -359.79998779296875, "loss": 0.65, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6055663824081421, "rewards/margins": 0.5458008050918579, "rewards/rejected": -1.15234375, "step": 3820 }, { "epoch": 0.9848094747682801, "grad_norm": 620.0, "learning_rate": 7.595262615859938e-09, "logits/chosen": -2.40625, 
"logits/rejected": -2.456249952316284, "logps/chosen": -289.20001220703125, "logps/rejected": -392.3999938964844, "loss": 0.5289, "rewards/accuracies": 0.625, "rewards/chosen": -0.8050781488418579, "rewards/margins": 0.9296875, "rewards/rejected": -1.734375, "step": 3825 }, { "epoch": 0.986096807415036, "grad_norm": 492.0, "learning_rate": 6.951596292481977e-09, "logits/chosen": -2.5718750953674316, "logits/rejected": -2.621875047683716, "logps/chosen": -350.3999938964844, "logps/rejected": -386.0, "loss": 0.5301, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6471191644668579, "rewards/margins": 0.930371105670929, "rewards/rejected": -1.576269507408142, "step": 3830 }, { "epoch": 0.987384140061792, "grad_norm": 616.0, "learning_rate": 6.307929969104016e-09, "logits/chosen": -2.578125, "logits/rejected": -2.239062547683716, "logps/chosen": -352.79998779296875, "logps/rejected": -424.79998779296875, "loss": 0.5555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.072656273841858, "rewards/margins": 0.9541381597518921, "rewards/rejected": -2.0257811546325684, "step": 3835 }, { "epoch": 0.9886714727085479, "grad_norm": 512.0, "learning_rate": 5.664263645726055e-09, "logits/chosen": -2.59375, "logits/rejected": -2.7406249046325684, "logps/chosen": -359.6000061035156, "logps/rejected": -356.0, "loss": 0.5758, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3032898008823395, "rewards/margins": 0.5933593511581421, "rewards/rejected": -0.899218738079071, "step": 3840 }, { "epoch": 0.9899588053553038, "grad_norm": 446.0, "learning_rate": 5.020597322348095e-09, "logits/chosen": -2.5843749046325684, "logits/rejected": -2.784374952316284, "logps/chosen": -378.0, "logps/rejected": -364.79998779296875, "loss": 0.4516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.16298827528953552, "rewards/margins": 0.932812511920929, "rewards/rejected": -1.0964844226837158, "step": 3845 }, { "epoch": 0.9912461380020597, "grad_norm": 456.0, 
"learning_rate": 4.376930998970134e-09, "logits/chosen": -2.762500047683716, "logits/rejected": -2.6968750953674316, "logps/chosen": -364.0, "logps/rejected": -345.20001220703125, "loss": 0.5395, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4781494140625, "rewards/margins": 0.760546863079071, "rewards/rejected": -1.237890601158142, "step": 3850 }, { "epoch": 0.9925334706488157, "grad_norm": 360.0, "learning_rate": 3.733264675592173e-09, "logits/chosen": -2.512500047683716, "logits/rejected": -2.2593750953674316, "logps/chosen": -269.70001220703125, "logps/rejected": -287.79998779296875, "loss": 0.543, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.532910168170929, "rewards/margins": 0.6400390863418579, "rewards/rejected": -1.173437476158142, "step": 3855 }, { "epoch": 0.9938208032955715, "grad_norm": 632.0, "learning_rate": 3.089598352214212e-09, "logits/chosen": -2.621875047683716, "logits/rejected": -2.6500000953674316, "logps/chosen": -362.3999938964844, "logps/rejected": -393.6000061035156, "loss": 0.5535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8525390625, "rewards/margins": 0.911914050579071, "rewards/rejected": -1.7609374523162842, "step": 3860 }, { "epoch": 0.9951081359423275, "grad_norm": 408.0, "learning_rate": 2.445932028836251e-09, "logits/chosen": -2.464062452316284, "logits/rejected": -2.5999999046325684, "logps/chosen": -365.20001220703125, "logps/rejected": -408.0, "loss": 0.5121, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.70361328125, "rewards/margins": 0.914843738079071, "rewards/rejected": -1.6164062023162842, "step": 3865 }, { "epoch": 0.9963954685890835, "grad_norm": 474.0, "learning_rate": 1.8022657054582903e-09, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.5093750953674316, "logps/chosen": -317.6000061035156, "logps/rejected": -405.20001220703125, "loss": 0.4566, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6523803472518921, 
"rewards/margins": 1.1632812023162842, "rewards/rejected": -1.8171875476837158, "step": 3870 }, { "epoch": 0.9976828012358393, "grad_norm": 608.0, "learning_rate": 1.1585993820803295e-09, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.721874952316284, "logps/chosen": -332.0, "logps/rejected": -317.3999938964844, "loss": 0.5828, "rewards/accuracies": 0.5, "rewards/chosen": -0.3084472715854645, "rewards/margins": 0.6429687738418579, "rewards/rejected": -0.9515625238418579, "step": 3875 }, { "epoch": 0.9989701338825953, "grad_norm": 318.0, "learning_rate": 5.149330587023687e-10, "logits/chosen": -2.6781249046325684, "logits/rejected": -2.4593749046325684, "logps/chosen": -301.79998779296875, "logps/rejected": -265.0, "loss": 0.5484, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.29853516817092896, "rewards/margins": 0.537890613079071, "rewards/rejected": -0.836718738079071, "step": 3880 } ], "logging_steps": 5, "max_steps": 3884, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }