diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9190 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 5510, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 41.204517024742586, + "learning_rate": 2.7223230490018146e-10, + "logits/chosen": -1.0541447401046753, + "logits/rejected": -0.7520447373390198, + "logps/chosen": -60.40666961669922, + "logps/rejected": -106.31614685058594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 46.36022550961656, + "learning_rate": 2.722323049001815e-09, + "logits/chosen": -1.6423306465148926, + "logits/rejected": -1.0551129579544067, + "logps/chosen": -118.1044921875, + "logps/rejected": -147.71112060546875, + "loss": 0.6932, + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": 0.00048234747373498976, + "rewards/margins": -0.009559460915625095, + "rewards/rejected": 0.01004180870950222, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 44.73280646725986, + "learning_rate": 5.44464609800363e-09, + "logits/chosen": -1.3452703952789307, + "logits/rejected": -1.4189544916152954, + "logps/chosen": -100.15978240966797, + "logps/rejected": -120.36905670166016, + "loss": 0.6934, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.005216827150434256, + "rewards/margins": 0.023615092039108276, + "rewards/rejected": -0.02883191965520382, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 47.03882983712326, + "learning_rate": 8.166969147005445e-09, + "logits/chosen": -1.8659166097640991, + "logits/rejected": -1.3837544918060303, + "logps/chosen": -97.74725341796875, + "logps/rejected": -225.3568878173828, + "loss": 0.6933, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01882881112396717, + "rewards/margins": 0.012613847851753235, + "rewards/rejected": 0.0062149628065526485, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 51.31627563719448, + "learning_rate": 1.088929219600726e-08, + "logits/chosen": -1.726585030555725, + "logits/rejected": -1.5602327585220337, + "logps/chosen": -117.8109359741211, + "logps/rejected": -127.14540100097656, + "loss": 0.6942, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.006876278668642044, + "rewards/margins": 0.018800906836986542, + "rewards/rejected": -0.025677183642983437, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 41.50488252094809, + "learning_rate": 1.3611615245009074e-08, + "logits/chosen": -1.8296377658843994, + "logits/rejected": -1.4772355556488037, + "logps/chosen": -97.29546356201172, + "logps/rejected": -119.9217758178711, + "loss": 0.6936, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01763664372265339, + "rewards/margins": 0.02796524204313755, + "rewards/rejected": -0.010328599251806736, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 43.88658635517158, + "learning_rate": 1.633393829401089e-08, + "logits/chosen": -1.7242791652679443, + "logits/rejected": -1.7074722051620483, + "logps/chosen": -93.03128814697266, + "logps/rejected": -110.11827087402344, + "loss": 0.6967, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.006739559583365917, + "rewards/margins": -0.004187393002212048, + "rewards/rejected": 0.010926952585577965, + "step": 60 + }, + { + "epoch": 0.03, + "grad_norm": 45.67398430745054, + "learning_rate": 1.90562613430127e-08, + "logits/chosen": -2.0839061737060547, + "logits/rejected": -1.6974592208862305, + "logps/chosen": -89.07298278808594, + "logps/rejected": -121.43327331542969, + "loss": 0.6934, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.018308373168110847, + "rewards/margins": 0.026240844279527664, + "rewards/rejected": -0.007932471111416817, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 48.69573234717181, + "learning_rate": 2.177858439201452e-08, + "logits/chosen": -1.2379311323165894, + "logits/rejected": -1.0085999965667725, + "logps/chosen": -92.49485778808594, + "logps/rejected": -110.10205078125, + "loss": 0.6901, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.012255726382136345, + "rewards/margins": 0.024808084592223167, + "rewards/rejected": -0.012552358210086823, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 41.975191835891444, + "learning_rate": 2.4500907441016332e-08, + "logits/chosen": -1.6204273700714111, + "logits/rejected": -1.3888098001480103, + "logps/chosen": -96.46438598632812, + "logps/rejected": -131.0227508544922, + "loss": 0.694, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0034812160301953554, + "rewards/margins": -0.005814972333610058, + "rewards/rejected": 0.002333755139261484, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 42.27700827156122, + "learning_rate": 2.7223230490018148e-08, + "logits/chosen": -1.6626800298690796, + "logits/rejected": -1.4588112831115723, + "logps/chosen": -84.01578521728516, + "logps/rejected": -111.53340911865234, + "loss": 0.6953, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.015042876824736595, + "rewards/margins": 0.0040522003546357155, + "rewards/rejected": 0.01099067647010088, + "step": 100 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.501953125, + "eval_logits/rejected": -2.2406537532806396, + "eval_logps/chosen": -99.84339141845703, + "eval_logps/rejected": -125.21623992919922, + "eval_loss": 0.6942028403282166, + "eval_rewards/accuracies": 0.5357142686843872, + "eval_rewards/chosen": 0.011287148110568523, + "eval_rewards/margins": -0.0042336308397352695, + "eval_rewards/rejected": 0.015520776621997356, + "eval_runtime": 73.7375, + "eval_samples_per_second": 12.07, + "eval_steps_per_second": 0.19, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 46.94348163244785, + "learning_rate": 2.994555353901996e-08, + "logits/chosen": -1.4403226375579834, + "logits/rejected": -1.123073935508728, + "logps/chosen": -107.60018157958984, + "logps/rejected": -231.7578582763672, + "loss": 0.6892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.009284459054470062, + "rewards/margins": 0.01384044624865055, + "rewards/rejected": -0.004555988125503063, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 46.81740301639235, + "learning_rate": 3.266787658802178e-08, + "logits/chosen": -1.4798848628997803, + "logits/rejected": -1.3624470233917236, + "logps/chosen": -94.34761047363281, + "logps/rejected": -120.3208999633789, + "loss": 0.6925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016081811860203743, + "rewards/margins": 0.002866668626666069, + "rewards/rejected": 0.0132151423022151, + "step": 120 + }, + { + "epoch": 0.05, + "grad_norm": 46.08791439590883, + "learning_rate": 3.539019963702359e-08, + "logits/chosen": -1.9895107746124268, + "logits/rejected": -1.281200885772705, + "logps/chosen": -95.02359771728516, + "logps/rejected": -131.6729736328125, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.02468906342983246, + "rewards/margins": -0.0023543343413621187, + "rewards/rejected": 0.027043398469686508, + "step": 130 + }, + { + "epoch": 0.05, + "grad_norm": 48.41164277694124, + "learning_rate": 3.81125226860254e-08, + "logits/chosen": -1.8839794397354126, + "logits/rejected": -1.5620959997177124, + "logps/chosen": -97.3235855102539, + "logps/rejected": -101.75816345214844, + "loss": 0.6896, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.023149680346250534, + "rewards/margins": -0.027130965143442154, + "rewards/rejected": 0.050280649214982986, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 42.39632204422603, + "learning_rate": 4.083484573502722e-08, + "logits/chosen": -1.9134677648544312, + "logits/rejected": -1.6838871240615845, + "logps/chosen": -72.68460845947266, + "logps/rejected": -104.17420959472656, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04995592683553696, + "rewards/margins": 0.012682494707405567, + "rewards/rejected": 0.03727342560887337, + "step": 150 + }, + { + "epoch": 0.06, + "grad_norm": 45.64533725609479, + "learning_rate": 4.355716878402904e-08, + "logits/chosen": -1.3503320217132568, + "logits/rejected": -1.1550263166427612, + "logps/chosen": -110.38504791259766, + "logps/rejected": -130.46852111816406, + "loss": 0.69, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.05299491807818413, + "rewards/margins": -0.0019288047915324569, + "rewards/rejected": 0.05492372438311577, + "step": 160 + }, + { + "epoch": 0.06, + "grad_norm": 41.18460830204607, + "learning_rate": 4.627949183303085e-08, + "logits/chosen": -1.496957778930664, + "logits/rejected": -0.9095319509506226, + "logps/chosen": -78.29255676269531, + "logps/rejected": -132.0447540283203, + "loss": 0.6846, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.059414975345134735, + "rewards/margins": 0.02102474495768547, + "rewards/rejected": 0.03839023411273956, + "step": 170 + }, + { + "epoch": 0.07, + "grad_norm": 41.49152413703793, + "learning_rate": 4.9001814882032664e-08, + "logits/chosen": -1.5343455076217651, + "logits/rejected": -1.8757272958755493, + "logps/chosen": -103.69041442871094, + "logps/rejected": -111.36357116699219, + "loss": 0.687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.07798389345407486, + "rewards/margins": 0.0013759995345026255, + "rewards/rejected": 0.07660789787769318, + "step": 180 + }, + { + "epoch": 0.07, + "grad_norm": 42.39863370578702, + "learning_rate": 5.172413793103448e-08, + "logits/chosen": -1.6158069372177124, + "logits/rejected": -0.9124865531921387, + "logps/chosen": -100.64363861083984, + "logps/rejected": -135.06634521484375, + "loss": 0.6837, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.09579858928918839, + "rewards/margins": 0.01044096052646637, + "rewards/rejected": 0.08535762131214142, + "step": 190 + }, + { + "epoch": 0.07, + "grad_norm": 39.9911934894391, + "learning_rate": 5.4446460980036295e-08, + "logits/chosen": -2.3459877967834473, + "logits/rejected": -1.5009344816207886, + "logps/chosen": -82.8287353515625, + "logps/rejected": -124.65399169921875, + "loss": 0.6799, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.14615079760551453, + "rewards/margins": -0.0005797408521175385, + "rewards/rejected": 0.14673054218292236, + "step": 200 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.5021491050720215, + "eval_logits/rejected": -2.2421486377716064, + "eval_logps/chosen": -98.44856262207031, + "eval_logps/rejected": -124.38037109375, + "eval_loss": 0.6807048916816711, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": 0.15076985955238342, + "eval_rewards/margins": 0.05166209489107132, + "eval_rewards/rejected": 0.0991077572107315, + "eval_runtime": 71.9557, + "eval_samples_per_second": 12.369, + "eval_steps_per_second": 0.195, + "step": 200 + }, + { + "epoch": 0.08, + "grad_norm": 45.974517848209416, + "learning_rate": 5.716878402903811e-08, + "logits/chosen": -2.038114070892334, + "logits/rejected": -1.6292145252227783, + "logps/chosen": -95.0359115600586, + "logps/rejected": -117.90171813964844, + "loss": 0.6816, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.15096978843212128, + "rewards/margins": -0.013230574317276478, + "rewards/rejected": 0.16420036554336548, + "step": 210 + }, + { + "epoch": 0.08, + "grad_norm": 44.619784920786046, + "learning_rate": 5.989110707803992e-08, + "logits/chosen": -2.296483039855957, + "logits/rejected": -1.9715936183929443, + "logps/chosen": -93.15199279785156, + "logps/rejected": -127.34361267089844, + "loss": 0.674, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20922093093395233, + "rewards/margins": 0.046642549335956573, + "rewards/rejected": 0.16257838904857635, + "step": 220 + }, + { + "epoch": 0.08, + "grad_norm": 44.88970750413813, + "learning_rate": 6.261343012704174e-08, + "logits/chosen": -1.450714349746704, + "logits/rejected": -0.9239526987075806, + "logps/chosen": -85.76863098144531, + "logps/rejected": -121.76054382324219, + "loss": 0.6702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1901402473449707, + "rewards/margins": 0.05737558752298355, + "rewards/rejected": 0.13276468217372894, + "step": 230 + }, + { + "epoch": 0.09, + "grad_norm": 39.31008777722159, + "learning_rate": 6.533575317604356e-08, + "logits/chosen": -1.9880611896514893, + "logits/rejected": -1.3975508213043213, + "logps/chosen": -99.23193359375, + "logps/rejected": -116.69496154785156, + "loss": 0.6673, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.2814417779445648, + "rewards/margins": 0.020699840039014816, + "rewards/rejected": 0.2607419490814209, + "step": 240 + }, + { + "epoch": 0.09, + "grad_norm": 43.800148674203605, + "learning_rate": 6.805807622504536e-08, + "logits/chosen": -1.808105230331421, + "logits/rejected": -1.5273463726043701, + "logps/chosen": -86.12389373779297, + "logps/rejected": -122.4721908569336, + "loss": 0.666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2851946949958801, + "rewards/margins": 0.03660944104194641, + "rewards/rejected": 0.24858525395393372, + "step": 250 + }, + { + "epoch": 0.09, + "grad_norm": 43.84613473195863, + "learning_rate": 7.078039927404718e-08, + "logits/chosen": -2.0416717529296875, + "logits/rejected": -1.812233567237854, + "logps/chosen": -118.04366302490234, + "logps/rejected": -118.3464126586914, + "loss": 0.6663, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3446394205093384, + "rewards/margins": 0.07985617220401764, + "rewards/rejected": 0.26478323340415955, + "step": 260 + }, + { + "epoch": 0.1, + "grad_norm": 45.46886488583846, + "learning_rate": 7.3502722323049e-08, + "logits/chosen": -2.089078664779663, + "logits/rejected": -1.5567834377288818, + "logps/chosen": -103.9489974975586, + "logps/rejected": -151.25582885742188, + "loss": 0.651, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3747584819793701, + "rewards/margins": 0.04969606548547745, + "rewards/rejected": 0.32506245374679565, + "step": 270 + }, + { + "epoch": 0.1, + "grad_norm": 46.872473628922336, + "learning_rate": 7.62250453720508e-08, + "logits/chosen": -1.9163455963134766, + "logits/rejected": -1.490391492843628, + "logps/chosen": -79.95561218261719, + "logps/rejected": -128.16030883789062, + "loss": 0.6558, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5081700682640076, + "rewards/margins": 0.14635446667671204, + "rewards/rejected": 0.36181557178497314, + "step": 280 + }, + { + "epoch": 0.11, + "grad_norm": 40.264154663026744, + "learning_rate": 7.894736842105262e-08, + "logits/chosen": -1.999222755432129, + "logits/rejected": -2.104238986968994, + "logps/chosen": -96.35908508300781, + "logps/rejected": -113.0312728881836, + "loss": 0.6477, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.603106677532196, + "rewards/margins": 0.18604376912117004, + "rewards/rejected": 0.4170629382133484, + "step": 290 + }, + { + "epoch": 0.11, + "grad_norm": 38.393082680911135, + "learning_rate": 8.166969147005444e-08, + "logits/chosen": -2.011298894882202, + "logits/rejected": -1.8035573959350586, + "logps/chosen": -101.60588836669922, + "logps/rejected": -126.2813491821289, + "loss": 0.6328, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5809694528579712, + "rewards/margins": 0.10704417526721954, + "rewards/rejected": 0.4739252030849457, + "step": 300 + }, + { + "epoch": 0.11, + "eval_logits/chosen": -2.5139808654785156, + "eval_logits/rejected": -2.253553628921509, + "eval_logps/chosen": -93.76677703857422, + "eval_logps/rejected": -121.03182220458984, + "eval_loss": 0.6357866525650024, + "eval_rewards/accuracies": 0.8214285969734192, + "eval_rewards/chosen": 0.6189486384391785, + "eval_rewards/margins": 0.18498651683330536, + "eval_rewards/rejected": 0.4339621067047119, + "eval_runtime": 72.0138, + "eval_samples_per_second": 12.359, + "eval_steps_per_second": 0.194, + "step": 300 + }, + { + "epoch": 0.11, + "grad_norm": 41.32014110656028, + "learning_rate": 8.439201451905626e-08, + "logits/chosen": -1.7809641361236572, + "logits/rejected": -1.4835566282272339, + "logps/chosen": -91.62686157226562, + "logps/rejected": -160.69992065429688, + "loss": 0.6213, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6366099715232849, + "rewards/margins": 0.10506564378738403, + "rewards/rejected": 0.5315443277359009, + "step": 310 + }, + { + "epoch": 0.12, + "grad_norm": 40.20352730538307, + "learning_rate": 8.711433756805808e-08, + "logits/chosen": -1.3174822330474854, + "logits/rejected": -0.9151695370674133, + "logps/chosen": -76.4534912109375, + "logps/rejected": -106.75065612792969, + "loss": 0.6352, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6975838541984558, + "rewards/margins": 0.11969652026891708, + "rewards/rejected": 0.5778872966766357, + "step": 320 + }, + { + "epoch": 0.12, + "grad_norm": 44.10982454650109, + "learning_rate": 8.983666061705989e-08, + "logits/chosen": -1.7297052145004272, + "logits/rejected": -1.1650458574295044, + "logps/chosen": -95.3366470336914, + "logps/rejected": -113.1480712890625, + "loss": 0.6209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.8798357844352722, + "rewards/margins": 0.10148117691278458, + "rewards/rejected": 0.7783547043800354, + "step": 330 + }, + { + "epoch": 0.12, + "grad_norm": 39.48829529957203, + "learning_rate": 9.25589836660617e-08, + "logits/chosen": -1.7264535427093506, + "logits/rejected": -1.7544937133789062, + "logps/chosen": -100.7391586303711, + "logps/rejected": -132.95297241210938, + "loss": 0.6138, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7276839017868042, + "rewards/margins": 0.1849050223827362, + "rewards/rejected": 0.5427789092063904, + "step": 340 + }, + { + "epoch": 0.13, + "grad_norm": 40.75750118936868, + "learning_rate": 9.528130671506351e-08, + "logits/chosen": -1.9755557775497437, + "logits/rejected": -1.5455005168914795, + "logps/chosen": -92.50463104248047, + "logps/rejected": -114.50557708740234, + "loss": 0.6187, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.886059582233429, + "rewards/margins": 0.1289076954126358, + "rewards/rejected": 0.7571519613265991, + "step": 350 + }, + { + "epoch": 0.13, + "grad_norm": 42.45802253147107, + "learning_rate": 9.800362976406533e-08, + "logits/chosen": -1.9283500909805298, + "logits/rejected": -1.5053136348724365, + "logps/chosen": -82.00525665283203, + "logps/rejected": -104.69712829589844, + "loss": 0.6127, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.1786202192306519, + "rewards/margins": 0.27130889892578125, + "rewards/rejected": 0.9073113203048706, + "step": 360 + }, + { + "epoch": 0.13, + "grad_norm": 39.734876260743064, + "learning_rate": 1.0072595281306713e-07, + "logits/chosen": -2.2132327556610107, + "logits/rejected": -1.904916524887085, + "logps/chosen": -72.9496841430664, + "logps/rejected": -108.14723205566406, + "loss": 0.605, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.2401649951934814, + "rewards/margins": 0.1573180854320526, + "rewards/rejected": 1.0828468799591064, + "step": 370 + }, + { + "epoch": 0.14, + "grad_norm": 39.83612602137503, + "learning_rate": 1.0344827586206897e-07, + "logits/chosen": -2.0371642112731934, + "logits/rejected": -1.9173485040664673, + "logps/chosen": -83.82426452636719, + "logps/rejected": -97.99388885498047, + "loss": 0.5913, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2351922988891602, + "rewards/margins": 0.4066685140132904, + "rewards/rejected": 0.8285236358642578, + "step": 380 + }, + { + "epoch": 0.14, + "grad_norm": 42.576323725278016, + "learning_rate": 1.0617059891107078e-07, + "logits/chosen": -2.1276021003723145, + "logits/rejected": -2.370884418487549, + "logps/chosen": -97.64468383789062, + "logps/rejected": -113.1034164428711, + "loss": 0.6069, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.331588864326477, + "rewards/margins": 0.19395257532596588, + "rewards/rejected": 1.1376360654830933, + "step": 390 + }, + { + "epoch": 0.15, + "grad_norm": 44.1343637917746, + "learning_rate": 1.0889292196007259e-07, + "logits/chosen": -1.8790748119354248, + "logits/rejected": -1.5620168447494507, + "logps/chosen": -72.6387710571289, + "logps/rejected": -154.55137634277344, + "loss": 0.5964, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3714253902435303, + "rewards/margins": 0.4154202938079834, + "rewards/rejected": 0.9560050964355469, + "step": 400 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -2.5321178436279297, + "eval_logits/rejected": -2.2736294269561768, + "eval_logps/chosen": -86.19153594970703, + "eval_logps/rejected": -116.01555633544922, + "eval_loss": 0.5837011337280273, + "eval_rewards/accuracies": 0.7678571343421936, + "eval_rewards/chosen": 1.3764731884002686, + "eval_rewards/margins": 0.44088268280029297, + "eval_rewards/rejected": 0.9355906844139099, + "eval_runtime": 71.9235, + "eval_samples_per_second": 12.374, + "eval_steps_per_second": 0.195, + "step": 400 + }, + { + "epoch": 0.15, + "grad_norm": 32.5866475732278, + "learning_rate": 1.1161524500907441e-07, + "logits/chosen": -2.1084635257720947, + "logits/rejected": -1.6532913446426392, + "logps/chosen": -71.08902740478516, + "logps/rejected": -119.5894546508789, + "loss": 0.5579, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.3634411096572876, + "rewards/margins": 0.15741710364818573, + "rewards/rejected": 1.206023931503296, + "step": 410 + }, + { + "epoch": 0.15, + "grad_norm": 51.63908130905885, + "learning_rate": 1.1433756805807621e-07, + "logits/chosen": -1.8392841815948486, + "logits/rejected": -1.5305787324905396, + "logps/chosen": -64.91630554199219, + "logps/rejected": -90.24629974365234, + "loss": 0.5809, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.4629342555999756, + "rewards/margins": 0.21032755076885223, + "rewards/rejected": 1.2526066303253174, + "step": 420 + }, + { + "epoch": 0.16, + "grad_norm": 42.575288667126856, + "learning_rate": 1.1705989110707803e-07, + "logits/chosen": -1.7529628276824951, + "logits/rejected": -1.451038122177124, + "logps/chosen": -81.2251205444336, + "logps/rejected": -142.9680938720703, + "loss": 0.5776, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.41762375831604, + "rewards/margins": 0.20172078907489777, + "rewards/rejected": 1.2159030437469482, + "step": 430 + }, + { + "epoch": 0.16, + "grad_norm": 37.38028775120078, + "learning_rate": 1.1978221415607984e-07, + "logits/chosen": -1.7141106128692627, + "logits/rejected": -1.342930555343628, + "logps/chosen": -74.53529357910156, + "logps/rejected": -107.2362060546875, + "loss": 0.5626, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.7542121410369873, + "rewards/margins": 0.4075583517551422, + "rewards/rejected": 1.3466538190841675, + "step": 440 + }, + { + "epoch": 0.16, + "grad_norm": 36.24749956558924, + "learning_rate": 1.2250453720508167e-07, + "logits/chosen": -1.9711906909942627, + "logits/rejected": -1.221478819847107, + "logps/chosen": -103.87667083740234, + "logps/rejected": -135.00277709960938, + "loss": 0.5604, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.4585211277008057, + "rewards/margins": 0.28392547369003296, + "rewards/rejected": 1.1745957136154175, + "step": 450 + }, + { + "epoch": 0.17, + "grad_norm": 42.96091232839455, + "learning_rate": 1.2522686025408348e-07, + "logits/chosen": -1.7234961986541748, + "logits/rejected": -1.5666788816452026, + "logps/chosen": -94.36329650878906, + "logps/rejected": -109.29872131347656, + "loss": 0.5283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.7453796863555908, + "rewards/margins": 0.3029170632362366, + "rewards/rejected": 1.442462682723999, + "step": 460 + }, + { + "epoch": 0.17, + "grad_norm": 48.636321311117, + "learning_rate": 1.279491833030853e-07, + "logits/chosen": -2.4974513053894043, + "logits/rejected": -1.580528736114502, + "logps/chosen": -93.00968170166016, + "logps/rejected": -127.61222076416016, + "loss": 0.5586, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8251793384552002, + "rewards/margins": 0.470356285572052, + "rewards/rejected": 1.354823112487793, + "step": 470 + }, + { + "epoch": 0.17, + "grad_norm": 35.14478185239256, + "learning_rate": 1.3067150635208711e-07, + "logits/chosen": -1.878103256225586, + "logits/rejected": -1.7810052633285522, + "logps/chosen": -64.92475891113281, + "logps/rejected": -95.4583511352539, + "loss": 0.5455, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.9156177043914795, + "rewards/margins": 0.21928730607032776, + "rewards/rejected": 1.6963306665420532, + "step": 480 + }, + { + "epoch": 0.18, + "grad_norm": 41.07425605129, + "learning_rate": 1.3339382940108892e-07, + "logits/chosen": -1.8308073282241821, + "logits/rejected": -1.6511573791503906, + "logps/chosen": -94.47706604003906, + "logps/rejected": -142.56326293945312, + "loss": 0.5669, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.7191970348358154, + "rewards/margins": 0.4305998682975769, + "rewards/rejected": 1.2885972261428833, + "step": 490 + }, + { + "epoch": 0.18, + "grad_norm": 40.72644317271578, + "learning_rate": 1.3611615245009072e-07, + "logits/chosen": -1.8288648128509521, + "logits/rejected": -1.6132848262786865, + "logps/chosen": -84.89088439941406, + "logps/rejected": -141.57489013671875, + "loss": 0.5567, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.799727201461792, + "rewards/margins": 0.7182740569114685, + "rewards/rejected": 1.0814530849456787, + "step": 500 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.56964373588562, + "eval_logits/rejected": -2.3121345043182373, + "eval_logps/chosen": -81.02156829833984, + "eval_logps/rejected": -113.64513397216797, + "eval_loss": 0.5355415940284729, + "eval_rewards/accuracies": 0.8392857313156128, + "eval_rewards/chosen": 1.8934696912765503, + "eval_rewards/margins": 0.7208380699157715, + "eval_rewards/rejected": 1.1726313829421997, + "eval_runtime": 71.9101, + "eval_samples_per_second": 12.377, + "eval_steps_per_second": 0.195, + "step": 500 + }, + { + "epoch": 0.19, + "grad_norm": 31.50812715169961, + "learning_rate": 1.3883847549909256e-07, + "logits/chosen": -2.6629555225372314, + "logits/rejected": -1.8640201091766357, + "logps/chosen": -64.94083404541016, + "logps/rejected": -131.4000701904297, + "loss": 0.5287, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.017392635345459, + "rewards/margins": 0.5965152978897095, + "rewards/rejected": 1.4208776950836182, + "step": 510 + }, + { + "epoch": 0.19, + "grad_norm": 41.001026190013995, + "learning_rate": 1.4156079854809436e-07, + "logits/chosen": -2.0135555267333984, + "logits/rejected": -2.3467681407928467, + "logps/chosen": -83.2616958618164, + "logps/rejected": -96.27758026123047, + "loss": 0.5086, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.7922933101654053, + "rewards/margins": 0.6331424117088318, + "rewards/rejected": 1.1591508388519287, + "step": 520 + }, + { + "epoch": 0.19, + "grad_norm": 43.50803723743233, + "learning_rate": 1.442831215970962e-07, + "logits/chosen": -2.3700008392333984, + "logits/rejected": -1.9721641540527344, + "logps/chosen": -65.375, + "logps/rejected": -107.3283920288086, + "loss": 0.5664, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.932377576828003, + "rewards/margins": 0.5669600367546082, + "rewards/rejected": 1.36541748046875, + "step": 530 + }, + { + "epoch": 0.2, + "grad_norm": 36.6257172454756, + "learning_rate": 1.47005444646098e-07, + "logits/chosen": -2.7627670764923096, + "logits/rejected": -2.018375873565674, + "logps/chosen": -66.04200744628906, + "logps/rejected": -90.71024322509766, + "loss": 0.5114, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.1180334091186523, + "rewards/margins": 0.621078610420227, + "rewards/rejected": 1.4969549179077148, + "step": 540 + }, + { + "epoch": 0.2, + "grad_norm": 35.991193860058296, + "learning_rate": 1.497277676950998e-07, + "logits/chosen": -1.3566482067108154, + "logits/rejected": -1.3935582637786865, + "logps/chosen": -77.91327667236328, + "logps/rejected": -97.44300079345703, + "loss": 0.5308, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.9388548135757446, + "rewards/margins": 0.38032227754592896, + "rewards/rejected": 1.5585325956344604, + "step": 550 + }, + { + "epoch": 0.2, + "grad_norm": 48.06228927830701, + "learning_rate": 1.49998780935628e-07, + "logits/chosen": -1.7465827465057373, + "logits/rejected": -1.4473216533660889, + "logps/chosen": -70.13250732421875, + "logps/rejected": -105.08538818359375, + "loss": 0.4779, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5702747106552124, + "rewards/margins": 0.8154007196426392, + "rewards/rejected": 0.7548739910125732, + "step": 560 + }, + { + "epoch": 0.21, + "grad_norm": 38.75394797505258, + "learning_rate": 1.4999456693682575e-07, + "logits/chosen": -2.4446985721588135, + "logits/rejected": -1.8925511837005615, + "logps/chosen": -71.6372299194336, + "logps/rejected": -103.564697265625, + "loss": 0.5129, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.205054759979248, + "rewards/margins": 0.5576547384262085, + "rewards/rejected": 1.647400140762329, + "step": 570 + }, + { + "epoch": 0.21, + "grad_norm": 37.78034263492822, + "learning_rate": 1.4998734312249918e-07, + "logits/chosen": -1.8992903232574463, + "logits/rejected": -1.5952703952789307, + "logps/chosen": -83.2913589477539, + "logps/rejected": -120.90938568115234, + "loss": 0.4891, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.9770288467407227, + "rewards/margins": 0.7018577456474304, + "rewards/rejected": 1.2751710414886475, + "step": 580 + }, + { + "epoch": 0.21, + "grad_norm": 37.211325491550305, + "learning_rate": 1.4997710978256735e-07, + "logits/chosen": -1.94363272190094, + "logits/rejected": -1.9168386459350586, + "logps/chosen": -76.82261657714844, + "logps/rejected": -120.50138092041016, + "loss": 0.5175, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.0862905979156494, + "rewards/margins": 0.5364335775375366, + "rewards/rejected": 1.5498571395874023, + "step": 590 + }, + { + "epoch": 0.22, + "grad_norm": 38.42991397247903, + "learning_rate": 1.4996386732773285e-07, + "logits/chosen": -1.6784331798553467, + "logits/rejected": -1.4212602376937866, + "logps/chosen": -78.51207733154297, + "logps/rejected": -97.93910217285156, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9482488632202148, + "rewards/margins": 0.5499599575996399, + "rewards/rejected": 1.3982888460159302, + "step": 600 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -2.631465435028076, + "eval_logits/rejected": -2.3776392936706543, + "eval_logps/chosen": -79.7362289428711, + "eval_logps/rejected": -114.89885711669922, + "eval_loss": 0.4917795658111572, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": 2.022002696990967, + "eval_rewards/margins": 0.9747439026832581, + "eval_rewards/rejected": 1.047258734703064, + "eval_runtime": 71.9087, + "eval_samples_per_second": 12.377, + "eval_steps_per_second": 0.195, + "step": 600 + }, + { + "epoch": 0.22, + "grad_norm": 48.91223436910098, + "learning_rate": 1.4994761628946546e-07, + "logits/chosen": -1.6232852935791016, + "logits/rejected": -1.4054292440414429, + "logps/chosen": -76.1790542602539, + "logps/rejected": -125.78961181640625, + "loss": 0.4929, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.940302848815918, + "rewards/margins": 0.9549075961112976, + "rewards/rejected": 0.9853953123092651, + "step": 610 + }, + { + "epoch": 0.23, + "grad_norm": 41.70252788830195, + "learning_rate": 1.499283573199808e-07, + "logits/chosen": -1.7214891910552979, + "logits/rejected": -1.2875360250473022, + "logps/chosen": -73.24628448486328, + "logps/rejected": -86.8110122680664, + "loss": 0.4935, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.8633216619491577, + "rewards/margins": 0.7748242020606995, + "rewards/rejected": 1.0884974002838135, + "step": 620 + }, + { + "epoch": 0.23, + "grad_norm": 38.30334820196022, + "learning_rate": 1.499060911922141e-07, + "logits/chosen": -1.616151213645935, + "logits/rejected": -1.6390151977539062, + "logps/chosen": -68.93297576904297, + "logps/rejected": -113.6138687133789, + "loss": 0.4795, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.810837984085083, + "rewards/margins": 0.38780540227890015, + "rewards/rejected": 1.423032522201538, + "step": 630 + }, + { + "epoch": 0.23, + "grad_norm": 34.79521166345824, + "learning_rate": 1.498808187997893e-07, + "logits/chosen": -1.9353692531585693, + "logits/rejected": -1.5537188053131104, + "logps/chosen": -97.22018432617188, + "logps/rejected": -153.95028686523438, + "loss": 0.5006, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.8405921459197998, + "rewards/margins": 1.0561001300811768, + "rewards/rejected": 0.7844920754432678, + "step": 640 + }, + { + "epoch": 0.24, + "grad_norm": 94.46732088298053, + "learning_rate": 1.4985254115698304e-07, + "logits/chosen": -2.263124704360962, + "logits/rejected": -1.8345104455947876, + "logps/chosen": -73.39897155761719, + "logps/rejected": -92.21670532226562, + "loss": 0.4917, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.1108241081237793, + "rewards/margins": 0.9142869114875793, + "rewards/rejected": 1.1965371370315552, + "step": 650 + }, + { + "epoch": 0.24, + "grad_norm": 33.786235958694014, + "learning_rate": 1.4982125939868402e-07, + "logits/chosen": -2.4463446140289307, + "logits/rejected": -1.6779578924179077, + "logps/chosen": -79.39064025878906, + "logps/rejected": -132.6228790283203, + "loss": 0.4479, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.3004369735717773, + "rewards/margins": 1.3063924312591553, + "rewards/rejected": 0.9940446019172668, + "step": 660 + }, + { + "epoch": 0.24, + "grad_norm": 35.539345812176975, + "learning_rate": 1.4978697478034753e-07, + "logits/chosen": -2.2131102085113525, + "logits/rejected": -1.8831088542938232, + "logps/chosen": -67.44413757324219, + "logps/rejected": -113.38688659667969, + "loss": 0.4648, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.094628095626831, + "rewards/margins": 0.9700366854667664, + "rewards/rejected": 1.124591588973999, + "step": 670 + }, + { + "epoch": 0.25, + "grad_norm": 38.83509762475066, + "learning_rate": 1.497496886779449e-07, + "logits/chosen": -2.4378554821014404, + "logits/rejected": -1.9790445566177368, + "logps/chosen": -103.16511535644531, + "logps/rejected": -128.8167266845703, + "loss": 0.4337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.8627456426620483, + "rewards/margins": 0.8879152536392212, + "rewards/rejected": 0.9748304486274719, + "step": 680 + }, + { + "epoch": 0.25, + "grad_norm": 41.327033787846, + "learning_rate": 1.497094025879084e-07, + "logits/chosen": -1.8331416845321655, + "logits/rejected": -1.4264384508132935, + "logps/chosen": -80.54314422607422, + "logps/rejected": -137.10955810546875, + "loss": 0.4143, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.4691455364227295, + "rewards/margins": 0.8567919731140137, + "rewards/rejected": 0.612353503704071, + "step": 690 + }, + { + "epoch": 0.25, + "grad_norm": 38.09387686336411, + "learning_rate": 1.4966611812707116e-07, + "logits/chosen": -2.376185894012451, + "logits/rejected": -1.9318294525146484, + "logps/chosen": -67.70744323730469, + "logps/rejected": -106.6443099975586, + "loss": 0.4468, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.5539474487304688, + "rewards/margins": 1.1218903064727783, + "rewards/rejected": 1.43205726146698, + "step": 700 + }, + { + "epoch": 0.25, + "eval_logits/chosen": -2.6510469913482666, + "eval_logits/rejected": -2.410930871963501, + "eval_logps/chosen": -79.01429748535156, + "eval_logps/rejected": -117.04235076904297, + "eval_loss": 0.45572710037231445, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": 2.0941965579986572, + "eval_rewards/margins": 1.2612864971160889, + "eval_rewards/rejected": 0.832909882068634, + "eval_runtime": 72.043, + "eval_samples_per_second": 12.354, + "eval_steps_per_second": 0.194, + "step": 700 + }, + { + "epoch": 0.26, + "grad_norm": 34.41497827448044, + "learning_rate": 1.4961983703260222e-07, + "logits/chosen": -2.285163402557373, + "logits/rejected": -1.6250555515289307, + "logps/chosen": -82.45291137695312, + "logps/rejected": -162.6754913330078, + "loss": 0.4388, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.6247352361679077, + "rewards/margins": 1.2791966199874878, + "rewards/rejected": 0.34553852677345276, + "step": 710 + }, + { + "epoch": 0.26, + "grad_norm": 38.939975049466675, + "learning_rate": 1.495705611619369e-07, + "logits/chosen": -2.8705544471740723, + "logits/rejected": -1.9211571216583252, + "logps/chosen": -52.576690673828125, + "logps/rejected": -104.49955749511719, + "loss": 0.4632, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.5734798908233643, + "rewards/margins": 1.8182973861694336, + "rewards/rejected": 0.755182683467865, + "step": 720 + }, + { + "epoch": 0.26, + "grad_norm": 37.237058738428075, + "learning_rate": 1.4951829249270223e-07, + "logits/chosen": -2.2160234451293945, + "logits/rejected": -1.7666336297988892, + "logps/chosen": -94.98374938964844, + "logps/rejected": -155.47586059570312, + "loss": 0.4477, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.8300750255584717, + "rewards/margins": 0.9866692423820496, + "rewards/rejected": 0.8434059023857117, + "step": 730 + }, + { + "epoch": 0.27, + "grad_norm": 42.247815955525425, + "learning_rate": 1.4946303312263751e-07, + "logits/chosen": -2.1757843494415283, + "logits/rejected": -1.8440332412719727, + "logps/chosen": -72.95508575439453, + "logps/rejected": -113.18672943115234, + "loss": 0.4715, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.1009631156921387, + "rewards/margins": 1.115952491760254, + "rewards/rejected": 0.9850105047225952, + "step": 740 + }, + { + "epoch": 0.27, + "grad_norm": 32.89732640498844, + "learning_rate": 1.4940478526951018e-07, + "logits/chosen": -1.856689453125, + "logits/rejected": -1.4313759803771973, + "logps/chosen": -82.71755981445312, + "logps/rejected": -135.11648559570312, + "loss": 0.426, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.647456407546997, + "rewards/margins": 1.2215721607208252, + "rewards/rejected": 0.425883948802948, + "step": 750 + }, + { + "epoch": 0.28, + "grad_norm": 39.76459644604554, + "learning_rate": 1.4934355127102686e-07, + "logits/chosen": -2.3893539905548096, + "logits/rejected": -1.5895264148712158, + "logps/chosen": -78.76719665527344, + "logps/rejected": -160.42080688476562, + "loss": 0.4394, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.3389594554901123, + "rewards/margins": 1.721954345703125, + "rewards/rejected": 0.6170052289962769, + "step": 760 + }, + { + "epoch": 0.28, + "grad_norm": 30.768392184975625, + "learning_rate": 1.492793335847394e-07, + "logits/chosen": -2.251796245574951, + "logits/rejected": -1.5280625820159912, + "logps/chosen": -63.670433044433594, + "logps/rejected": -124.48072814941406, + "loss": 0.4001, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.670900583267212, + "rewards/margins": 1.3456604480743408, + "rewards/rejected": 0.3252400755882263, + "step": 770 + }, + { + "epoch": 0.28, + "grad_norm": 35.26526799612695, + "learning_rate": 1.4921213478794637e-07, + "logits/chosen": -2.0840353965759277, + "logits/rejected": -1.9413776397705078, + "logps/chosen": -73.97483825683594, + "logps/rejected": -113.29158020019531, + "loss": 0.4185, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.834567666053772, + "rewards/margins": 1.0754048824310303, + "rewards/rejected": 0.7591627240180969, + "step": 780 + }, + { + "epoch": 0.29, + "grad_norm": 42.37614388323698, + "learning_rate": 1.4914195757758955e-07, + "logits/chosen": -2.1000962257385254, + "logits/rejected": -2.3834102153778076, + "logps/chosen": -76.1771240234375, + "logps/rejected": -88.94342803955078, + "loss": 0.4397, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.1757988929748535, + "rewards/margins": 1.1148507595062256, + "rewards/rejected": 1.060948133468628, + "step": 790 + }, + { + "epoch": 0.29, + "grad_norm": 37.69381725528037, + "learning_rate": 1.4906880477014573e-07, + "logits/chosen": -2.2119500637054443, + "logits/rejected": -1.4499582052230835, + "logps/chosen": -62.1577033996582, + "logps/rejected": -163.81442260742188, + "loss": 0.4445, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.9283027648925781, + "rewards/margins": 1.5437467098236084, + "rewards/rejected": 0.3845560550689697, + "step": 800 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.6051876544952393, + "eval_logits/rejected": -2.3914036750793457, + "eval_logps/chosen": -83.42094421386719, + "eval_logps/rejected": -123.46272277832031, + "eval_loss": 0.43243297934532166, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 1.6535331010818481, + "eval_rewards/margins": 1.4626604318618774, + "eval_rewards/rejected": 0.1908726841211319, + "eval_runtime": 71.9896, + "eval_samples_per_second": 12.363, + "eval_steps_per_second": 0.194, + "step": 800 + }, + { + "epoch": 0.29, + "grad_norm": 35.55693000649927, + "learning_rate": 1.489926793015137e-07, + "logits/chosen": -2.137413501739502, + "logits/rejected": -1.9606491327285767, + "logps/chosen": -69.8790512084961, + "logps/rejected": -97.57211303710938, + "loss": 0.3881, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.0780739784240723, + "rewards/margins": 1.4736759662628174, + "rewards/rejected": 0.6043978333473206, + "step": 810 + }, + { + "epoch": 0.3, + "grad_norm": 37.02292806830243, + "learning_rate": 1.489135842268963e-07, + "logits/chosen": -2.0538628101348877, + "logits/rejected": -2.0753753185272217, + "logps/chosen": -106.13938903808594, + "logps/rejected": -144.13514709472656, + "loss": 0.4145, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.3868672847747803, + "rewards/margins": 1.0619122982025146, + "rewards/rejected": 0.3249548673629761, + "step": 820 + }, + { + "epoch": 0.3, + "grad_norm": 40.671728265294014, + "learning_rate": 1.4883152272067798e-07, + "logits/chosen": -1.9181344509124756, + "logits/rejected": -1.6507272720336914, + "logps/chosen": -99.54740905761719, + "logps/rejected": -119.50572204589844, + "loss": 0.4404, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4642399549484253, + "rewards/margins": 0.9573423266410828, + "rewards/rejected": 0.5068976283073425, + "step": 830 + }, + { + "epoch": 0.3, + "grad_norm": 28.025590997376412, + "learning_rate": 1.487464980762972e-07, + "logits/chosen": -2.1211495399475098, + "logits/rejected": -1.789926528930664, + "logps/chosen": -76.32536315917969, + "logps/rejected": -180.9839324951172, + "loss": 0.393, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8189208507537842, + "rewards/margins": 1.659450888633728, + "rewards/rejected": 0.15946999192237854, + "step": 840 + }, + { + "epoch": 0.31, + "grad_norm": 42.494743726734264, + "learning_rate": 1.4865851370611445e-07, + "logits/chosen": -1.7717339992523193, + "logits/rejected": -1.7804222106933594, + "logps/chosen": -94.37425994873047, + "logps/rejected": -134.35830688476562, + "loss": 0.3825, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.593718409538269, + "rewards/margins": 1.334201455116272, + "rewards/rejected": 0.25951701402664185, + "step": 850 + }, + { + "epoch": 0.31, + "grad_norm": 47.60353919509746, + "learning_rate": 1.4856757314127514e-07, + "logits/chosen": -1.930068016052246, + "logits/rejected": -1.686745285987854, + "logps/chosen": -87.13040161132812, + "logps/rejected": -108.21986389160156, + "loss": 0.3977, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.576547622680664, + "rewards/margins": 1.2103341817855835, + "rewards/rejected": 0.3662133812904358, + "step": 860 + }, + { + "epoch": 0.32, + "grad_norm": 55.2942123107171, + "learning_rate": 1.4847368003156803e-07, + "logits/chosen": -1.7101478576660156, + "logits/rejected": -1.5123283863067627, + "logps/chosen": -80.416259765625, + "logps/rejected": -92.58718872070312, + "loss": 0.4117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.8133132457733154, + "rewards/margins": 1.0088872909545898, + "rewards/rejected": 0.8044260144233704, + "step": 870 + }, + { + "epoch": 0.32, + "grad_norm": 35.816541721556696, + "learning_rate": 1.483768381452786e-07, + "logits/chosen": -2.319746732711792, + "logits/rejected": -1.79997980594635, + "logps/chosen": -87.76374816894531, + "logps/rejected": -112.37357330322266, + "loss": 0.3523, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.6848220825195312, + "rewards/margins": 0.9358736276626587, + "rewards/rejected": 0.7489483952522278, + "step": 880 + }, + { + "epoch": 0.32, + "grad_norm": 36.93257214414797, + "learning_rate": 1.482770513690379e-07, + "logits/chosen": -1.800686240196228, + "logits/rejected": -1.3662943840026855, + "logps/chosen": -88.72349548339844, + "logps/rejected": -132.239501953125, + "loss": 0.3748, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5943005084991455, + "rewards/margins": 1.0010852813720703, + "rewards/rejected": 0.5932152271270752, + "step": 890 + }, + { + "epoch": 0.33, + "grad_norm": 32.655789909757885, + "learning_rate": 1.4817432370766656e-07, + "logits/chosen": -1.7111234664916992, + "logits/rejected": -1.62754225730896, + "logps/chosen": -72.77391052246094, + "logps/rejected": -117.9988021850586, + "loss": 0.4219, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0136775970458984, + "rewards/margins": 1.9716641902923584, + "rewards/rejected": 0.042013369500637054, + "step": 900 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -2.572319507598877, + "eval_logits/rejected": -2.3702971935272217, + "eval_logps/chosen": -83.57896423339844, + "eval_logps/rejected": -125.93928527832031, + "eval_loss": 0.4064531624317169, + "eval_rewards/accuracies": 0.9107142686843872, + "eval_rewards/chosen": 1.6377297639846802, + "eval_rewards/margins": 1.694512963294983, + "eval_rewards/rejected": -0.056782953441143036, + "eval_runtime": 71.9888, + "eval_samples_per_second": 12.363, + "eval_steps_per_second": 0.194, + "step": 900 + }, + { + "epoch": 0.33, + "grad_norm": 34.63562171188158, + "learning_rate": 1.4806865928401402e-07, + "logits/chosen": -2.034156084060669, + "logits/rejected": -1.3545467853546143, + "logps/chosen": -63.69415283203125, + "logps/rejected": -122.93882751464844, + "loss": 0.3752, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.400649309158325, + "rewards/margins": 2.036404609680176, + "rewards/rejected": 0.36424484848976135, + "step": 910 + }, + { + "epoch": 0.33, + "grad_norm": 51.406692521659814, + "learning_rate": 1.4796006233879314e-07, + "logits/chosen": -1.6481889486312866, + "logits/rejected": -1.8775209188461304, + "logps/chosen": -105.07774353027344, + "logps/rejected": -128.53115844726562, + "loss": 0.4173, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0885381698608398, + "rewards/margins": 0.976910412311554, + "rewards/rejected": 0.11162771284580231, + "step": 920 + }, + { + "epoch": 0.34, + "grad_norm": 36.79775770964529, + "learning_rate": 1.4784853723040993e-07, + "logits/chosen": -1.9353240728378296, + "logits/rejected": -1.7171472311019897, + "logps/chosen": -96.37171173095703, + "logps/rejected": -142.31790161132812, + "loss": 0.4281, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.0895094871520996, + "rewards/margins": 1.8335977792739868, + "rewards/rejected": -0.7440882921218872, + "step": 930 + }, + { + "epoch": 0.34, + "grad_norm": 41.812337440587434, + "learning_rate": 1.4773408843478865e-07, + "logits/chosen": -2.2208828926086426, + "logits/rejected": -1.9805580377578735, + "logps/chosen": -97.20782470703125, + "logps/rejected": -128.40142822265625, + "loss": 0.3873, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3034839630126953, + "rewards/margins": 1.4772690534591675, + "rewards/rejected": -0.17378509044647217, + "step": 940 + }, + { + "epoch": 0.34, + "grad_norm": 42.4893277983641, + "learning_rate": 1.4761672054519223e-07, + "logits/chosen": -2.1439785957336426, + "logits/rejected": -2.007866144180298, + "logps/chosen": -98.85569763183594, + "logps/rejected": -139.9319305419922, + "loss": 0.3826, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.8108327388763428, + "rewards/margins": 1.2574350833892822, + "rewards/rejected": 0.5533978343009949, + "step": 950 + }, + { + "epoch": 0.35, + "grad_norm": 33.46468899519379, + "learning_rate": 1.4749643827203783e-07, + "logits/chosen": -1.9708385467529297, + "logits/rejected": -1.5546438694000244, + "logps/chosen": -93.60973358154297, + "logps/rejected": -142.23947143554688, + "loss": 0.3418, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.3536150455474854, + "rewards/margins": 1.6791467666625977, + "rewards/rejected": -0.32553163170814514, + "step": 960 + }, + { + "epoch": 0.35, + "grad_norm": 37.27542798597497, + "learning_rate": 1.4737324644270786e-07, + "logits/chosen": -2.059234142303467, + "logits/rejected": -2.1984705924987793, + "logps/chosen": -100.48094177246094, + "logps/rejected": -111.93769836425781, + "loss": 0.4226, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.6639249324798584, + "rewards/margins": 1.1406528949737549, + "rewards/rejected": 0.5232721567153931, + "step": 970 + }, + { + "epoch": 0.36, + "grad_norm": 41.082132677027026, + "learning_rate": 1.4724715000135616e-07, + "logits/chosen": -1.8186811208724976, + "logits/rejected": -1.4485307931900024, + "logps/chosen": -99.40641784667969, + "logps/rejected": -124.72676086425781, + "loss": 0.4518, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.4649403095245361, + "rewards/margins": 1.2863953113555908, + "rewards/rejected": 0.1785450279712677, + "step": 980 + }, + { + "epoch": 0.36, + "grad_norm": 45.98733730441951, + "learning_rate": 1.4711815400870976e-07, + "logits/chosen": -1.4492137432098389, + "logits/rejected": -1.3497685194015503, + "logps/chosen": -80.22025299072266, + "logps/rejected": -121.85737609863281, + "loss": 0.3912, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.7067760229110718, + "rewards/margins": 2.1301562786102295, + "rewards/rejected": -0.42338013648986816, + "step": 990 + }, + { + "epoch": 0.36, + "grad_norm": 38.58236345723491, + "learning_rate": 1.4698626364186557e-07, + "logits/chosen": -2.125136137008667, + "logits/rejected": -1.753357172012329, + "logps/chosen": -95.4744644165039, + "logps/rejected": -130.9934539794922, + "loss": 0.4453, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.6447858810424805, + "rewards/margins": 1.4352115392684937, + "rewards/rejected": 0.2095741331577301, + "step": 1000 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -2.6885669231414795, + "eval_logits/rejected": -2.4480230808258057, + "eval_logps/chosen": -84.5037841796875, + "eval_logps/rejected": -127.53032684326172, + "eval_loss": 0.38953348994255066, + "eval_rewards/accuracies": 0.9285714030265808, + "eval_rewards/chosen": 1.545248031616211, + "eval_rewards/margins": 1.7611348628997803, + "eval_rewards/rejected": -0.2158866971731186, + "eval_runtime": 71.9853, + "eval_samples_per_second": 12.364, + "eval_steps_per_second": 0.194, + "step": 1000 + }, + { + "epoch": 0.37, + "grad_norm": 37.010975459317685, + "learning_rate": 1.4685148419408265e-07, + "logits/chosen": -2.1752285957336426, + "logits/rejected": -1.78762686252594, + "logps/chosen": -85.75431060791016, + "logps/rejected": -120.37774658203125, + "loss": 0.3768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.1561453342437744, + "rewards/margins": 1.0167858600616455, + "rewards/rejected": 0.1393592357635498, + "step": 1010 + }, + { + "epoch": 0.37, + "grad_norm": 35.576239317002795, + "learning_rate": 1.4671382107456988e-07, + "logits/chosen": -1.7128040790557861, + "logits/rejected": -1.5929228067398071, + "logps/chosen": -87.97055053710938, + "logps/rejected": -114.09513092041016, + "loss": 0.363, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.064514398574829, + "rewards/margins": 0.6555222272872925, + "rewards/rejected": 0.408992201089859, + "step": 1020 + }, + { + "epoch": 0.37, + "grad_norm": 48.8875893207821, + "learning_rate": 1.465732798082687e-07, + "logits/chosen": -1.8439133167266846, + "logits/rejected": -1.883114218711853, + "logps/chosen": -83.29924011230469, + "logps/rejected": -107.3209228515625, + "loss": 0.3854, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5590392351150513, + "rewards/margins": 1.844124436378479, + "rewards/rejected": -0.2850852608680725, + "step": 1030 + }, + { + "epoch": 0.38, + "grad_norm": 37.219215466416436, + "learning_rate": 1.4642986603563156e-07, + "logits/chosen": -2.579458713531494, + "logits/rejected": -2.0807945728302, + "logps/chosen": -63.25598907470703, + "logps/rejected": -125.60914611816406, + "loss": 0.39, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.9264271259307861, + "rewards/margins": 1.684195876121521, + "rewards/rejected": 0.2422313243150711, + "step": 1040 + }, + { + "epoch": 0.38, + "grad_norm": 34.650758595758035, + "learning_rate": 1.4628358551239537e-07, + "logits/chosen": -2.4567530155181885, + "logits/rejected": -1.7575896978378296, + "logps/chosen": -67.78668975830078, + "logps/rejected": -145.57591247558594, + "loss": 0.3579, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.0559427738189697, + "rewards/margins": 1.9872772693634033, + "rewards/rejected": 0.06866538524627686, + "step": 1050 + }, + { + "epoch": 0.38, + "grad_norm": 28.530221009217506, + "learning_rate": 1.461344441093506e-07, + "logits/chosen": -1.84628164768219, + "logits/rejected": -1.4732797145843506, + "logps/chosen": -92.16481018066406, + "logps/rejected": -120.20820617675781, + "loss": 0.3983, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.2543227672576904, + "rewards/margins": 0.7427952885627747, + "rewards/rejected": 0.5115275979042053, + "step": 1060 + }, + { + "epoch": 0.39, + "grad_norm": 39.23433854659703, + "learning_rate": 1.4598244781210573e-07, + "logits/chosen": -2.2004318237304688, + "logits/rejected": -1.6184364557266235, + "logps/chosen": -83.8439712524414, + "logps/rejected": -131.82962036132812, + "loss": 0.3871, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4584438800811768, + "rewards/margins": 1.8565394878387451, + "rewards/rejected": -0.39809566736221313, + "step": 1070 + }, + { + "epoch": 0.39, + "grad_norm": 35.45066510618733, + "learning_rate": 1.4582760272084676e-07, + "logits/chosen": -1.8293546438217163, + "logits/rejected": -1.3871347904205322, + "logps/chosen": -76.8940658569336, + "logps/rejected": -131.3907470703125, + "loss": 0.3741, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.866342306137085, + "rewards/margins": 1.9369169473648071, + "rewards/rejected": -0.07057473063468933, + "step": 1080 + }, + { + "epoch": 0.4, + "grad_norm": 37.751293634401364, + "learning_rate": 1.4566991505009272e-07, + "logits/chosen": -2.4931588172912598, + "logits/rejected": -2.37707781791687, + "logps/chosen": -91.82124328613281, + "logps/rejected": -113.77565002441406, + "loss": 0.3502, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.9339624643325806, + "rewards/margins": 0.9942086338996887, + "rewards/rejected": 0.9397537112236023, + "step": 1090 + }, + { + "epoch": 0.4, + "grad_norm": 68.79835073894007, + "learning_rate": 1.4550939112844606e-07, + "logits/chosen": -2.0712924003601074, + "logits/rejected": -1.6202366352081299, + "logps/chosen": -62.410888671875, + "logps/rejected": -119.64176940917969, + "loss": 0.372, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.7587225437164307, + "rewards/margins": 2.3271777629852295, + "rewards/rejected": -0.5684553980827332, + "step": 1100 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -2.6049461364746094, + "eval_logits/rejected": -2.3804662227630615, + "eval_logps/chosen": -83.14766693115234, + "eval_logps/rejected": -129.16748046875, + "eval_loss": 0.373942106962204, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 1.6808584928512573, + "eval_rewards/margins": 2.0604610443115234, + "eval_rewards/rejected": -0.37960249185562134, + "eval_runtime": 71.9703, + "eval_samples_per_second": 12.366, + "eval_steps_per_second": 0.195, + "step": 1100 + }, + { + "epoch": 0.4, + "grad_norm": 49.198262614504216, + "learning_rate": 1.453460373983387e-07, + "logits/chosen": -2.2133469581604004, + "logits/rejected": -1.7755333185195923, + "logps/chosen": -86.2526626586914, + "logps/rejected": -125.26475524902344, + "loss": 0.4013, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4716092348098755, + "rewards/margins": 2.0779099464416504, + "rewards/rejected": -0.606300950050354, + "step": 1110 + }, + { + "epoch": 0.41, + "grad_norm": 26.816352398316827, + "learning_rate": 1.451798604157734e-07, + "logits/chosen": -2.0376839637756348, + "logits/rejected": -1.5721218585968018, + "logps/chosen": -62.28696823120117, + "logps/rejected": -122.70638275146484, + "loss": 0.3371, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.9331592321395874, + "rewards/margins": 1.9306316375732422, + "rewards/rejected": 0.0025276304222643375, + "step": 1120 + }, + { + "epoch": 0.41, + "grad_norm": 30.305040452259234, + "learning_rate": 1.4501086685006087e-07, + "logits/chosen": -2.2330105304718018, + "logits/rejected": -1.6862401962280273, + "logps/chosen": -61.83301544189453, + "logps/rejected": -138.49703979492188, + "loss": 0.3442, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9937431812286377, + "rewards/margins": 2.239272356033325, + "rewards/rejected": -0.2455294132232666, + "step": 1130 + }, + { + "epoch": 0.41, + "grad_norm": 37.935713309108, + "learning_rate": 1.4483906348355185e-07, + "logits/chosen": -1.9341682195663452, + "logits/rejected": -1.8443982601165771, + "logps/chosen": -88.57161712646484, + "logps/rejected": -138.91732788085938, + "loss": 0.3578, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5548217296600342, + "rewards/margins": 1.6803414821624756, + "rewards/rejected": -0.1255197376012802, + "step": 1140 + }, + { + "epoch": 0.42, + "grad_norm": 41.30231239743997, + "learning_rate": 1.4466445721136494e-07, + "logits/chosen": -1.9577938318252563, + "logits/rejected": -1.6811765432357788, + "logps/chosen": -83.18925476074219, + "logps/rejected": -151.69259643554688, + "loss": 0.3935, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.5005887746810913, + "rewards/margins": 1.946763038635254, + "rewards/rejected": -0.4461742043495178, + "step": 1150 + }, + { + "epoch": 0.42, + "grad_norm": 47.452759957888006, + "learning_rate": 1.444870550411101e-07, + "logits/chosen": -2.1157777309417725, + "logits/rejected": -1.8774683475494385, + "logps/chosen": -72.8653793334961, + "logps/rejected": -136.59445190429688, + "loss": 0.3984, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.7430633306503296, + "rewards/margins": 1.85836660861969, + "rewards/rejected": -0.11530301719903946, + "step": 1160 + }, + { + "epoch": 0.42, + "grad_norm": 27.935593390748892, + "learning_rate": 1.443068640926072e-07, + "logits/chosen": -1.8945796489715576, + "logits/rejected": -2.1599068641662598, + "logps/chosen": -57.86497116088867, + "logps/rejected": -86.31175231933594, + "loss": 0.3544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.740037202835083, + "rewards/margins": 1.9110946655273438, + "rewards/rejected": -0.171057790517807, + "step": 1170 + }, + { + "epoch": 0.43, + "grad_norm": 40.028733125905276, + "learning_rate": 1.4412389159760025e-07, + "logits/chosen": -2.241224527359009, + "logits/rejected": -1.6412010192871094, + "logps/chosen": -76.12211608886719, + "logps/rejected": -164.74478149414062, + "loss": 0.3165, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4326903820037842, + "rewards/margins": 2.377753496170044, + "rewards/rejected": -0.9450632333755493, + "step": 1180 + }, + { + "epoch": 0.43, + "grad_norm": 47.0201653078679, + "learning_rate": 1.439381448994673e-07, + "logits/chosen": -2.094806432723999, + "logits/rejected": -1.7619339227676392, + "logps/chosen": -80.14140319824219, + "logps/rejected": -172.28895568847656, + "loss": 0.3704, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.4097177982330322, + "rewards/margins": 2.00710129737854, + "rewards/rejected": -0.5973835587501526, + "step": 1190 + }, + { + "epoch": 0.44, + "grad_norm": 45.538770655344166, + "learning_rate": 1.4374963145292563e-07, + "logits/chosen": -2.0040104389190674, + "logits/rejected": -1.7236404418945312, + "logps/chosen": -78.05459594726562, + "logps/rejected": -118.63360595703125, + "loss": 0.3191, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.8405094146728516, + "rewards/margins": 1.999542236328125, + "rewards/rejected": -0.15903277695178986, + "step": 1200 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.6045000553131104, + "eval_logits/rejected": -2.3742053508758545, + "eval_logps/chosen": -86.77649688720703, + "eval_logps/rejected": -132.74559020996094, + "eval_loss": 0.3634836673736572, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": 1.3179771900177002, + "eval_rewards/margins": 2.0553901195526123, + "eval_rewards/rejected": -0.7374131083488464, + "eval_runtime": 71.874, + "eval_samples_per_second": 12.383, + "eval_steps_per_second": 0.195, + "step": 1200 + }, + { + "epoch": 0.44, + "grad_norm": 32.36206179132804, + "learning_rate": 1.4355835882373265e-07, + "logits/chosen": -1.9400993585586548, + "logits/rejected": -1.7014987468719482, + "logps/chosen": -102.5603256225586, + "logps/rejected": -158.36917114257812, + "loss": 0.3049, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8652098774909973, + "rewards/margins": 1.5464653968811035, + "rewards/rejected": -0.6812552809715271, + "step": 1210 + }, + { + "epoch": 0.44, + "grad_norm": 42.82494337547035, + "learning_rate": 1.433643346883822e-07, + "logits/chosen": -2.0089356899261475, + "logits/rejected": -1.5017987489700317, + "logps/chosen": -91.1595687866211, + "logps/rejected": -135.00486755371094, + "loss": 0.3278, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4155638217926025, + "rewards/margins": 1.6915414333343506, + "rewards/rejected": -0.27597787976264954, + "step": 1220 + }, + { + "epoch": 0.45, + "grad_norm": 56.19237380245196, + "learning_rate": 1.4316756683379635e-07, + "logits/chosen": -1.520774483680725, + "logits/rejected": -1.4852197170257568, + "logps/chosen": -90.1426010131836, + "logps/rejected": -124.76286315917969, + "loss": 0.3839, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9303890466690063, + "rewards/margins": 2.0999183654785156, + "rewards/rejected": -0.16952911019325256, + "step": 1230 + }, + { + "epoch": 0.45, + "grad_norm": 51.35397866660894, + "learning_rate": 1.4296806315701312e-07, + "logits/chosen": -2.2964136600494385, + "logits/rejected": -1.7400308847427368, + "logps/chosen": -50.99583053588867, + "logps/rejected": -114.6817626953125, + "loss": 0.3506, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.290008783340454, + "rewards/margins": 2.5690829753875732, + "rewards/rejected": -0.27907437086105347, + "step": 1240 + }, + { + "epoch": 0.45, + "grad_norm": 36.79699061086273, + "learning_rate": 1.427658316648694e-07, + "logits/chosen": -2.083348512649536, + "logits/rejected": -1.4933042526245117, + "logps/chosen": -73.86151885986328, + "logps/rejected": -149.9459991455078, + "loss": 0.3594, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.754785180091858, + "rewards/margins": 2.0723814964294434, + "rewards/rejected": -0.3175961375236511, + "step": 1250 + }, + { + "epoch": 0.46, + "grad_norm": 61.77575626305043, + "learning_rate": 1.4256088047367958e-07, + "logits/chosen": -2.3271241188049316, + "logits/rejected": -2.1036393642425537, + "logps/chosen": -58.489654541015625, + "logps/rejected": -135.50341796875, + "loss": 0.3875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.7433359622955322, + "rewards/margins": 1.8952022790908813, + "rewards/rejected": -0.1518661081790924, + "step": 1260 + }, + { + "epoch": 0.46, + "grad_norm": 45.208373613406316, + "learning_rate": 1.423532178089099e-07, + "logits/chosen": -2.00785756111145, + "logits/rejected": -2.118194341659546, + "logps/chosen": -87.7576904296875, + "logps/rejected": -121.8628158569336, + "loss": 0.2697, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2886258363723755, + "rewards/margins": 2.014240026473999, + "rewards/rejected": -0.7256141901016235, + "step": 1270 + }, + { + "epoch": 0.46, + "grad_norm": 35.53690989690113, + "learning_rate": 1.421428520048482e-07, + "logits/chosen": -1.6762386560440063, + "logits/rejected": -1.9535388946533203, + "logps/chosen": -91.85359191894531, + "logps/rejected": -145.9041290283203, + "loss": 0.3821, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8587366342544556, + "rewards/margins": 1.5649387836456299, + "rewards/rejected": 0.2937980592250824, + "step": 1280 + }, + { + "epoch": 0.47, + "grad_norm": 33.054145363402554, + "learning_rate": 1.419297915042697e-07, + "logits/chosen": -2.3081910610198975, + "logits/rejected": -1.6348092555999756, + "logps/chosen": -73.78157806396484, + "logps/rejected": -153.98765563964844, + "loss": 0.3255, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.926365852355957, + "rewards/margins": 3.0679659843444824, + "rewards/rejected": -1.1416000127792358, + "step": 1290 + }, + { + "epoch": 0.47, + "grad_norm": 35.78921822753497, + "learning_rate": 1.4171404485809776e-07, + "logits/chosen": -1.6090662479400635, + "logits/rejected": -1.2726771831512451, + "logps/chosen": -80.95599365234375, + "logps/rejected": -132.03736877441406, + "loss": 0.3319, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.9143034219741821, + "rewards/margins": 1.9614051580429077, + "rewards/rejected": -0.04710172861814499, + "step": 1300 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.6384994983673096, + "eval_logits/rejected": -2.401343822479248, + "eval_logps/chosen": -80.92295837402344, + "eval_logps/rejected": -127.90816497802734, + "eval_loss": 0.35564500093460083, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": 1.9033304452896118, + "eval_rewards/margins": 2.1570017337799072, + "eval_rewards/rejected": -0.25367113947868347, + "eval_runtime": 72.0772, + "eval_samples_per_second": 12.348, + "eval_steps_per_second": 0.194, + "step": 1300 + }, + { + "epoch": 0.48, + "grad_norm": 41.95871729273735, + "learning_rate": 1.4149562072506109e-07, + "logits/chosen": -1.6684595346450806, + "logits/rejected": -1.6990349292755127, + "logps/chosen": -106.24676513671875, + "logps/rejected": -139.57778930664062, + "loss": 0.3228, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.357491135597229, + "rewards/margins": 1.8578159809112549, + "rewards/rejected": -0.5003247261047363, + "step": 1310 + }, + { + "epoch": 0.48, + "grad_norm": 29.34145064992561, + "learning_rate": 1.4127452787134597e-07, + "logits/chosen": -2.3935904502868652, + "logits/rejected": -1.9407179355621338, + "logps/chosen": -75.06128692626953, + "logps/rejected": -170.85089111328125, + "loss": 0.3365, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.364776372909546, + "rewards/margins": 1.6955333948135376, + "rewards/rejected": -0.33075690269470215, + "step": 1320 + }, + { + "epoch": 0.48, + "grad_norm": 54.390170546748955, + "learning_rate": 1.4105077517024458e-07, + "logits/chosen": -2.1387906074523926, + "logits/rejected": -1.6588388681411743, + "logps/chosen": -64.23258972167969, + "logps/rejected": -124.39351654052734, + "loss": 0.3544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.8940436840057373, + "rewards/margins": 2.4280219078063965, + "rewards/rejected": -0.5339787602424622, + "step": 1330 + }, + { + "epoch": 0.49, + "grad_norm": 41.328588380198035, + "learning_rate": 1.4082437160179884e-07, + "logits/chosen": -1.7820161581039429, + "logits/rejected": -1.5780375003814697, + "logps/chosen": -76.89183044433594, + "logps/rejected": -122.71630859375, + "loss": 0.3367, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.836066484451294, + "rewards/margins": 1.8271701335906982, + "rewards/rejected": 0.008896279148757458, + "step": 1340 + }, + { + "epoch": 0.49, + "grad_norm": 36.692279705097874, + "learning_rate": 1.4059532625243992e-07, + "logits/chosen": -1.921148657798767, + "logits/rejected": -1.6129744052886963, + "logps/chosen": -66.90629577636719, + "logps/rejected": -127.26859283447266, + "loss": 0.37, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4975523948669434, + "rewards/margins": 1.937352180480957, + "rewards/rejected": -0.43979987502098083, + "step": 1350 + }, + { + "epoch": 0.49, + "grad_norm": 30.243381015544752, + "learning_rate": 1.403636483146238e-07, + "logits/chosen": -2.2570159435272217, + "logits/rejected": -1.9846521615982056, + "logps/chosen": -79.24600982666016, + "logps/rejected": -115.38359069824219, + "loss": 0.3979, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5120487213134766, + "rewards/margins": 1.4373055696487427, + "rewards/rejected": 0.07474327087402344, + "step": 1360 + }, + { + "epoch": 0.5, + "grad_norm": 44.64162084878288, + "learning_rate": 1.40129347086462e-07, + "logits/chosen": -2.0078492164611816, + "logits/rejected": -1.9076550006866455, + "logps/chosen": -102.33550262451172, + "logps/rejected": -123.74501037597656, + "loss": 0.3483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.0897409915924072, + "rewards/margins": 1.6122095584869385, + "rewards/rejected": -0.5224683880805969, + "step": 1370 + }, + { + "epoch": 0.5, + "grad_norm": 39.12577532028238, + "learning_rate": 1.3989243197134876e-07, + "logits/chosen": -2.399981737136841, + "logits/rejected": -1.599557876586914, + "logps/chosen": -82.2318115234375, + "logps/rejected": -135.64981079101562, + "loss": 0.3279, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5288795232772827, + "rewards/margins": 1.9530413150787354, + "rewards/rejected": -0.42416173219680786, + "step": 1380 + }, + { + "epoch": 0.5, + "grad_norm": 36.85635152662901, + "learning_rate": 1.396529124775834e-07, + "logits/chosen": -2.5128228664398193, + "logits/rejected": -2.076643705368042, + "logps/chosen": -105.89326477050781, + "logps/rejected": -146.3048553466797, + "loss": 0.3094, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3551980257034302, + "rewards/margins": 1.9240223169326782, + "rewards/rejected": -0.5688244700431824, + "step": 1390 + }, + { + "epoch": 0.51, + "grad_norm": 46.84425675735352, + "learning_rate": 1.394107982179888e-07, + "logits/chosen": -1.8977441787719727, + "logits/rejected": -2.266324996948242, + "logps/chosen": -78.88629913330078, + "logps/rejected": -171.7329559326172, + "loss": 0.3469, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.2071471214294434, + "rewards/margins": 1.8551435470581055, + "rewards/rejected": -0.6479963064193726, + "step": 1400 + }, + { + "epoch": 0.51, + "eval_logits/chosen": -2.633074998855591, + "eval_logits/rejected": -2.3969056606292725, + "eval_logps/chosen": -84.95893096923828, + "eval_logps/rejected": -132.71334838867188, + "eval_loss": 0.3462066352367401, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 1.4997329711914062, + "eval_rewards/margins": 2.2339236736297607, + "eval_rewards/rejected": -0.7341909408569336, + "eval_runtime": 71.9731, + "eval_samples_per_second": 12.366, + "eval_steps_per_second": 0.195, + "step": 1400 + }, + { + "epoch": 0.51, + "grad_norm": 28.479823112134607, + "learning_rate": 1.3916609890952566e-07, + "logits/chosen": -1.909106969833374, + "logits/rejected": -1.6904758214950562, + "logps/chosen": -82.46297454833984, + "logps/rejected": -124.34074401855469, + "loss": 0.319, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.6145960092544556, + "rewards/margins": 2.080094575881958, + "rewards/rejected": -0.4654986262321472, + "step": 1410 + }, + { + "epoch": 0.52, + "grad_norm": 39.355457607430694, + "learning_rate": 1.3891882437290242e-07, + "logits/chosen": -1.5750586986541748, + "logits/rejected": -1.6994893550872803, + "logps/chosen": -82.91178894042969, + "logps/rejected": -111.0701675415039, + "loss": 0.4042, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.5502707958221436, + "rewards/margins": 1.4318604469299316, + "rewards/rejected": 0.1184103712439537, + "step": 1420 + }, + { + "epoch": 0.52, + "grad_norm": 43.081802728687556, + "learning_rate": 1.386689845321812e-07, + "logits/chosen": -2.426974058151245, + "logits/rejected": -1.7977014780044556, + "logps/chosen": -79.90650939941406, + "logps/rejected": -139.31407165527344, + "loss": 0.3121, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8190940618515015, + "rewards/margins": 2.4520251750946045, + "rewards/rejected": -0.6329307556152344, + "step": 1430 + }, + { + "epoch": 0.52, + "grad_norm": 38.9495726615902, + "learning_rate": 1.384165894143794e-07, + "logits/chosen": -1.8546695709228516, + "logits/rejected": -1.7970068454742432, + "logps/chosen": -105.05964660644531, + "logps/rejected": -128.0200653076172, + "loss": 0.2894, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.3824260234832764, + "rewards/margins": 1.8351023197174072, + "rewards/rejected": -0.45267629623413086, + "step": 1440 + }, + { + "epoch": 0.53, + "grad_norm": 36.527576333516, + "learning_rate": 1.381616491490674e-07, + "logits/chosen": -2.3374876976013184, + "logits/rejected": -2.048158884048462, + "logps/chosen": -73.65496063232422, + "logps/rejected": -159.6743927001953, + "loss": 0.3077, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.549155592918396, + "rewards/margins": 2.265070915222168, + "rewards/rejected": -0.7159156799316406, + "step": 1450 + }, + { + "epoch": 0.53, + "grad_norm": 37.33453867519459, + "learning_rate": 1.3790417396796205e-07, + "logits/chosen": -1.6809660196304321, + "logits/rejected": -1.6903836727142334, + "logps/chosen": -92.21663665771484, + "logps/rejected": -123.54522705078125, + "loss": 0.3688, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5740840435028076, + "rewards/margins": 2.4830660820007324, + "rewards/rejected": -0.9089819192886353, + "step": 1460 + }, + { + "epoch": 0.53, + "grad_norm": 30.137594905849312, + "learning_rate": 1.376441742045158e-07, + "logits/chosen": -1.7785037755966187, + "logits/rejected": -1.5067006349563599, + "logps/chosen": -88.78409576416016, + "logps/rejected": -144.81686401367188, + "loss": 0.2812, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1298682689666748, + "rewards/margins": 2.1699564456939697, + "rewards/rejected": -1.0400878190994263, + "step": 1470 + }, + { + "epoch": 0.54, + "grad_norm": 45.12639127949661, + "learning_rate": 1.3738166029350223e-07, + "logits/chosen": -2.215520143508911, + "logits/rejected": -1.8298145532608032, + "logps/chosen": -78.92552185058594, + "logps/rejected": -147.58212280273438, + "loss": 0.361, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.8534799814224243, + "rewards/margins": 2.4630656242370605, + "rewards/rejected": -0.6095854640007019, + "step": 1480 + }, + { + "epoch": 0.54, + "grad_norm": 30.18520757137706, + "learning_rate": 1.3711664277059714e-07, + "logits/chosen": -1.3158643245697021, + "logits/rejected": -1.4519625902175903, + "logps/chosen": -84.15853118896484, + "logps/rejected": -140.84286499023438, + "loss": 0.3058, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9669021368026733, + "rewards/margins": 2.4468729496002197, + "rewards/rejected": -0.4799710810184479, + "step": 1490 + }, + { + "epoch": 0.54, + "grad_norm": 39.46537040194847, + "learning_rate": 1.3684913227195577e-07, + "logits/chosen": -2.1749026775360107, + "logits/rejected": -1.9646120071411133, + "logps/chosen": -85.54203796386719, + "logps/rejected": -132.18484497070312, + "loss": 0.2976, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.073870062828064, + "rewards/margins": 1.4576705694198608, + "rewards/rejected": -0.3838004171848297, + "step": 1500 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -2.6264164447784424, + "eval_logits/rejected": -2.3935818672180176, + "eval_logps/chosen": -86.55225372314453, + "eval_logps/rejected": -134.98130798339844, + "eval_loss": 0.3363237977027893, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 1.3404006958007812, + "eval_rewards/margins": 2.3013861179351807, + "eval_rewards/rejected": -0.960985541343689, + "eval_runtime": 71.9646, + "eval_samples_per_second": 12.367, + "eval_steps_per_second": 0.195, + "step": 1500 + }, + { + "epoch": 0.55, + "grad_norm": 46.836051305920115, + "learning_rate": 1.365791395337859e-07, + "logits/chosen": -1.5352973937988281, + "logits/rejected": -1.6385904550552368, + "logps/chosen": -79.53118896484375, + "logps/rejected": -113.24952697753906, + "loss": 0.3002, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.4074381589889526, + "rewards/margins": 2.091799259185791, + "rewards/rejected": -0.6843612790107727, + "step": 1510 + }, + { + "epoch": 0.55, + "grad_norm": 31.950241265489773, + "learning_rate": 1.3630667539191687e-07, + "logits/chosen": -1.526102066040039, + "logits/rejected": -1.5050265789031982, + "logps/chosen": -98.58650207519531, + "logps/rejected": -133.04754638671875, + "loss": 0.3186, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.1431519985198975, + "rewards/margins": 1.964644193649292, + "rewards/rejected": -0.8214923739433289, + "step": 1520 + }, + { + "epoch": 0.56, + "grad_norm": 40.788510789631786, + "learning_rate": 1.3603175078136497e-07, + "logits/chosen": -1.8123880624771118, + "logits/rejected": -1.3758784532546997, + "logps/chosen": -68.65928649902344, + "logps/rejected": -116.91877746582031, + "loss": 0.3174, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3894504308700562, + "rewards/margins": 2.661546468734741, + "rewards/rejected": -1.2720959186553955, + "step": 1530 + }, + { + "epoch": 0.56, + "grad_norm": 46.155078059787826, + "learning_rate": 1.3575437673589428e-07, + "logits/chosen": -1.7942472696304321, + "logits/rejected": -1.867285132408142, + "logps/chosen": -94.3653793334961, + "logps/rejected": -112.46723937988281, + "loss": 0.3213, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.7745468020439148, + "rewards/margins": 1.4602556228637695, + "rewards/rejected": -0.6857088804244995, + "step": 1540 + }, + { + "epoch": 0.56, + "grad_norm": 39.375049143103645, + "learning_rate": 1.3547456438757397e-07, + "logits/chosen": -2.056170701980591, + "logits/rejected": -1.2939419746398926, + "logps/chosen": -89.32717895507812, + "logps/rejected": -133.4949493408203, + "loss": 0.3142, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.4916505813598633, + "rewards/margins": 1.9974079132080078, + "rewards/rejected": -0.5057573914527893, + "step": 1550 + }, + { + "epoch": 0.57, + "grad_norm": 32.64828907806418, + "learning_rate": 1.3519232496633152e-07, + "logits/chosen": -2.2024412155151367, + "logits/rejected": -1.6808528900146484, + "logps/chosen": -64.60079193115234, + "logps/rejected": -127.1625747680664, + "loss": 0.3165, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7459484338760376, + "rewards/margins": 2.624204158782959, + "rewards/rejected": -0.8782557249069214, + "step": 1560 + }, + { + "epoch": 0.57, + "grad_norm": 37.74052323913691, + "learning_rate": 1.3490766979950211e-07, + "logits/chosen": -2.4449658393859863, + "logits/rejected": -2.1780996322631836, + "logps/chosen": -63.970916748046875, + "logps/rejected": -116.03575134277344, + "loss": 0.285, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5420334339141846, + "rewards/margins": 2.1397767066955566, + "rewards/rejected": -0.5977433919906616, + "step": 1570 + }, + { + "epoch": 0.57, + "grad_norm": 38.57331331754987, + "learning_rate": 1.3462061031137382e-07, + "logits/chosen": -1.8334707021713257, + "logits/rejected": -1.6402419805526733, + "logps/chosen": -81.07013702392578, + "logps/rejected": -196.51010131835938, + "loss": 0.3205, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.801913857460022, + "rewards/margins": 3.0623841285705566, + "rewards/rejected": -1.2604701519012451, + "step": 1580 + }, + { + "epoch": 0.58, + "grad_norm": 35.786587394552, + "learning_rate": 1.3433115802272929e-07, + "logits/chosen": -1.9113935232162476, + "logits/rejected": -1.5720059871673584, + "logps/chosen": -72.0071792602539, + "logps/rejected": -155.9192352294922, + "loss": 0.2904, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8154712915420532, + "rewards/margins": 2.6344165802001953, + "rewards/rejected": -0.8189449310302734, + "step": 1590 + }, + { + "epoch": 0.58, + "grad_norm": 38.20450363937764, + "learning_rate": 1.3403932455038328e-07, + "logits/chosen": -2.0501441955566406, + "logits/rejected": -1.8184171915054321, + "logps/chosen": -86.4954833984375, + "logps/rejected": -138.66053771972656, + "loss": 0.2839, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.814734697341919, + "rewards/margins": 2.979191303253174, + "rewards/rejected": -1.1644567251205444, + "step": 1600 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.5862371921539307, + "eval_logits/rejected": -2.3639228343963623, + "eval_logps/chosen": -84.44693756103516, + "eval_logps/rejected": -134.8124237060547, + "eval_loss": 0.3325304090976715, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": 1.5509331226348877, + "eval_rewards/margins": 2.4950315952301025, + "eval_rewards/rejected": -0.9440980553627014, + "eval_runtime": 71.9558, + "eval_samples_per_second": 12.369, + "eval_steps_per_second": 0.195, + "step": 1600 + }, + { + "epoch": 0.58, + "grad_norm": 35.827979868952966, + "learning_rate": 1.3374512160671644e-07, + "logits/chosen": -1.5930265188217163, + "logits/rejected": -1.6432759761810303, + "logps/chosen": -101.54774475097656, + "logps/rejected": -133.1485595703125, + "loss": 0.3038, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5359328389167786, + "rewards/margins": 1.9368603229522705, + "rewards/rejected": -1.4009274244308472, + "step": 1610 + }, + { + "epoch": 0.59, + "grad_norm": 30.92177760123355, + "learning_rate": 1.3344856099920526e-07, + "logits/chosen": -1.9082187414169312, + "logits/rejected": -1.8227027654647827, + "logps/chosen": -100.11506652832031, + "logps/rejected": -131.5067596435547, + "loss": 0.3262, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.2241116762161255, + "rewards/margins": 1.7369029521942139, + "rewards/rejected": -0.5127913355827332, + "step": 1620 + }, + { + "epoch": 0.59, + "grad_norm": 37.15625275272501, + "learning_rate": 1.3314965462994826e-07, + "logits/chosen": -1.8317235708236694, + "logits/rejected": -1.373651146888733, + "logps/chosen": -88.5484848022461, + "logps/rejected": -131.82040405273438, + "loss": 0.2875, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9509607553482056, + "rewards/margins": 2.071371555328369, + "rewards/rejected": -1.120410680770874, + "step": 1630 + }, + { + "epoch": 0.6, + "grad_norm": 45.37302624095175, + "learning_rate": 1.3284841449518813e-07, + "logits/chosen": -1.686141014099121, + "logits/rejected": -1.2155983448028564, + "logps/chosen": -99.9353256225586, + "logps/rejected": -144.32980346679688, + "loss": 0.3014, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9641411900520325, + "rewards/margins": 1.8706451654434204, + "rewards/rejected": -0.9065039753913879, + "step": 1640 + }, + { + "epoch": 0.6, + "grad_norm": 55.85191593624814, + "learning_rate": 1.3254485268483055e-07, + "logits/chosen": -1.7119086980819702, + "logits/rejected": -1.548729419708252, + "logps/chosen": -92.55009460449219, + "logps/rejected": -211.9893341064453, + "loss": 0.3211, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1339528560638428, + "rewards/margins": 2.2448153495788574, + "rewards/rejected": -1.1108627319335938, + "step": 1650 + }, + { + "epoch": 0.6, + "grad_norm": 36.20383771458562, + "learning_rate": 1.3223898138195864e-07, + "logits/chosen": -2.340451240539551, + "logits/rejected": -1.6691877841949463, + "logps/chosen": -57.80481719970703, + "logps/rejected": -119.7285385131836, + "loss": 0.2937, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.0053391456604004, + "rewards/margins": 2.7729382514953613, + "rewards/rejected": -0.7675992250442505, + "step": 1660 + }, + { + "epoch": 0.61, + "grad_norm": 33.03559928672513, + "learning_rate": 1.319308128623443e-07, + "logits/chosen": -1.8629367351531982, + "logits/rejected": -1.5715751647949219, + "logps/chosen": -94.56365203857422, + "logps/rejected": -130.30677795410156, + "loss": 0.2732, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7679950594902039, + "rewards/margins": 2.2738566398620605, + "rewards/rejected": -1.5058612823486328, + "step": 1670 + }, + { + "epoch": 0.61, + "grad_norm": 39.357859474732834, + "learning_rate": 1.3162035949395548e-07, + "logits/chosen": -1.9703582525253296, + "logits/rejected": -1.6913745403289795, + "logps/chosen": -68.82579040527344, + "logps/rejected": -130.0358123779297, + "loss": 0.3041, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3682914972305298, + "rewards/margins": 2.242696762084961, + "rewards/rejected": -0.874405562877655, + "step": 1680 + }, + { + "epoch": 0.61, + "grad_norm": 44.19717913326496, + "learning_rate": 1.3130763373645956e-07, + "logits/chosen": -2.4028756618499756, + "logits/rejected": -1.5053786039352417, + "logps/chosen": -93.85604095458984, + "logps/rejected": -234.1709747314453, + "loss": 0.3071, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.169268250465393, + "rewards/margins": 3.234970808029175, + "rewards/rejected": -2.0657026767730713, + "step": 1690 + }, + { + "epoch": 0.62, + "grad_norm": 30.436503628615085, + "learning_rate": 1.309926481407237e-07, + "logits/chosen": -2.1171786785125732, + "logits/rejected": -1.6411173343658447, + "logps/chosen": -81.68711853027344, + "logps/rejected": -157.5670166015625, + "loss": 0.3095, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.531718373298645, + "rewards/margins": 2.767521858215332, + "rewards/rejected": -1.2358036041259766, + "step": 1700 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.6024363040924072, + "eval_logits/rejected": -2.381307601928711, + "eval_logps/chosen": -88.0004653930664, + "eval_logps/rejected": -138.6060028076172, + "eval_loss": 0.3237309455871582, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 1.1955795288085938, + "eval_rewards/margins": 2.519036054611206, + "eval_rewards/rejected": -1.3234565258026123, + "eval_runtime": 71.9205, + "eval_samples_per_second": 12.375, + "eval_steps_per_second": 0.195, + "step": 1700 + }, + { + "epoch": 0.62, + "grad_norm": 36.27177663241429, + "learning_rate": 1.3067541534831074e-07, + "logits/chosen": -2.4922726154327393, + "logits/rejected": -1.9374639987945557, + "logps/chosen": -95.45484161376953, + "logps/rejected": -176.23080444335938, + "loss": 0.3039, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.721973180770874, + "rewards/margins": 2.6191511154174805, + "rewards/rejected": -1.897178053855896, + "step": 1710 + }, + { + "epoch": 0.62, + "grad_norm": 32.7184977007972, + "learning_rate": 1.303559480909721e-07, + "logits/chosen": -2.122936487197876, + "logits/rejected": -1.8037102222442627, + "logps/chosen": -88.16950988769531, + "logps/rejected": -144.52603149414062, + "loss": 0.2703, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2837637662887573, + "rewards/margins": 2.390108585357666, + "rewards/rejected": -1.1063446998596191, + "step": 1720 + }, + { + "epoch": 0.63, + "grad_norm": 29.34834638089634, + "learning_rate": 1.3003425919013677e-07, + "logits/chosen": -1.9752366542816162, + "logits/rejected": -1.5035978555679321, + "logps/chosen": -87.86656188964844, + "logps/rejected": -164.7413330078125, + "loss": 0.2606, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7725374698638916, + "rewards/margins": 3.28916597366333, + "rewards/rejected": -1.5166288614273071, + "step": 1730 + }, + { + "epoch": 0.63, + "grad_norm": 36.999465061787745, + "learning_rate": 1.2971036155639654e-07, + "logits/chosen": -1.8440446853637695, + "logits/rejected": -1.6909431219100952, + "logps/chosen": -92.90309143066406, + "logps/rejected": -143.26986694335938, + "loss": 0.3133, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0839173793792725, + "rewards/margins": 2.5392398834228516, + "rewards/rejected": -1.455322504043579, + "step": 1740 + }, + { + "epoch": 0.64, + "grad_norm": 38.30391165242112, + "learning_rate": 1.293842681889882e-07, + "logits/chosen": -2.0229029655456543, + "logits/rejected": -1.4855544567108154, + "logps/chosen": -93.77799987792969, + "logps/rejected": -204.0419464111328, + "loss": 0.3124, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2000395059585571, + "rewards/margins": 2.4565627574920654, + "rewards/rejected": -1.2565232515335083, + "step": 1750 + }, + { + "epoch": 0.64, + "grad_norm": 41.551439437509096, + "learning_rate": 1.290559921752715e-07, + "logits/chosen": -2.3583412170410156, + "logits/rejected": -2.0657455921173096, + "logps/chosen": -85.60011291503906, + "logps/rejected": -131.65814208984375, + "loss": 0.3016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.3455216884613037, + "rewards/margins": 2.5204150676727295, + "rewards/rejected": -1.1748934984207153, + "step": 1760 + }, + { + "epoch": 0.64, + "grad_norm": 30.177779462195463, + "learning_rate": 1.287255466902041e-07, + "logits/chosen": -1.8713395595550537, + "logits/rejected": -1.5737354755401611, + "logps/chosen": -73.5456314086914, + "logps/rejected": -161.8707275390625, + "loss": 0.2684, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3379333019256592, + "rewards/margins": 3.22587251663208, + "rewards/rejected": -1.887939214706421, + "step": 1770 + }, + { + "epoch": 0.65, + "grad_norm": 42.266893865591655, + "learning_rate": 1.2839294499581266e-07, + "logits/chosen": -1.8609529733657837, + "logits/rejected": -1.9660890102386475, + "logps/chosen": -90.25117492675781, + "logps/rejected": -139.0089111328125, + "loss": 0.2725, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4452346563339233, + "rewards/margins": 2.1682021617889404, + "rewards/rejected": -0.7229675054550171, + "step": 1780 + }, + { + "epoch": 0.65, + "grad_norm": 45.909064669287716, + "learning_rate": 1.280582004406608e-07, + "logits/chosen": -2.2296082973480225, + "logits/rejected": -1.7719411849975586, + "logps/chosen": -91.33625793457031, + "logps/rejected": -132.76153564453125, + "loss": 0.3173, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.3035770654678345, + "rewards/margins": 2.369469165802002, + "rewards/rejected": -1.0658921003341675, + "step": 1790 + }, + { + "epoch": 0.65, + "grad_norm": 38.096350658013705, + "learning_rate": 1.2772132645931315e-07, + "logits/chosen": -1.8231449127197266, + "logits/rejected": -1.4982807636260986, + "logps/chosen": -77.42939758300781, + "logps/rejected": -142.36465454101562, + "loss": 0.2593, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.426496982574463, + "rewards/margins": 2.347156047821045, + "rewards/rejected": -0.9206587672233582, + "step": 1800 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.613116502761841, + "eval_logits/rejected": -2.38425612449646, + "eval_logps/chosen": -87.31402587890625, + "eval_logps/rejected": -137.7794952392578, + "eval_loss": 0.3188331723213196, + "eval_rewards/accuracies": 0.8392857313156128, + "eval_rewards/chosen": 1.2642244100570679, + "eval_rewards/margins": 2.5050292015075684, + "eval_rewards/rejected": -1.240804672241211, + "eval_runtime": 71.9246, + "eval_samples_per_second": 12.374, + "eval_steps_per_second": 0.195, + "step": 1800 + }, + { + "epoch": 0.66, + "grad_norm": 39.50184113749164, + "learning_rate": 1.273823365717963e-07, + "logits/chosen": -1.7294566631317139, + "logits/rejected": -1.341367244720459, + "logps/chosen": -74.58464050292969, + "logps/rejected": -133.0430450439453, + "loss": 0.3023, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.7522623538970947, + "rewards/margins": 2.6015543937683105, + "rewards/rejected": -0.8492921590805054, + "step": 1810 + }, + { + "epoch": 0.66, + "grad_norm": 42.89748213165641, + "learning_rate": 1.270412443830562e-07, + "logits/chosen": -1.636803388595581, + "logits/rejected": -1.4411513805389404, + "logps/chosen": -90.65261840820312, + "logps/rejected": -148.75440979003906, + "loss": 0.2737, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4128892421722412, + "rewards/margins": 2.5060081481933594, + "rewards/rejected": -1.0931187868118286, + "step": 1820 + }, + { + "epoch": 0.66, + "grad_norm": 49.9218877269305, + "learning_rate": 1.2669806358241194e-07, + "logits/chosen": -1.672900915145874, + "logits/rejected": -1.657602310180664, + "logps/chosen": -102.61698150634766, + "logps/rejected": -124.9526138305664, + "loss": 0.3258, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2539775371551514, + "rewards/margins": 1.9778436422348022, + "rewards/rejected": -0.7238659858703613, + "step": 1830 + }, + { + "epoch": 0.67, + "grad_norm": 52.08013573210232, + "learning_rate": 1.2635280794300674e-07, + "logits/chosen": -1.9965959787368774, + "logits/rejected": -1.7998332977294922, + "logps/chosen": -89.28053283691406, + "logps/rejected": -126.0293960571289, + "loss": 0.2968, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1432512998580933, + "rewards/margins": 1.8423658609390259, + "rewards/rejected": -0.6991146802902222, + "step": 1840 + }, + { + "epoch": 0.67, + "grad_norm": 57.840988304922966, + "learning_rate": 1.260054913212547e-07, + "logits/chosen": -2.127389907836914, + "logits/rejected": -1.7502504587173462, + "logps/chosen": -71.455078125, + "logps/rejected": -115.15047454833984, + "loss": 0.3373, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6597360372543335, + "rewards/margins": 2.4347267150878906, + "rewards/rejected": -0.7749907970428467, + "step": 1850 + }, + { + "epoch": 0.68, + "grad_norm": 53.530462845694025, + "learning_rate": 1.2565612765628513e-07, + "logits/chosen": -1.9101688861846924, + "logits/rejected": -1.6512504816055298, + "logps/chosen": -74.90973663330078, + "logps/rejected": -138.97523498535156, + "loss": 0.3027, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.8204338550567627, + "rewards/margins": 2.453209161758423, + "rewards/rejected": -0.6327755451202393, + "step": 1860 + }, + { + "epoch": 0.68, + "grad_norm": 37.85257138685388, + "learning_rate": 1.2530473096938278e-07, + "logits/chosen": -1.8988683223724365, + "logits/rejected": -1.6503827571868896, + "logps/chosen": -79.6770248413086, + "logps/rejected": -143.09071350097656, + "loss": 0.3155, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.498981237411499, + "rewards/margins": 2.404602527618408, + "rewards/rejected": -0.9056210517883301, + "step": 1870 + }, + { + "epoch": 0.68, + "grad_norm": 50.64293606347689, + "learning_rate": 1.2495131536342536e-07, + "logits/chosen": -2.128816843032837, + "logits/rejected": -1.7718677520751953, + "logps/chosen": -86.67499542236328, + "logps/rejected": -179.9734344482422, + "loss": 0.2966, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5343300104141235, + "rewards/margins": 2.9166336059570312, + "rewards/rejected": -1.3823034763336182, + "step": 1880 + }, + { + "epoch": 0.69, + "grad_norm": 37.57626214701421, + "learning_rate": 1.245958950223174e-07, + "logits/chosen": -1.349810242652893, + "logits/rejected": -1.1940224170684814, + "logps/chosen": -87.54304504394531, + "logps/rejected": -124.6638412475586, + "loss": 0.3138, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8737591505050659, + "rewards/margins": 1.981792688369751, + "rewards/rejected": -1.1080334186553955, + "step": 1890 + }, + { + "epoch": 0.69, + "grad_norm": 37.541514062279354, + "learning_rate": 1.24238484210421e-07, + "logits/chosen": -2.122223138809204, + "logits/rejected": -1.972744345664978, + "logps/chosen": -123.90694427490234, + "logps/rejected": -174.65133666992188, + "loss": 0.2394, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4691895842552185, + "rewards/margins": 1.7866857051849365, + "rewards/rejected": -1.3174960613250732, + "step": 1900 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -2.6142075061798096, + "eval_logits/rejected": -2.388277053833008, + "eval_logps/chosen": -86.47371673583984, + "eval_logps/rejected": -137.2868194580078, + "eval_loss": 0.3111128807067871, + "eval_rewards/accuracies": 0.8392857313156128, + "eval_rewards/chosen": 1.3482542037963867, + "eval_rewards/margins": 2.539792776107788, + "eval_rewards/rejected": -1.1915382146835327, + "eval_runtime": 71.9156, + "eval_samples_per_second": 12.376, + "eval_steps_per_second": 0.195, + "step": 1900 + }, + { + "epoch": 0.69, + "grad_norm": 48.134112820259595, + "learning_rate": 1.2387909727198345e-07, + "logits/chosen": -2.399280309677124, + "logits/rejected": -2.280487537384033, + "logps/chosen": -97.30809020996094, + "logps/rejected": -159.677734375, + "loss": 0.2845, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.0647090673446655, + "rewards/margins": 1.8406950235366821, + "rewards/rejected": -0.7759860754013062, + "step": 1910 + }, + { + "epoch": 0.7, + "grad_norm": 49.91211298121504, + "learning_rate": 1.2351774863056134e-07, + "logits/chosen": -2.3767571449279785, + "logits/rejected": -1.682460069656372, + "logps/chosen": -62.444793701171875, + "logps/rejected": -145.32229614257812, + "loss": 0.2989, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9718739986419678, + "rewards/margins": 3.5472664833068848, + "rewards/rejected": -1.575392246246338, + "step": 1920 + }, + { + "epoch": 0.7, + "grad_norm": 29.38061476258355, + "learning_rate": 1.2315445278844197e-07, + "logits/chosen": -2.427912950515747, + "logits/rejected": -1.9351403713226318, + "logps/chosen": -73.01206970214844, + "logps/rejected": -209.5726776123047, + "loss": 0.2909, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7785199880599976, + "rewards/margins": 2.738701581954956, + "rewards/rejected": -0.9601818323135376, + "step": 1930 + }, + { + "epoch": 0.7, + "grad_norm": 26.75959860175755, + "learning_rate": 1.227892243260611e-07, + "logits/chosen": -2.469193696975708, + "logits/rejected": -1.6766465902328491, + "logps/chosen": -56.01939010620117, + "logps/rejected": -119.8520736694336, + "loss": 0.3141, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.8703302145004272, + "rewards/margins": 2.945542335510254, + "rewards/rejected": -1.0752121210098267, + "step": 1940 + }, + { + "epoch": 0.71, + "grad_norm": 38.461850874562536, + "learning_rate": 1.224220779014178e-07, + "logits/chosen": -1.9026371240615845, + "logits/rejected": -1.4415004253387451, + "logps/chosen": -97.89253997802734, + "logps/rejected": -149.56936645507812, + "loss": 0.2902, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6727228164672852, + "rewards/margins": 2.737921953201294, + "rewards/rejected": -1.0651990175247192, + "step": 1950 + }, + { + "epoch": 0.71, + "grad_norm": 31.657055351360636, + "learning_rate": 1.2205302824948634e-07, + "logits/chosen": -2.2927825450897217, + "logits/rejected": -1.8084007501602173, + "logps/chosen": -80.52983093261719, + "logps/rejected": -156.01187133789062, + "loss": 0.3306, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.4960755109786987, + "rewards/margins": 2.939119338989258, + "rewards/rejected": -1.4430434703826904, + "step": 1960 + }, + { + "epoch": 0.72, + "grad_norm": 50.459905943712606, + "learning_rate": 1.2168209018162456e-07, + "logits/chosen": -2.1764161586761475, + "logits/rejected": -1.9086568355560303, + "logps/chosen": -97.29804992675781, + "logps/rejected": -158.05215454101562, + "loss": 0.2753, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2845994234085083, + "rewards/margins": 2.076698064804077, + "rewards/rejected": -0.7920988202095032, + "step": 1970 + }, + { + "epoch": 0.72, + "grad_norm": 34.05499830411277, + "learning_rate": 1.2130927858497966e-07, + "logits/chosen": -1.833762526512146, + "logits/rejected": -1.881530523300171, + "logps/chosen": -108.65325927734375, + "logps/rejected": -144.40902709960938, + "loss": 0.3155, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.106397271156311, + "rewards/margins": 2.1092371940612793, + "rewards/rejected": -1.0028399229049683, + "step": 1980 + }, + { + "epoch": 0.72, + "grad_norm": 18.84745830969425, + "learning_rate": 1.209346084218906e-07, + "logits/chosen": -1.7890899181365967, + "logits/rejected": -1.7402870655059814, + "logps/chosen": -86.61128234863281, + "logps/rejected": -116.84139251708984, + "loss": 0.3061, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.6292413473129272, + "rewards/margins": 2.007689952850342, + "rewards/rejected": -0.3784485161304474, + "step": 1990 + }, + { + "epoch": 0.73, + "grad_norm": 48.3959022523091, + "learning_rate": 1.2055809472928762e-07, + "logits/chosen": -1.7730016708374023, + "logits/rejected": -1.6386531591415405, + "logps/chosen": -79.35774230957031, + "logps/rejected": -126.07957458496094, + "loss": 0.3234, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5467567443847656, + "rewards/margins": 2.7957520484924316, + "rewards/rejected": -1.2489951848983765, + "step": 2000 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.596472978591919, + "eval_logits/rejected": -2.3702468872070312, + "eval_logps/chosen": -86.58128356933594, + "eval_logps/rejected": -138.30990600585938, + "eval_loss": 0.3054315149784088, + "eval_rewards/accuracies": 0.8392857313156128, + "eval_rewards/chosen": 1.3374978303909302, + "eval_rewards/margins": 2.6313438415527344, + "eval_rewards/rejected": -1.2938461303710938, + "eval_runtime": 71.9938, + "eval_samples_per_second": 12.362, + "eval_steps_per_second": 0.194, + "step": 2000 + }, + { + "epoch": 0.73, + "grad_norm": 53.09826173444019, + "learning_rate": 1.2017975261808887e-07, + "logits/chosen": -1.9460725784301758, + "logits/rejected": -1.7900736331939697, + "logps/chosen": -103.51519775390625, + "logps/rejected": -141.5853271484375, + "loss": 0.3128, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3981924057006836, + "rewards/margins": 2.500655174255371, + "rewards/rejected": -1.1024627685546875, + "step": 2010 + }, + { + "epoch": 0.73, + "grad_norm": 39.783453163990046, + "learning_rate": 1.1979959727259367e-07, + "logits/chosen": -1.8045324087142944, + "logits/rejected": -1.6309808492660522, + "logps/chosen": -92.61442565917969, + "logps/rejected": -154.93099975585938, + "loss": 0.2678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2143778800964355, + "rewards/margins": 2.35176420211792, + "rewards/rejected": -1.137386441230774, + "step": 2020 + }, + { + "epoch": 0.74, + "grad_norm": 39.81682054465197, + "learning_rate": 1.1941764394987351e-07, + "logits/chosen": -1.738677740097046, + "logits/rejected": -1.5726416110992432, + "logps/chosen": -82.56022644042969, + "logps/rejected": -151.69943237304688, + "loss": 0.2551, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.240398645401001, + "rewards/margins": 2.4828732013702393, + "rewards/rejected": -1.2424746751785278, + "step": 2030 + }, + { + "epoch": 0.74, + "grad_norm": 37.01944950877088, + "learning_rate": 1.1903390797915929e-07, + "logits/chosen": -2.1611745357513428, + "logits/rejected": -1.4224140644073486, + "logps/chosen": -80.29561614990234, + "logps/rejected": -215.88174438476562, + "loss": 0.2887, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6173490285873413, + "rewards/margins": 3.720074415206909, + "rewards/rejected": -2.1027255058288574, + "step": 2040 + }, + { + "epoch": 0.74, + "grad_norm": 40.82628805207018, + "learning_rate": 1.1864840476122647e-07, + "logits/chosen": -1.8892196416854858, + "logits/rejected": -1.9005800485610962, + "logps/chosen": -69.93404388427734, + "logps/rejected": -123.88677978515625, + "loss": 0.2882, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8273292779922485, + "rewards/margins": 3.182246685028076, + "rewards/rejected": -1.354917287826538, + "step": 2050 + }, + { + "epoch": 0.75, + "grad_norm": 25.39556496888905, + "learning_rate": 1.1826114976777678e-07, + "logits/chosen": -1.7499809265136719, + "logits/rejected": -1.4074931144714355, + "logps/chosen": -91.85431671142578, + "logps/rejected": -170.28402709960938, + "loss": 0.2753, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5811419486999512, + "rewards/margins": 3.3795552253723145, + "rewards/rejected": -1.7984129190444946, + "step": 2060 + }, + { + "epoch": 0.75, + "grad_norm": 27.81061856916663, + "learning_rate": 1.1787215854081724e-07, + "logits/chosen": -2.056485652923584, + "logits/rejected": -1.6245391368865967, + "logps/chosen": -80.32780456542969, + "logps/rejected": -132.90272521972656, + "loss": 0.3135, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4287405014038086, + "rewards/margins": 2.6631085872650146, + "rewards/rejected": -1.2343682050704956, + "step": 2070 + }, + { + "epoch": 0.75, + "grad_norm": 30.567746366717504, + "learning_rate": 1.1748144669203663e-07, + "logits/chosen": -1.631363868713379, + "logits/rejected": -1.6502500772476196, + "logps/chosen": -96.3510971069336, + "logps/rejected": -155.26380920410156, + "loss": 0.2693, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1531099081039429, + "rewards/margins": 2.349030017852783, + "rewards/rejected": -1.1959199905395508, + "step": 2080 + }, + { + "epoch": 0.76, + "grad_norm": 40.52562795601512, + "learning_rate": 1.1708902990217868e-07, + "logits/chosen": -1.8120248317718506, + "logits/rejected": -1.6999822854995728, + "logps/chosen": -73.98417663574219, + "logps/rejected": -166.10572814941406, + "loss": 0.2698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.971939742565155, + "rewards/margins": 3.178133487701416, + "rewards/rejected": -2.206193685531616, + "step": 2090 + }, + { + "epoch": 0.76, + "grad_norm": 42.93005039151693, + "learning_rate": 1.166949239204129e-07, + "logits/chosen": -2.2130112648010254, + "logits/rejected": -1.7474415302276611, + "logps/chosen": -89.34376525878906, + "logps/rejected": -188.7493133544922, + "loss": 0.2532, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4905040264129639, + "rewards/margins": 3.4080872535705566, + "rewards/rejected": -1.9175831079483032, + "step": 2100 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.5582356452941895, + "eval_logits/rejected": -2.3411316871643066, + "eval_logps/chosen": -86.91837310791016, + "eval_logps/rejected": -139.59432983398438, + "eval_loss": 0.30377882719039917, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": 1.303788423538208, + "eval_rewards/margins": 2.72607684135437, + "eval_rewards/rejected": -1.4222885370254517, + "eval_runtime": 71.8831, + "eval_samples_per_second": 12.381, + "eval_steps_per_second": 0.195, + "step": 2100 + }, + { + "epoch": 0.77, + "grad_norm": 53.74378080351859, + "learning_rate": 1.1629914456370243e-07, + "logits/chosen": -1.6835792064666748, + "logits/rejected": -1.5631754398345947, + "logps/chosen": -79.44068908691406, + "logps/rejected": -109.39453125, + "loss": 0.3213, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8239709734916687, + "rewards/margins": 1.709472417831421, + "rewards/rejected": -0.885501503944397, + "step": 2110 + }, + { + "epoch": 0.77, + "grad_norm": 45.76775103982164, + "learning_rate": 1.1590170771616929e-07, + "logits/chosen": -2.1422882080078125, + "logits/rejected": -2.01249361038208, + "logps/chosen": -70.21590423583984, + "logps/rejected": -106.15130615234375, + "loss": 0.2981, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9929507970809937, + "rewards/margins": 1.7180888652801514, + "rewards/rejected": 0.27486199140548706, + "step": 2120 + }, + { + "epoch": 0.77, + "grad_norm": 34.522061772377945, + "learning_rate": 1.1550262932845691e-07, + "logits/chosen": -1.712194800376892, + "logits/rejected": -1.602046251296997, + "logps/chosen": -87.20169830322266, + "logps/rejected": -150.86685180664062, + "loss": 0.2747, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7278430461883545, + "rewards/margins": 2.8564956188201904, + "rewards/rejected": -1.1286523342132568, + "step": 2130 + }, + { + "epoch": 0.78, + "grad_norm": 54.87111432142923, + "learning_rate": 1.1510192541708984e-07, + "logits/chosen": -1.5016837120056152, + "logits/rejected": -1.4684686660766602, + "logps/chosen": -72.81539916992188, + "logps/rejected": -142.9152374267578, + "loss": 0.3061, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.2450437545776367, + "rewards/margins": 3.1693520545959473, + "rewards/rejected": -0.9243084192276001, + "step": 2140 + }, + { + "epoch": 0.78, + "grad_norm": 50.33350490881025, + "learning_rate": 1.1469961206383114e-07, + "logits/chosen": -2.383873462677002, + "logits/rejected": -2.1281042098999023, + "logps/chosen": -93.19439697265625, + "logps/rejected": -159.3778076171875, + "loss": 0.2755, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0546815395355225, + "rewards/margins": 2.3399760723114014, + "rewards/rejected": -1.285294532775879, + "step": 2150 + }, + { + "epoch": 0.78, + "grad_norm": 29.94921093499752, + "learning_rate": 1.1429570541503681e-07, + "logits/chosen": -1.880446434020996, + "logits/rejected": -1.4601715803146362, + "logps/chosen": -97.71735382080078, + "logps/rejected": -151.2371368408203, + "loss": 0.2765, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.2635929584503174, + "rewards/margins": 3.5359420776367188, + "rewards/rejected": -2.2723495960235596, + "step": 2160 + }, + { + "epoch": 0.79, + "grad_norm": 38.26921815972567, + "learning_rate": 1.1389022168100782e-07, + "logits/chosen": -1.897870659828186, + "logits/rejected": -1.5864663124084473, + "logps/chosen": -91.51280212402344, + "logps/rejected": -171.35614013671875, + "loss": 0.2715, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.0029170513153076, + "rewards/margins": 2.7681167125701904, + "rewards/rejected": -1.7651996612548828, + "step": 2170 + }, + { + "epoch": 0.79, + "grad_norm": 25.55797410044776, + "learning_rate": 1.1348317713533955e-07, + "logits/chosen": -1.8892625570297241, + "logits/rejected": -1.941860556602478, + "logps/chosen": -112.3303451538086, + "logps/rejected": -152.99148559570312, + "loss": 0.2756, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2206674814224243, + "rewards/margins": 2.4190514087677, + "rewards/rejected": -1.1983839273452759, + "step": 2180 + }, + { + "epoch": 0.79, + "grad_norm": 35.1411721341359, + "learning_rate": 1.1307458811426865e-07, + "logits/chosen": -2.288433313369751, + "logits/rejected": -1.941382646560669, + "logps/chosen": -117.0739517211914, + "logps/rejected": -211.00942993164062, + "loss": 0.2902, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.4456199109554291, + "rewards/margins": 3.088353395462036, + "rewards/rejected": -2.642733335494995, + "step": 2190 + }, + { + "epoch": 0.8, + "grad_norm": 33.568721302560235, + "learning_rate": 1.1266447101601738e-07, + "logits/chosen": -2.234501361846924, + "logits/rejected": -1.5646653175354004, + "logps/chosen": -85.79940795898438, + "logps/rejected": -137.91712951660156, + "loss": 0.2862, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6818853616714478, + "rewards/margins": 1.9311177730560303, + "rewards/rejected": -1.249232530593872, + "step": 2200 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.611751079559326, + "eval_logits/rejected": -2.3868792057037354, + "eval_logps/chosen": -90.76590728759766, + "eval_logps/rejected": -142.14036560058594, + "eval_loss": 0.29903972148895264, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.9190365672111511, + "eval_rewards/margins": 2.59592866897583, + "eval_rewards/rejected": -1.6768922805786133, + "eval_runtime": 71.9273, + "eval_samples_per_second": 12.374, + "eval_steps_per_second": 0.195, + "step": 2200 + }, + { + "epoch": 0.8, + "grad_norm": 40.79977336139254, + "learning_rate": 1.1225284230013554e-07, + "logits/chosen": -1.9493424892425537, + "logits/rejected": -1.7713226079940796, + "logps/chosen": -127.34124755859375, + "logps/rejected": -142.61746215820312, + "loss": 0.2739, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7090291976928711, + "rewards/margins": 2.365572452545166, + "rewards/rejected": -1.656543493270874, + "step": 2210 + }, + { + "epoch": 0.81, + "grad_norm": 42.792407711246895, + "learning_rate": 1.118397184868399e-07, + "logits/chosen": -1.76601243019104, + "logits/rejected": -1.4212825298309326, + "logps/chosen": -80.1509780883789, + "logps/rejected": -130.9561004638672, + "loss": 0.2638, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3400588035583496, + "rewards/margins": 2.613032817840576, + "rewards/rejected": -1.272973895072937, + "step": 2220 + }, + { + "epoch": 0.81, + "grad_norm": 32.9359485242729, + "learning_rate": 1.1142511615635106e-07, + "logits/chosen": -2.112626791000366, + "logits/rejected": -1.8563470840454102, + "logps/chosen": -96.79916381835938, + "logps/rejected": -137.69740295410156, + "loss": 0.271, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.3981750011444092, + "rewards/margins": 2.080976963043213, + "rewards/rejected": -0.6828019618988037, + "step": 2230 + }, + { + "epoch": 0.81, + "grad_norm": 31.24730889188293, + "learning_rate": 1.1100905194822817e-07, + "logits/chosen": -2.2185370922088623, + "logits/rejected": -1.9021923542022705, + "logps/chosen": -100.98394775390625, + "logps/rejected": -165.42442321777344, + "loss": 0.2959, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.7293106913566589, + "rewards/margins": 2.7436270713806152, + "rewards/rejected": -2.0143163204193115, + "step": 2240 + }, + { + "epoch": 0.82, + "grad_norm": 29.657646636917896, + "learning_rate": 1.1059154256070101e-07, + "logits/chosen": -2.116981029510498, + "logits/rejected": -1.6903644800186157, + "logps/chosen": -62.789337158203125, + "logps/rejected": -139.08433532714844, + "loss": 0.2711, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.8867229223251343, + "rewards/margins": 3.3395798206329346, + "rewards/rejected": -1.4528568983078003, + "step": 2250 + }, + { + "epoch": 0.82, + "grad_norm": 42.14718531203437, + "learning_rate": 1.101726047499999e-07, + "logits/chosen": -1.7964967489242554, + "logits/rejected": -1.3701756000518799, + "logps/chosen": -84.9123306274414, + "logps/rejected": -195.67616271972656, + "loss": 0.295, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.1344997882843018, + "rewards/margins": 2.1091301441192627, + "rewards/rejected": -0.9746300578117371, + "step": 2260 + }, + { + "epoch": 0.82, + "grad_norm": 44.98425619656123, + "learning_rate": 1.0975225532968324e-07, + "logits/chosen": -2.183009386062622, + "logits/rejected": -1.8169399499893188, + "logps/chosen": -71.13068389892578, + "logps/rejected": -153.55203247070312, + "loss": 0.2318, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.66214919090271, + "rewards/margins": 2.8192644119262695, + "rewards/rejected": -1.1571152210235596, + "step": 2270 + }, + { + "epoch": 0.83, + "grad_norm": 38.49859201574968, + "learning_rate": 1.0933051116996251e-07, + "logits/chosen": -2.2806153297424316, + "logits/rejected": -1.6658601760864258, + "logps/chosen": -87.88021850585938, + "logps/rejected": -157.28659057617188, + "loss": 0.2386, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.618966817855835, + "rewards/margins": 3.0835256576538086, + "rewards/rejected": -1.4645588397979736, + "step": 2280 + }, + { + "epoch": 0.83, + "grad_norm": 29.80803011600754, + "learning_rate": 1.089073891970255e-07, + "logits/chosen": -2.1777398586273193, + "logits/rejected": -1.9328248500823975, + "logps/chosen": -95.84873962402344, + "logps/rejected": -143.92617797851562, + "loss": 0.2572, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5284852981567383, + "rewards/margins": 3.2201919555664062, + "rewards/rejected": -1.6917064189910889, + "step": 2290 + }, + { + "epoch": 0.83, + "grad_norm": 38.40290299546086, + "learning_rate": 1.0848290639235677e-07, + "logits/chosen": -2.0058891773223877, + "logits/rejected": -1.5227512121200562, + "logps/chosen": -52.72210693359375, + "logps/rejected": -111.29896545410156, + "loss": 0.2972, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.021421432495117, + "rewards/margins": 2.641392230987549, + "rewards/rejected": -0.6199706196784973, + "step": 2300 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.568045139312744, + "eval_logits/rejected": -2.3515257835388184, + "eval_logps/chosen": -86.87254333496094, + "eval_logps/rejected": -140.3467559814453, + "eval_loss": 0.29616376757621765, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": 1.308371901512146, + "eval_rewards/margins": 2.8059024810791016, + "eval_rewards/rejected": -1.4975303411483765, + "eval_runtime": 71.9446, + "eval_samples_per_second": 12.371, + "eval_steps_per_second": 0.195, + "step": 2300 + }, + { + "epoch": 0.84, + "grad_norm": 32.78552611053422, + "learning_rate": 1.0805707979205626e-07, + "logits/chosen": -1.8444101810455322, + "logits/rejected": -1.6570831537246704, + "logps/chosen": -82.16907501220703, + "logps/rejected": -161.206787109375, + "loss": 0.2583, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.7756086587905884, + "rewards/margins": 3.3080837726593018, + "rewards/rejected": -1.532475233078003, + "step": 2310 + }, + { + "epoch": 0.84, + "grad_norm": 49.48891626378323, + "learning_rate": 1.0762992648615549e-07, + "logits/chosen": -2.2481982707977295, + "logits/rejected": -1.626535415649414, + "logps/chosen": -60.07440185546875, + "logps/rejected": -154.71829223632812, + "loss": 0.2713, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.572458028793335, + "rewards/margins": 2.8353686332702637, + "rewards/rejected": -1.2629106044769287, + "step": 2320 + }, + { + "epoch": 0.85, + "grad_norm": 32.324264844561206, + "learning_rate": 1.0720146361793166e-07, + "logits/chosen": -1.3140501976013184, + "logits/rejected": -1.3236348628997803, + "logps/chosen": -84.41221618652344, + "logps/rejected": -151.84896850585938, + "loss": 0.2769, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4236546754837036, + "rewards/margins": 3.031369686126709, + "rewards/rejected": -1.6077148914337158, + "step": 2330 + }, + { + "epoch": 0.85, + "grad_norm": 33.985147386077095, + "learning_rate": 1.0677170838321969e-07, + "logits/chosen": -2.1076178550720215, + "logits/rejected": -1.5049619674682617, + "logps/chosen": -63.42456817626953, + "logps/rejected": -135.26893615722656, + "loss": 0.3078, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8713020086288452, + "rewards/margins": 3.84709095954895, + "rewards/rejected": -1.9757890701293945, + "step": 2340 + }, + { + "epoch": 0.85, + "grad_norm": 28.49155730270651, + "learning_rate": 1.0634067802972204e-07, + "logits/chosen": -2.251544713973999, + "logits/rejected": -1.766570806503296, + "logps/chosen": -59.371925354003906, + "logps/rejected": -96.25486755371094, + "loss": 0.2679, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6637605428695679, + "rewards/margins": 1.9050308465957642, + "rewards/rejected": -0.24127018451690674, + "step": 2350 + }, + { + "epoch": 0.86, + "grad_norm": 46.79774181330583, + "learning_rate": 1.0590838985631653e-07, + "logits/chosen": -1.972684621810913, + "logits/rejected": -1.8693819046020508, + "logps/chosen": -81.91486358642578, + "logps/rejected": -135.78457641601562, + "loss": 0.2746, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4232879877090454, + "rewards/margins": 3.040739059448242, + "rewards/rejected": -1.6174514293670654, + "step": 2360 + }, + { + "epoch": 0.86, + "grad_norm": 45.90541036194016, + "learning_rate": 1.0547486121236202e-07, + "logits/chosen": -2.168940305709839, + "logits/rejected": -1.6793582439422607, + "logps/chosen": -85.18965148925781, + "logps/rejected": -126.7214126586914, + "loss": 0.25, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3474589586257935, + "rewards/margins": 2.820422649383545, + "rewards/rejected": -1.472963571548462, + "step": 2370 + }, + { + "epoch": 0.86, + "grad_norm": 32.321114428237806, + "learning_rate": 1.0504010949700214e-07, + "logits/chosen": -2.089189052581787, + "logits/rejected": -1.6855659484863281, + "logps/chosen": -106.45992279052734, + "logps/rejected": -163.7842559814453, + "loss": 0.2809, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.0083647966384888, + "rewards/margins": 2.169463872909546, + "rewards/rejected": -1.1610991954803467, + "step": 2380 + }, + { + "epoch": 0.87, + "grad_norm": 29.03655328835733, + "learning_rate": 1.0460415215846706e-07, + "logits/chosen": -1.7703535556793213, + "logits/rejected": -1.6672289371490479, + "logps/chosen": -78.27106475830078, + "logps/rejected": -119.7970962524414, + "loss": 0.3083, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5671287775039673, + "rewards/margins": 2.6943907737731934, + "rewards/rejected": -1.1272616386413574, + "step": 2390 + }, + { + "epoch": 0.87, + "grad_norm": 40.17380893976502, + "learning_rate": 1.0416700669337309e-07, + "logits/chosen": -1.7051159143447876, + "logits/rejected": -1.4203988313674927, + "logps/chosen": -81.63056945800781, + "logps/rejected": -137.97152709960938, + "loss": 0.2819, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8759418725967407, + "rewards/margins": 2.198500156402588, + "rewards/rejected": -1.3225582838058472, + "step": 2400 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -2.5985686779022217, + "eval_logits/rejected": -2.3792624473571777, + "eval_logps/chosen": -89.82866668701172, + "eval_logps/rejected": -142.37832641601562, + "eval_loss": 0.29318711161613464, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": 1.0127601623535156, + "eval_rewards/margins": 2.7134480476379395, + "eval_rewards/rejected": -1.7006880044937134, + "eval_runtime": 71.9548, + "eval_samples_per_second": 12.369, + "eval_steps_per_second": 0.195, + "step": 2400 + }, + { + "epoch": 0.87, + "grad_norm": 33.42712229281822, + "learning_rate": 1.0372869064602057e-07, + "logits/chosen": -1.7892534732818604, + "logits/rejected": -1.1852535009384155, + "logps/chosen": -83.11079406738281, + "logps/rejected": -178.16806030273438, + "loss": 0.2442, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.063844084739685, + "rewards/margins": 2.93255352973938, + "rewards/rejected": -1.8687092065811157, + "step": 2410 + }, + { + "epoch": 0.88, + "grad_norm": 35.05254862902216, + "learning_rate": 1.032892216076898e-07, + "logits/chosen": -1.8080724477767944, + "logits/rejected": -2.15615177154541, + "logps/chosen": -103.28878021240234, + "logps/rejected": -124.3226089477539, + "loss": 0.2772, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.9695222973823547, + "rewards/margins": 1.7081365585327148, + "rewards/rejected": -0.7386142611503601, + "step": 2420 + }, + { + "epoch": 0.88, + "grad_norm": 36.27734663918374, + "learning_rate": 1.0284861721593486e-07, + "logits/chosen": -1.5924618244171143, + "logits/rejected": -1.227879285812378, + "logps/chosen": -101.4988784790039, + "logps/rejected": -178.65904235839844, + "loss": 0.2531, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.11053466796875, + "rewards/margins": 3.443126678466797, + "rewards/rejected": -2.3325917720794678, + "step": 2430 + }, + { + "epoch": 0.89, + "grad_norm": 24.345274978234222, + "learning_rate": 1.0240689515387594e-07, + "logits/chosen": -1.6779060363769531, + "logits/rejected": -1.6044038534164429, + "logps/chosen": -79.8328857421875, + "logps/rejected": -146.0101776123047, + "loss": 0.2333, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1275861263275146, + "rewards/margins": 3.003176212310791, + "rewards/rejected": -1.8755900859832764, + "step": 2440 + }, + { + "epoch": 0.89, + "grad_norm": 38.18691013378142, + "learning_rate": 1.0196407314948948e-07, + "logits/chosen": -2.0358641147613525, + "logits/rejected": -1.629227638244629, + "logps/chosen": -68.20674896240234, + "logps/rejected": -162.77389526367188, + "loss": 0.2482, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.680604338645935, + "rewards/margins": 3.843975067138672, + "rewards/rejected": -2.1633706092834473, + "step": 2450 + }, + { + "epoch": 0.89, + "grad_norm": 48.20820417201082, + "learning_rate": 1.015201689748969e-07, + "logits/chosen": -1.8446376323699951, + "logits/rejected": -1.5708643198013306, + "logps/chosen": -71.98088836669922, + "logps/rejected": -119.35798645019531, + "loss": 0.2679, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3860020637512207, + "rewards/margins": 2.888756275177002, + "rewards/rejected": -1.5027542114257812, + "step": 2460 + }, + { + "epoch": 0.9, + "grad_norm": 54.24613191962403, + "learning_rate": 1.0107520044565107e-07, + "logits/chosen": -1.6053078174591064, + "logits/rejected": -1.7150065898895264, + "logps/chosen": -95.47930908203125, + "logps/rejected": -129.6278839111328, + "loss": 0.2346, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3713977336883545, + "rewards/margins": 1.8234504461288452, + "rewards/rejected": -0.45205289125442505, + "step": 2470 + }, + { + "epoch": 0.9, + "grad_norm": 33.14197724163633, + "learning_rate": 1.0062918542002149e-07, + "logits/chosen": -1.7330402135849, + "logits/rejected": -1.5148823261260986, + "logps/chosen": -90.85403442382812, + "logps/rejected": -120.53018951416016, + "loss": 0.291, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.1606512069702148, + "rewards/margins": 1.8089545965194702, + "rewards/rejected": -0.6483034491539001, + "step": 2480 + }, + { + "epoch": 0.9, + "grad_norm": 37.842170372492035, + "learning_rate": 1.0018214179827752e-07, + "logits/chosen": -1.585915207862854, + "logits/rejected": -1.2993358373641968, + "logps/chosen": -100.29901123046875, + "logps/rejected": -149.75497436523438, + "loss": 0.2526, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6282235980033875, + "rewards/margins": 2.4122531414031982, + "rewards/rejected": -1.784029245376587, + "step": 2490 + }, + { + "epoch": 0.91, + "grad_norm": 47.99587426386816, + "learning_rate": 9.973408752196995e-08, + "logits/chosen": -1.995279312133789, + "logits/rejected": -1.883319616317749, + "logps/chosen": -85.00283813476562, + "logps/rejected": -133.04844665527344, + "loss": 0.2523, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3437588214874268, + "rewards/margins": 2.9578824043273926, + "rewards/rejected": -1.6141233444213867, + "step": 2500 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -2.529489755630493, + "eval_logits/rejected": -2.3238425254821777, + "eval_logps/chosen": -87.53929901123047, + "eval_logps/rejected": -142.36253356933594, + "eval_loss": 0.2887308895587921, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": 1.2416969537734985, + "eval_rewards/margins": 2.9408047199249268, + "eval_rewards/rejected": -1.6991075277328491, + "eval_runtime": 71.9193, + "eval_samples_per_second": 12.375, + "eval_steps_per_second": 0.195, + "step": 2500 + }, + { + "epoch": 0.91, + "grad_norm": 43.66156972073303, + "learning_rate": 9.9285040573211e-08, + "logits/chosen": -1.346799612045288, + "logits/rejected": -1.2726528644561768, + "logps/chosen": -84.28345489501953, + "logps/rejected": -163.90647888183594, + "loss": 0.2379, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.04119873046875, + "rewards/margins": 2.8771090507507324, + "rewards/rejected": -1.8359102010726929, + "step": 2510 + }, + { + "epoch": 0.91, + "grad_norm": 29.983525537713383, + "learning_rate": 9.883501897395255e-08, + "logits/chosen": -2.4633562564849854, + "logits/rejected": -1.8329979181289673, + "logps/chosen": -77.02892303466797, + "logps/rejected": -148.3466796875, + "loss": 0.309, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.9383471012115479, + "rewards/margins": 3.8611648082733154, + "rewards/rejected": -1.9228184223175049, + "step": 2520 + }, + { + "epoch": 0.92, + "grad_norm": 54.586263258672254, + "learning_rate": 9.838404078526287e-08, + "logits/chosen": -1.8038403987884521, + "logits/rejected": -1.38016676902771, + "logps/chosen": -81.94984436035156, + "logps/rejected": -152.57546997070312, + "loss": 0.2406, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.466467022895813, + "rewards/margins": 3.583028793334961, + "rewards/rejected": -2.1165618896484375, + "step": 2530 + }, + { + "epoch": 0.92, + "grad_norm": 34.64469687487794, + "learning_rate": 9.79321241066019e-08, + "logits/chosen": -1.7441418170928955, + "logits/rejected": -1.8793613910675049, + "logps/chosen": -80.00519561767578, + "logps/rejected": -135.61685180664062, + "loss": 0.2527, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3446810245513916, + "rewards/margins": 2.6392345428466797, + "rewards/rejected": -1.2945533990859985, + "step": 2540 + }, + { + "epoch": 0.93, + "grad_norm": 52.236703640195636, + "learning_rate": 9.747928707509452e-08, + "logits/chosen": -2.3760972023010254, + "logits/rejected": -1.7141082286834717, + "logps/chosen": -70.12747955322266, + "logps/rejected": -142.975341796875, + "loss": 0.2482, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.4635976552963257, + "rewards/margins": 3.061598300933838, + "rewards/rejected": -1.5980006456375122, + "step": 2550 + }, + { + "epoch": 0.93, + "grad_norm": 54.072926051989626, + "learning_rate": 9.702554786480307e-08, + "logits/chosen": -2.1945765018463135, + "logits/rejected": -1.9323997497558594, + "logps/chosen": -109.21195983886719, + "logps/rejected": -147.5604705810547, + "loss": 0.2711, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7500441670417786, + "rewards/margins": 1.275587558746338, + "rewards/rejected": -0.5255435109138489, + "step": 2560 + }, + { + "epoch": 0.93, + "grad_norm": 28.839686251981885, + "learning_rate": 9.657092468599762e-08, + "logits/chosen": -1.9283406734466553, + "logits/rejected": -1.9637447595596313, + "logps/chosen": -73.35202026367188, + "logps/rejected": -218.9588165283203, + "loss": 0.2294, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6155729293823242, + "rewards/margins": 3.6671338081359863, + "rewards/rejected": -2.051560640335083, + "step": 2570 + }, + { + "epoch": 0.94, + "grad_norm": 23.167116682862034, + "learning_rate": 9.611543578442521e-08, + "logits/chosen": -2.004908800125122, + "logits/rejected": -1.8643825054168701, + "logps/chosen": -73.96257019042969, + "logps/rejected": -120.63322448730469, + "loss": 0.2663, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.1677210330963135, + "rewards/margins": 1.7092103958129883, + "rewards/rejected": -0.5414894819259644, + "step": 2580 + }, + { + "epoch": 0.94, + "grad_norm": 37.61703553720983, + "learning_rate": 9.565909944057774e-08, + "logits/chosen": -1.8392232656478882, + "logits/rejected": -1.622847318649292, + "logps/chosen": -98.19245910644531, + "logps/rejected": -152.8502197265625, + "loss": 0.2644, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2672936916351318, + "rewards/margins": 2.7811713218688965, + "rewards/rejected": -1.513877511024475, + "step": 2590 + }, + { + "epoch": 0.94, + "grad_norm": 27.461231350463876, + "learning_rate": 9.520193396895798e-08, + "logits/chosen": -1.9339004755020142, + "logits/rejected": -1.5946998596191406, + "logps/chosen": -107.6896743774414, + "logps/rejected": -166.02828979492188, + "loss": 0.2534, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9528090357780457, + "rewards/margins": 3.083143949508667, + "rewards/rejected": -2.1303353309631348, + "step": 2600 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.55859375, + "eval_logits/rejected": -2.3451855182647705, + "eval_logps/chosen": -89.4637680053711, + "eval_logps/rejected": -143.6739959716797, + "eval_loss": 0.2875581979751587, + "eval_rewards/accuracies": 0.8214285969734192, + "eval_rewards/chosen": 1.0492494106292725, + "eval_rewards/margins": 2.8795037269592285, + "eval_rewards/rejected": -1.8302545547485352, + "eval_runtime": 71.9243, + "eval_samples_per_second": 12.374, + "eval_steps_per_second": 0.195, + "step": 2600 + }, + { + "epoch": 0.95, + "grad_norm": 49.8667964660086, + "learning_rate": 9.474395771734493e-08, + "logits/chosen": -1.8184125423431396, + "logits/rejected": -1.5943410396575928, + "logps/chosen": -81.92977905273438, + "logps/rejected": -143.59848022460938, + "loss": 0.284, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8414875268936157, + "rewards/margins": 3.2987303733825684, + "rewards/rejected": -2.457242727279663, + "step": 2610 + }, + { + "epoch": 0.95, + "grad_norm": 30.852377454296445, + "learning_rate": 9.428518906605715e-08, + "logits/chosen": -1.6976007223129272, + "logits/rejected": -2.0197830200195312, + "logps/chosen": -77.92415618896484, + "logps/rejected": -120.9302749633789, + "loss": 0.2503, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.2503085136413574, + "rewards/margins": 2.8966543674468994, + "rewards/rejected": -1.646345853805542, + "step": 2620 + }, + { + "epoch": 0.95, + "grad_norm": 44.68222855190559, + "learning_rate": 9.382564642721517e-08, + "logits/chosen": -2.2393205165863037, + "logits/rejected": -1.8123022317886353, + "logps/chosen": -104.79426574707031, + "logps/rejected": -157.04183959960938, + "loss": 0.2891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.9819310307502747, + "rewards/margins": 2.653965473175049, + "rewards/rejected": -1.6720342636108398, + "step": 2630 + }, + { + "epoch": 0.96, + "grad_norm": 49.38339013216613, + "learning_rate": 9.336534824400267e-08, + "logits/chosen": -1.8587281703948975, + "logits/rejected": -1.5764485597610474, + "logps/chosen": -72.25798797607422, + "logps/rejected": -129.2059783935547, + "loss": 0.2643, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.856530785560608, + "rewards/margins": 3.2116611003875732, + "rewards/rejected": -1.3551304340362549, + "step": 2640 + }, + { + "epoch": 0.96, + "grad_norm": 38.226700349391514, + "learning_rate": 9.290431298992605e-08, + "logits/chosen": -2.0723516941070557, + "logits/rejected": -1.8886346817016602, + "logps/chosen": -73.31155395507812, + "logps/rejected": -133.17425537109375, + "loss": 0.2182, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8224437236785889, + "rewards/margins": 2.681816577911377, + "rewards/rejected": -0.8593727946281433, + "step": 2650 + }, + { + "epoch": 0.97, + "grad_norm": 27.790126692612404, + "learning_rate": 9.244255916807328e-08, + "logits/chosen": -1.9878448247909546, + "logits/rejected": -1.7479631900787354, + "logps/chosen": -75.09999084472656, + "logps/rejected": -106.42635345458984, + "loss": 0.2358, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3537728786468506, + "rewards/margins": 2.176225423812866, + "rewards/rejected": -0.8224524259567261, + "step": 2660 + }, + { + "epoch": 0.97, + "grad_norm": 45.32065506760666, + "learning_rate": 9.19801053103711e-08, + "logits/chosen": -2.0774807929992676, + "logits/rejected": -1.564305305480957, + "logps/chosen": -108.69667053222656, + "logps/rejected": -161.80776977539062, + "loss": 0.2735, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.1085962057113647, + "rewards/margins": 2.628162145614624, + "rewards/rejected": -1.5195659399032593, + "step": 2670 + }, + { + "epoch": 0.97, + "grad_norm": 42.93481884745555, + "learning_rate": 9.151696997684133e-08, + "logits/chosen": -1.8935344219207764, + "logits/rejected": -1.3089290857315063, + "logps/chosen": -82.37386322021484, + "logps/rejected": -136.06396484375, + "loss": 0.3265, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3590657711029053, + "rewards/margins": 2.706857681274414, + "rewards/rejected": -1.3477914333343506, + "step": 2680 + }, + { + "epoch": 0.98, + "grad_norm": 35.54137712720915, + "learning_rate": 9.105317175485603e-08, + "logits/chosen": -1.7412023544311523, + "logits/rejected": -1.2009631395339966, + "logps/chosen": -100.62733459472656, + "logps/rejected": -161.97164916992188, + "loss": 0.2636, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8465662002563477, + "rewards/margins": 2.8156447410583496, + "rewards/rejected": -1.9690784215927124, + "step": 2690 + }, + { + "epoch": 0.98, + "grad_norm": 28.86922135565408, + "learning_rate": 9.058872925839145e-08, + "logits/chosen": -1.6413863897323608, + "logits/rejected": -1.4907596111297607, + "logps/chosen": -95.3847427368164, + "logps/rejected": -137.19778442382812, + "loss": 0.2065, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9992202520370483, + "rewards/margins": 2.450819969177246, + "rewards/rejected": -1.4515998363494873, + "step": 2700 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -2.584843158721924, + "eval_logits/rejected": -2.3644392490386963, + "eval_logps/chosen": -91.23661804199219, + "eval_logps/rejected": -144.7612762451172, + "eval_loss": 0.28060293197631836, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": 0.8719648718833923, + "eval_rewards/margins": 2.8109474182128906, + "eval_rewards/rejected": -1.9389822483062744, + "eval_runtime": 72.0935, + "eval_samples_per_second": 12.345, + "eval_steps_per_second": 0.194, + "step": 2700 + }, + { + "epoch": 0.98, + "grad_norm": 33.738621565238745, + "learning_rate": 9.012366112728104e-08, + "logits/chosen": -2.308584213256836, + "logits/rejected": -1.8555558919906616, + "logps/chosen": -87.93582153320312, + "logps/rejected": -192.4984130859375, + "loss": 0.2612, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.67702716588974, + "rewards/margins": 2.732879161834717, + "rewards/rejected": -2.055852174758911, + "step": 2710 + }, + { + "epoch": 0.99, + "grad_norm": 27.441037937773388, + "learning_rate": 8.96579860264673e-08, + "logits/chosen": -1.98538339138031, + "logits/rejected": -1.8649771213531494, + "logps/chosen": -95.34701538085938, + "logps/rejected": -144.11663818359375, + "loss": 0.2566, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9213436841964722, + "rewards/margins": 2.6342453956604004, + "rewards/rejected": -1.7129017114639282, + "step": 2720 + }, + { + "epoch": 0.99, + "grad_norm": 30.78636513438412, + "learning_rate": 8.919172264525274e-08, + "logits/chosen": -1.7649853229522705, + "logits/rejected": -1.6394439935684204, + "logps/chosen": -93.97325134277344, + "logps/rejected": -128.0108642578125, + "loss": 0.2684, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.8014779090881348, + "rewards/margins": 2.1937363147735596, + "rewards/rejected": -1.3922584056854248, + "step": 2730 + }, + { + "epoch": 0.99, + "grad_norm": 31.448648707859167, + "learning_rate": 8.872488969654978e-08, + "logits/chosen": -1.7622663974761963, + "logits/rejected": -1.6264762878417969, + "logps/chosen": -67.11592102050781, + "logps/rejected": -124.63407897949219, + "loss": 0.2686, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3815752267837524, + "rewards/margins": 2.803210735321045, + "rewards/rejected": -1.4216358661651611, + "step": 2740 + }, + { + "epoch": 1.0, + "grad_norm": 39.421832880751474, + "learning_rate": 8.825750591612973e-08, + "logits/chosen": -2.0863993167877197, + "logits/rejected": -2.0298314094543457, + "logps/chosen": -88.62030029296875, + "logps/rejected": -142.9340057373047, + "loss": 0.261, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.233635663986206, + "rewards/margins": 2.6705703735351562, + "rewards/rejected": -1.4369348287582397, + "step": 2750 + }, + { + "epoch": 1.0, + "grad_norm": 24.481557561032986, + "learning_rate": 8.778959006187086e-08, + "logits/chosen": -2.1946396827697754, + "logits/rejected": -2.0280699729919434, + "logps/chosen": -109.33675384521484, + "logps/rejected": -159.2801055908203, + "loss": 0.1921, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6714259386062622, + "rewards/margins": 1.6729438304901123, + "rewards/rejected": -1.0015180110931396, + "step": 2760 + }, + { + "epoch": 1.01, + "grad_norm": 33.3000860606884, + "learning_rate": 8.732116091300562e-08, + "logits/chosen": -2.672719955444336, + "logits/rejected": -2.3178205490112305, + "logps/chosen": -108.69718170166016, + "logps/rejected": -154.65037536621094, + "loss": 0.1737, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7940966486930847, + "rewards/margins": 2.4711570739746094, + "rewards/rejected": -1.6770607233047485, + "step": 2770 + }, + { + "epoch": 1.01, + "grad_norm": 33.433009311142335, + "learning_rate": 8.685223726936686e-08, + "logits/chosen": -1.885528802871704, + "logits/rejected": -1.5169920921325684, + "logps/chosen": -77.01846313476562, + "logps/rejected": -153.978759765625, + "loss": 0.171, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.206106185913086, + "rewards/margins": 3.5677733421325684, + "rewards/rejected": -2.3616671562194824, + "step": 2780 + }, + { + "epoch": 1.01, + "grad_norm": 36.29017775178196, + "learning_rate": 8.638283795063337e-08, + "logits/chosen": -1.7784044742584229, + "logits/rejected": -1.95382559299469, + "logps/chosen": -102.05876922607422, + "logps/rejected": -136.20278930664062, + "loss": 0.1558, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0153976678848267, + "rewards/margins": 2.9986801147460938, + "rewards/rejected": -1.9832820892333984, + "step": 2790 + }, + { + "epoch": 1.02, + "grad_norm": 29.62169456800005, + "learning_rate": 8.591298179557462e-08, + "logits/chosen": -1.6251239776611328, + "logits/rejected": -1.2165063619613647, + "logps/chosen": -86.18495178222656, + "logps/rejected": -161.23684692382812, + "loss": 0.1669, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3276035785675049, + "rewards/margins": 4.416350364685059, + "rewards/rejected": -3.0887465476989746, + "step": 2800 + }, + { + "epoch": 1.02, + "eval_logits/chosen": -2.437558650970459, + "eval_logits/rejected": -2.259409189224243, + "eval_logps/chosen": -88.76131439208984, + "eval_logps/rejected": -145.60667419433594, + "eval_loss": 0.2792236804962158, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": 1.1194943189620972, + "eval_rewards/margins": 3.143015146255493, + "eval_rewards/rejected": -2.0235207080841064, + "eval_runtime": 71.9382, + "eval_samples_per_second": 12.372, + "eval_steps_per_second": 0.195, + "step": 2800 + }, + { + "epoch": 1.02, + "grad_norm": 25.67903585470204, + "learning_rate": 8.544268766129463e-08, + "logits/chosen": -1.636718988418579, + "logits/rejected": -1.3525017499923706, + "logps/chosen": -91.5226821899414, + "logps/rejected": -152.884033203125, + "loss": 0.1626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0812289714813232, + "rewards/margins": 3.153066873550415, + "rewards/rejected": -2.0718374252319336, + "step": 2810 + }, + { + "epoch": 1.02, + "grad_norm": 23.17233872525318, + "learning_rate": 8.497197442247518e-08, + "logits/chosen": -2.255305051803589, + "logits/rejected": -1.3946579694747925, + "logps/chosen": -73.07662963867188, + "logps/rejected": -160.0610809326172, + "loss": 0.1865, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7297379970550537, + "rewards/margins": 3.913317918777466, + "rewards/rejected": -2.183579683303833, + "step": 2820 + }, + { + "epoch": 1.03, + "grad_norm": 38.41529064635961, + "learning_rate": 8.45008609706183e-08, + "logits/chosen": -1.6799609661102295, + "logits/rejected": -1.7947546243667603, + "logps/chosen": -98.69215393066406, + "logps/rejected": -176.2200469970703, + "loss": 0.1551, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3261483907699585, + "rewards/margins": 2.8912904262542725, + "rewards/rejected": -1.565142273902893, + "step": 2830 + }, + { + "epoch": 1.03, + "grad_norm": 33.65938875922252, + "learning_rate": 8.402936621328809e-08, + "logits/chosen": -1.584078073501587, + "logits/rejected": -1.5029069185256958, + "logps/chosen": -78.0562515258789, + "logps/rejected": -129.6306915283203, + "loss": 0.1773, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.175910472869873, + "rewards/margins": 3.317577838897705, + "rewards/rejected": -2.141667366027832, + "step": 2840 + }, + { + "epoch": 1.03, + "grad_norm": 28.882074958874135, + "learning_rate": 8.355750907335185e-08, + "logits/chosen": -1.3962595462799072, + "logits/rejected": -1.2154072523117065, + "logps/chosen": -97.38468933105469, + "logps/rejected": -141.413330078125, + "loss": 0.1708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9436796307563782, + "rewards/margins": 2.901923418045044, + "rewards/rejected": -1.9582436084747314, + "step": 2850 + }, + { + "epoch": 1.04, + "grad_norm": 48.57490152838243, + "learning_rate": 8.308530848822072e-08, + "logits/chosen": -1.322256088256836, + "logits/rejected": -1.3967430591583252, + "logps/chosen": -69.242919921875, + "logps/rejected": -106.2719955444336, + "loss": 0.167, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6083996295928955, + "rewards/margins": 3.0584025382995605, + "rewards/rejected": -1.4500024318695068, + "step": 2860 + }, + { + "epoch": 1.04, + "grad_norm": 33.604690884057646, + "learning_rate": 8.261278340908956e-08, + "logits/chosen": -2.133439302444458, + "logits/rejected": -1.5575885772705078, + "logps/chosen": -82.22913360595703, + "logps/rejected": -155.5948028564453, + "loss": 0.1388, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6941452026367188, + "rewards/margins": 3.734036922454834, + "rewards/rejected": -2.0398921966552734, + "step": 2870 + }, + { + "epoch": 1.05, + "grad_norm": 39.178640280039765, + "learning_rate": 8.213995280017641e-08, + "logits/chosen": -2.022156238555908, + "logits/rejected": -1.857143759727478, + "logps/chosen": -85.9186019897461, + "logps/rejected": -166.30189514160156, + "loss": 0.1486, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.53517484664917, + "rewards/margins": 4.04697322845459, + "rewards/rejected": -2.511798858642578, + "step": 2880 + }, + { + "epoch": 1.05, + "grad_norm": 34.78141019059118, + "learning_rate": 8.166683563796132e-08, + "logits/chosen": -1.9229103326797485, + "logits/rejected": -2.1193885803222656, + "logps/chosen": -80.5084228515625, + "logps/rejected": -117.29715728759766, + "loss": 0.2072, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6038023233413696, + "rewards/margins": 2.8986401557922363, + "rewards/rejected": -1.2948377132415771, + "step": 2890 + }, + { + "epoch": 1.05, + "grad_norm": 55.9854636653751, + "learning_rate": 8.119345091042493e-08, + "logits/chosen": -1.9403820037841797, + "logits/rejected": -1.5053958892822266, + "logps/chosen": -78.80266571044922, + "logps/rejected": -152.7443389892578, + "loss": 0.2042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.886501669883728, + "rewards/margins": 4.484723091125488, + "rewards/rejected": -2.59822154045105, + "step": 2900 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.46332049369812, + "eval_logits/rejected": -2.278831958770752, + "eval_logps/chosen": -90.24932861328125, + "eval_logps/rejected": -147.7169189453125, + "eval_loss": 0.2783988416194916, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.9706932306289673, + "eval_rewards/margins": 3.205240249633789, + "eval_rewards/rejected": -2.2345468997955322, + "eval_runtime": 71.9005, + "eval_samples_per_second": 12.378, + "eval_steps_per_second": 0.195, + "step": 2900 + }, + { + "epoch": 1.06, + "grad_norm": 19.853743178316083, + "learning_rate": 8.071981761628615e-08, + "logits/chosen": -2.0366461277008057, + "logits/rejected": -1.6515051126480103, + "logps/chosen": -108.37039947509766, + "logps/rejected": -271.4614562988281, + "loss": 0.1636, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.959917426109314, + "rewards/margins": 4.299803256988525, + "rewards/rejected": -3.33988618850708, + "step": 2910 + }, + { + "epoch": 1.06, + "grad_norm": 36.91632833855024, + "learning_rate": 8.024595476423992e-08, + "logits/chosen": -1.8184095621109009, + "logits/rejected": -1.9267457723617554, + "logps/chosen": -84.40902709960938, + "logps/rejected": -157.05877685546875, + "loss": 0.1462, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5019279718399048, + "rewards/margins": 3.4105846881866455, + "rewards/rejected": -1.9086570739746094, + "step": 2920 + }, + { + "epoch": 1.06, + "grad_norm": 33.58138147718405, + "learning_rate": 7.977188137219414e-08, + "logits/chosen": -1.3525230884552002, + "logits/rejected": -1.4422528743743896, + "logps/chosen": -77.68660736083984, + "logps/rejected": -128.90638732910156, + "loss": 0.1572, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6080999374389648, + "rewards/margins": 3.3746752738952637, + "rewards/rejected": -1.7665754556655884, + "step": 2930 + }, + { + "epoch": 1.07, + "grad_norm": 32.99013154464336, + "learning_rate": 7.929761646650649e-08, + "logits/chosen": -2.245877504348755, + "logits/rejected": -1.9841814041137695, + "logps/chosen": -96.16056823730469, + "logps/rejected": -256.2076721191406, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.466813564300537, + "rewards/margins": 4.286352157592773, + "rewards/rejected": -2.8195388317108154, + "step": 2940 + }, + { + "epoch": 1.07, + "grad_norm": 30.967043967811318, + "learning_rate": 7.882317908122083e-08, + "logits/chosen": -1.4433786869049072, + "logits/rejected": -1.1254138946533203, + "logps/chosen": -98.50581359863281, + "logps/rejected": -170.2156982421875, + "loss": 0.1753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9996970891952515, + "rewards/margins": 3.900161027908325, + "rewards/rejected": -2.900463819503784, + "step": 2950 + }, + { + "epoch": 1.07, + "grad_norm": 21.50191304483852, + "learning_rate": 7.834858825730326e-08, + "logits/chosen": -1.6459989547729492, + "logits/rejected": -1.7681890726089478, + "logps/chosen": -95.52315521240234, + "logps/rejected": -140.78799438476562, + "loss": 0.1528, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9920917749404907, + "rewards/margins": 3.2807719707489014, + "rewards/rejected": -2.2886805534362793, + "step": 2960 + }, + { + "epoch": 1.08, + "grad_norm": 17.260479958583524, + "learning_rate": 7.787386304187798e-08, + "logits/chosen": -1.4531595706939697, + "logits/rejected": -1.3701298236846924, + "logps/chosen": -93.404541015625, + "logps/rejected": -135.31427001953125, + "loss": 0.171, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0154047012329102, + "rewards/margins": 3.056318998336792, + "rewards/rejected": -2.040914297103882, + "step": 2970 + }, + { + "epoch": 1.08, + "grad_norm": 23.025875175350215, + "learning_rate": 7.739902248746283e-08, + "logits/chosen": -1.7231334447860718, + "logits/rejected": -1.83186936378479, + "logps/chosen": -107.64973449707031, + "logps/rejected": -145.9791717529297, + "loss": 0.1349, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5788373947143555, + "rewards/margins": 3.6469104290008545, + "rewards/rejected": -2.068073034286499, + "step": 2980 + }, + { + "epoch": 1.09, + "grad_norm": 32.06528399748813, + "learning_rate": 7.692408565120458e-08, + "logits/chosen": -2.0466766357421875, + "logits/rejected": -1.4406808614730835, + "logps/chosen": -95.41911315917969, + "logps/rejected": -297.7329406738281, + "loss": 0.1434, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.45736563205719, + "rewards/margins": 4.8933281898498535, + "rewards/rejected": -3.435962200164795, + "step": 2990 + }, + { + "epoch": 1.09, + "grad_norm": 31.87542686599211, + "learning_rate": 7.64490715941142e-08, + "logits/chosen": -2.2173619270324707, + "logits/rejected": -1.7960357666015625, + "logps/chosen": -85.83885192871094, + "logps/rejected": -169.8695831298828, + "loss": 0.1529, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.137475609779358, + "rewards/margins": 4.046895503997803, + "rewards/rejected": -2.9094200134277344, + "step": 3000 + }, + { + "epoch": 1.09, + "eval_logits/chosen": -2.4170167446136475, + "eval_logits/rejected": -2.2436559200286865, + "eval_logps/chosen": -89.03692626953125, + "eval_logps/rejected": -148.1836395263672, + "eval_loss": 0.27793529629707336, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 1.0919331312179565, + "eval_rewards/margins": 3.373152256011963, + "eval_rewards/rejected": -2.281219720840454, + "eval_runtime": 71.967, + "eval_samples_per_second": 12.367, + "eval_steps_per_second": 0.195, + "step": 3000 + }, + { + "epoch": 1.09, + "grad_norm": 30.683194890069508, + "learning_rate": 7.597399938030184e-08, + "logits/chosen": -1.7612870931625366, + "logits/rejected": -1.8522275686264038, + "logps/chosen": -73.18524169921875, + "logps/rejected": -130.78829956054688, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7518160343170166, + "rewards/margins": 3.613239288330078, + "rewards/rejected": -1.861423134803772, + "step": 3010 + }, + { + "epoch": 1.1, + "grad_norm": 12.524140781131472, + "learning_rate": 7.549888807621168e-08, + "logits/chosen": -1.6050277948379517, + "logits/rejected": -1.5487821102142334, + "logps/chosen": -84.5078125, + "logps/rejected": -174.291259765625, + "loss": 0.1498, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5817402601242065, + "rewards/margins": 3.690995693206787, + "rewards/rejected": -2.10925555229187, + "step": 3020 + }, + { + "epoch": 1.1, + "grad_norm": 19.057459821446617, + "learning_rate": 7.502375674985675e-08, + "logits/chosen": -2.2322306632995605, + "logits/rejected": -1.8400142192840576, + "logps/chosen": -94.57080078125, + "logps/rejected": -167.90066528320312, + "loss": 0.197, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6182721853256226, + "rewards/margins": 3.7063040733337402, + "rewards/rejected": -2.0880320072174072, + "step": 3030 + }, + { + "epoch": 1.1, + "grad_norm": 39.27950520182761, + "learning_rate": 7.454862447005359e-08, + "logits/chosen": -2.199615001678467, + "logits/rejected": -1.5867798328399658, + "logps/chosen": -83.38591766357422, + "logps/rejected": -181.61129760742188, + "loss": 0.1505, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.513109564781189, + "rewards/margins": 5.107856750488281, + "rewards/rejected": -3.594747543334961, + "step": 3040 + }, + { + "epoch": 1.11, + "grad_norm": 29.496269378804925, + "learning_rate": 7.407351030565711e-08, + "logits/chosen": -2.2386364936828613, + "logits/rejected": -1.7992044687271118, + "logps/chosen": -70.04452514648438, + "logps/rejected": -168.61111450195312, + "loss": 0.1842, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.804783582687378, + "rewards/margins": 4.483792781829834, + "rewards/rejected": -2.679008960723877, + "step": 3050 + }, + { + "epoch": 1.11, + "grad_norm": 28.123692890471585, + "learning_rate": 7.359843332479512e-08, + "logits/chosen": -1.5445009469985962, + "logits/rejected": -1.632628083229065, + "logps/chosen": -97.18016815185547, + "logps/rejected": -156.12501525878906, + "loss": 0.1522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9871312975883484, + "rewards/margins": 3.336866855621338, + "rewards/rejected": -2.3497352600097656, + "step": 3060 + }, + { + "epoch": 1.11, + "grad_norm": 24.98966890041787, + "learning_rate": 7.312341259410308e-08, + "logits/chosen": -1.8162829875946045, + "logits/rejected": -1.6895732879638672, + "logps/chosen": -82.59261322021484, + "logps/rejected": -126.13545227050781, + "loss": 0.1533, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.7326328754425049, + "rewards/margins": 3.6410274505615234, + "rewards/rejected": -1.9083948135375977, + "step": 3070 + }, + { + "epoch": 1.12, + "grad_norm": 38.009951638634, + "learning_rate": 7.264846717795899e-08, + "logits/chosen": -1.9883182048797607, + "logits/rejected": -1.5640618801116943, + "logps/chosen": -79.90910339355469, + "logps/rejected": -204.0009765625, + "loss": 0.1641, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5611870288848877, + "rewards/margins": 3.625385284423828, + "rewards/rejected": -2.0641980171203613, + "step": 3080 + }, + { + "epoch": 1.12, + "grad_norm": 37.92405502481002, + "learning_rate": 7.217361613771814e-08, + "logits/chosen": -1.5013178586959839, + "logits/rejected": -1.6586967706680298, + "logps/chosen": -74.64991760253906, + "logps/rejected": -133.19850158691406, + "loss": 0.1487, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.388861894607544, + "rewards/margins": 3.804910659790039, + "rewards/rejected": -2.416048765182495, + "step": 3090 + }, + { + "epoch": 1.13, + "grad_norm": 33.70546037656875, + "learning_rate": 7.16988785309482e-08, + "logits/chosen": -1.7027190923690796, + "logits/rejected": -1.2969590425491333, + "logps/chosen": -91.05853271484375, + "logps/rejected": -159.3774871826172, + "loss": 0.1675, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.82365882396698, + "rewards/margins": 4.186999320983887, + "rewards/rejected": -3.363340377807617, + "step": 3100 + }, + { + "epoch": 1.13, + "eval_logits/chosen": -2.3937642574310303, + "eval_logits/rejected": -2.2311301231384277, + "eval_logps/chosen": -88.40351104736328, + "eval_logps/rejected": -148.453857421875, + "eval_loss": 0.2778138816356659, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 1.155274748802185, + "eval_rewards/margins": 3.4635140895843506, + "eval_rewards/rejected": -2.308239459991455, + "eval_runtime": 71.9837, + "eval_samples_per_second": 12.364, + "eval_steps_per_second": 0.194, + "step": 3100 + }, + { + "epoch": 1.13, + "grad_norm": 10.977394963530008, + "learning_rate": 7.122427341066431e-08, + "logits/chosen": -1.7731870412826538, + "logits/rejected": -1.5993355512619019, + "logps/chosen": -94.86338806152344, + "logps/rejected": -159.97232055664062, + "loss": 0.1694, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6271803975105286, + "rewards/margins": 3.4266326427459717, + "rewards/rejected": -2.799452304840088, + "step": 3110 + }, + { + "epoch": 1.13, + "grad_norm": 35.6693717697895, + "learning_rate": 7.074981982456438e-08, + "logits/chosen": -1.6707003116607666, + "logits/rejected": -1.3329427242279053, + "logps/chosen": -74.29015350341797, + "logps/rejected": -126.54850769042969, + "loss": 0.1541, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1601439714431763, + "rewards/margins": 3.103707790374756, + "rewards/rejected": -1.9435638189315796, + "step": 3120 + }, + { + "epoch": 1.14, + "grad_norm": 26.223529400996284, + "learning_rate": 7.027553681426475e-08, + "logits/chosen": -1.6821495294570923, + "logits/rejected": -1.6677749156951904, + "logps/chosen": -69.13279724121094, + "logps/rejected": -125.01466369628906, + "loss": 0.1206, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.598923683166504, + "rewards/margins": 3.550352096557617, + "rewards/rejected": -1.951428771018982, + "step": 3130 + }, + { + "epoch": 1.14, + "grad_norm": 33.00723755782217, + "learning_rate": 6.980144341453587e-08, + "logits/chosen": -1.8480640649795532, + "logits/rejected": -1.4870339632034302, + "logps/chosen": -93.90699768066406, + "logps/rejected": -151.79629516601562, + "loss": 0.1937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.6560937166213989, + "rewards/margins": 3.1022205352783203, + "rewards/rejected": -2.446126937866211, + "step": 3140 + }, + { + "epoch": 1.14, + "grad_norm": 28.329720861751795, + "learning_rate": 6.932755865253842e-08, + "logits/chosen": -2.052046537399292, + "logits/rejected": -1.551355242729187, + "logps/chosen": -79.02404022216797, + "logps/rejected": -138.8905792236328, + "loss": 0.1736, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4904524087905884, + "rewards/margins": 3.604060411453247, + "rewards/rejected": -2.113607883453369, + "step": 3150 + }, + { + "epoch": 1.15, + "grad_norm": 22.857016307017492, + "learning_rate": 6.885390154705964e-08, + "logits/chosen": -1.932428002357483, + "logits/rejected": -1.4152684211730957, + "logps/chosen": -98.20048522949219, + "logps/rejected": -169.7655792236328, + "loss": 0.1481, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0732301473617554, + "rewards/margins": 4.125487327575684, + "rewards/rejected": -3.0522565841674805, + "step": 3160 + }, + { + "epoch": 1.15, + "grad_norm": 27.862330812022716, + "learning_rate": 6.838049110775007e-08, + "logits/chosen": -1.6845344305038452, + "logits/rejected": -1.5890744924545288, + "logps/chosen": -98.02180480957031, + "logps/rejected": -128.43356323242188, + "loss": 0.1365, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3146222829818726, + "rewards/margins": 2.931612014770508, + "rewards/rejected": -1.6169898509979248, + "step": 3170 + }, + { + "epoch": 1.15, + "grad_norm": 29.94290227789718, + "learning_rate": 6.790734633436058e-08, + "logits/chosen": -1.99330735206604, + "logits/rejected": -1.2950857877731323, + "logps/chosen": -99.69730377197266, + "logps/rejected": -170.60195922851562, + "loss": 0.1448, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1823816299438477, + "rewards/margins": 3.796720504760742, + "rewards/rejected": -2.6143388748168945, + "step": 3180 + }, + { + "epoch": 1.16, + "grad_norm": 64.10187404062623, + "learning_rate": 6.743448621597989e-08, + "logits/chosen": -1.8239631652832031, + "logits/rejected": -1.8418114185333252, + "logps/chosen": -100.65877532958984, + "logps/rejected": -151.15411376953125, + "loss": 0.131, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8053275942802429, + "rewards/margins": 3.210911989212036, + "rewards/rejected": -2.4055840969085693, + "step": 3190 + }, + { + "epoch": 1.16, + "grad_norm": 26.247231746645152, + "learning_rate": 6.696192973027241e-08, + "logits/chosen": -2.037968158721924, + "logits/rejected": -1.6115257740020752, + "logps/chosen": -73.27265930175781, + "logps/rejected": -159.865966796875, + "loss": 0.1542, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6886720657348633, + "rewards/margins": 3.8989975452423096, + "rewards/rejected": -2.2103257179260254, + "step": 3200 + }, + { + "epoch": 1.16, + "eval_logits/chosen": -2.4345650672912598, + "eval_logits/rejected": -2.2589755058288574, + "eval_logps/chosen": -90.60651397705078, + "eval_logps/rejected": -151.0882110595703, + "eval_loss": 0.27637749910354614, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": 0.9349749684333801, + "eval_rewards/margins": 3.50665283203125, + "eval_rewards/rejected": -2.5716776847839355, + "eval_runtime": 71.9022, + "eval_samples_per_second": 12.378, + "eval_steps_per_second": 0.195, + "step": 3200 + }, + { + "epoch": 1.17, + "grad_norm": 26.043972128745228, + "learning_rate": 6.648969584271668e-08, + "logits/chosen": -1.7389099597930908, + "logits/rejected": -1.7843681573867798, + "logps/chosen": -92.87619018554688, + "logps/rejected": -143.67486572265625, + "loss": 0.155, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.270666241645813, + "rewards/margins": 3.2985167503356934, + "rewards/rejected": -2.0278501510620117, + "step": 3210 + }, + { + "epoch": 1.17, + "grad_norm": 48.1935636258946, + "learning_rate": 6.601780350584408e-08, + "logits/chosen": -2.341850757598877, + "logits/rejected": -2.0155091285705566, + "logps/chosen": -91.69564819335938, + "logps/rejected": -173.34188842773438, + "loss": 0.1687, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.45611572265625, + "rewards/margins": 4.184123516082764, + "rewards/rejected": -2.7280075550079346, + "step": 3220 + }, + { + "epoch": 1.17, + "grad_norm": 23.817770856670688, + "learning_rate": 6.55462716584783e-08, + "logits/chosen": -1.6578280925750732, + "logits/rejected": -1.547189712524414, + "logps/chosen": -70.36898803710938, + "logps/rejected": -163.3883819580078, + "loss": 0.1816, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5254313945770264, + "rewards/margins": 4.97360897064209, + "rewards/rejected": -3.4481780529022217, + "step": 3230 + }, + { + "epoch": 1.18, + "grad_norm": 30.43724197841503, + "learning_rate": 6.507511922497525e-08, + "logits/chosen": -1.9906114339828491, + "logits/rejected": -1.7292976379394531, + "logps/chosen": -70.70221710205078, + "logps/rejected": -145.86181640625, + "loss": 0.1662, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5770456790924072, + "rewards/margins": 4.609046459197998, + "rewards/rejected": -3.03200101852417, + "step": 3240 + }, + { + "epoch": 1.18, + "grad_norm": 30.164526587495324, + "learning_rate": 6.460436511446348e-08, + "logits/chosen": -1.9928280115127563, + "logits/rejected": -1.7679364681243896, + "logps/chosen": -100.5902328491211, + "logps/rejected": -178.82383728027344, + "loss": 0.1463, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6832008361816406, + "rewards/margins": 4.548179626464844, + "rewards/rejected": -3.864978790283203, + "step": 3250 + }, + { + "epoch": 1.18, + "grad_norm": 52.18169505988799, + "learning_rate": 6.413402822008541e-08, + "logits/chosen": -2.0655717849731445, + "logits/rejected": -1.3271939754486084, + "logps/chosen": -87.31197357177734, + "logps/rejected": -160.46385192871094, + "loss": 0.1905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8497316241264343, + "rewards/margins": 3.820449113845825, + "rewards/rejected": -2.9707179069519043, + "step": 3260 + }, + { + "epoch": 1.19, + "grad_norm": 28.76843973839471, + "learning_rate": 6.366412741823888e-08, + "logits/chosen": -1.6622741222381592, + "logits/rejected": -1.776906967163086, + "logps/chosen": -94.9710693359375, + "logps/rejected": -128.8150634765625, + "loss": 0.1917, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5094915628433228, + "rewards/margins": 3.212104082107544, + "rewards/rejected": -1.7026126384735107, + "step": 3270 + }, + { + "epoch": 1.19, + "grad_norm": 29.1350717468671, + "learning_rate": 6.31946815678198e-08, + "logits/chosen": -2.0058090686798096, + "logits/rejected": -1.7673218250274658, + "logps/chosen": -95.3953857421875, + "logps/rejected": -176.0774688720703, + "loss": 0.1337, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4662463665008545, + "rewards/margins": 4.142214775085449, + "rewards/rejected": -2.6759676933288574, + "step": 3280 + }, + { + "epoch": 1.19, + "grad_norm": 27.79208712883472, + "learning_rate": 6.272570950946508e-08, + "logits/chosen": -2.106937885284424, + "logits/rejected": -1.6478996276855469, + "logps/chosen": -85.5794906616211, + "logps/rejected": -173.1703338623047, + "loss": 0.1716, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1876065731048584, + "rewards/margins": 4.5130767822265625, + "rewards/rejected": -3.325469970703125, + "step": 3290 + }, + { + "epoch": 1.2, + "grad_norm": 26.268117829624686, + "learning_rate": 6.225723006479663e-08, + "logits/chosen": -2.263885259628296, + "logits/rejected": -1.6747421026229858, + "logps/chosen": -63.94829559326172, + "logps/rejected": -163.29486083984375, + "loss": 0.1694, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9152793884277344, + "rewards/margins": 4.830869197845459, + "rewards/rejected": -2.9155895709991455, + "step": 3300 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -2.4307594299316406, + "eval_logits/rejected": -2.257986545562744, + "eval_logps/chosen": -91.0833969116211, + "eval_logps/rejected": -151.17442321777344, + "eval_loss": 0.27279049158096313, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.887286365032196, + "eval_rewards/margins": 3.4675843715667725, + "eval_rewards/rejected": -2.5802979469299316, + "eval_runtime": 71.844, + "eval_samples_per_second": 12.388, + "eval_steps_per_second": 0.195, + "step": 3300 + }, + { + "epoch": 1.2, + "grad_norm": 20.771596605110865, + "learning_rate": 6.178926203566588e-08, + "logits/chosen": -1.9085609912872314, + "logits/rejected": -1.6423466205596924, + "logps/chosen": -88.54236602783203, + "logps/rejected": -151.7003631591797, + "loss": 0.1747, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4402354955673218, + "rewards/margins": 4.408102989196777, + "rewards/rejected": -2.9678680896759033, + "step": 3310 + }, + { + "epoch": 1.21, + "grad_norm": 23.09159131359664, + "learning_rate": 6.132182420339918e-08, + "logits/chosen": -1.6666587591171265, + "logits/rejected": -1.2324297428131104, + "logps/chosen": -81.8795166015625, + "logps/rejected": -138.54940795898438, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4413508176803589, + "rewards/margins": 3.961609363555908, + "rewards/rejected": -2.5202584266662598, + "step": 3320 + }, + { + "epoch": 1.21, + "grad_norm": 33.84562565815576, + "learning_rate": 6.085493532804413e-08, + "logits/chosen": -2.300516128540039, + "logits/rejected": -2.122788906097412, + "logps/chosen": -86.87980651855469, + "logps/rejected": -168.46022033691406, + "loss": 0.1544, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0005496740341187, + "rewards/margins": 3.9717116355895996, + "rewards/rejected": -2.9711620807647705, + "step": 3330 + }, + { + "epoch": 1.21, + "grad_norm": 14.408331018978265, + "learning_rate": 6.03886141476166e-08, + "logits/chosen": -2.2469568252563477, + "logits/rejected": -1.8382642269134521, + "logps/chosen": -81.93803405761719, + "logps/rejected": -163.33473205566406, + "loss": 0.1334, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7893930673599243, + "rewards/margins": 4.628080368041992, + "rewards/rejected": -2.8386874198913574, + "step": 3340 + }, + { + "epoch": 1.22, + "grad_norm": 33.15124696775361, + "learning_rate": 5.992287937734873e-08, + "logits/chosen": -2.054621934890747, + "logits/rejected": -1.8777787685394287, + "logps/chosen": -84.07951354980469, + "logps/rejected": -179.1838836669922, + "loss": 0.144, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4575679302215576, + "rewards/margins": 4.167536735534668, + "rewards/rejected": -2.7099688053131104, + "step": 3350 + }, + { + "epoch": 1.22, + "grad_norm": 32.145978602477854, + "learning_rate": 5.9457749708937756e-08, + "logits/chosen": -1.9643110036849976, + "logits/rejected": -1.795340895652771, + "logps/chosen": -82.72921752929688, + "logps/rejected": -128.0702667236328, + "loss": 0.1543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9123824238777161, + "rewards/margins": 3.376009464263916, + "rewards/rejected": -2.4636270999908447, + "step": 3360 + }, + { + "epoch": 1.22, + "grad_norm": 49.92991955367269, + "learning_rate": 5.8993243809795915e-08, + "logits/chosen": -1.8667415380477905, + "logits/rejected": -1.3340458869934082, + "logps/chosen": -91.13977813720703, + "logps/rejected": -158.5907440185547, + "loss": 0.1547, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.49219757318496704, + "rewards/margins": 3.9671154022216797, + "rewards/rejected": -3.4749176502227783, + "step": 3370 + }, + { + "epoch": 1.23, + "grad_norm": 31.770406094051957, + "learning_rate": 5.852938032230126e-08, + "logits/chosen": -2.3319461345672607, + "logits/rejected": -1.882067322731018, + "logps/chosen": -76.39924621582031, + "logps/rejected": -176.21929931640625, + "loss": 0.1699, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2963407039642334, + "rewards/margins": 6.4765625, + "rewards/rejected": -4.180222034454346, + "step": 3380 + }, + { + "epoch": 1.23, + "grad_norm": 21.3080930046784, + "learning_rate": 5.806617786304937e-08, + "logits/chosen": -1.8695051670074463, + "logits/rejected": -1.3510863780975342, + "logps/chosen": -86.91441345214844, + "logps/rejected": -250.5083770751953, + "loss": 0.1788, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.846575140953064, + "rewards/margins": 5.218047142028809, + "rewards/rejected": -4.371471405029297, + "step": 3390 + }, + { + "epoch": 1.23, + "grad_norm": 41.88504763461129, + "learning_rate": 5.760365502210634e-08, + "logits/chosen": -1.7990360260009766, + "logits/rejected": -1.9284346103668213, + "logps/chosen": -91.82389068603516, + "logps/rejected": -152.16659545898438, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4950382709503174, + "rewards/margins": 4.346599578857422, + "rewards/rejected": -2.8515613079071045, + "step": 3400 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -2.414144277572632, + "eval_logits/rejected": -2.243586778640747, + "eval_logps/chosen": -91.0235366821289, + "eval_logps/rejected": -152.044921875, + "eval_loss": 0.26993635296821594, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.8932717442512512, + "eval_rewards/margins": 3.5606191158294678, + "eval_rewards/rejected": -2.6673471927642822, + "eval_runtime": 71.993, + "eval_samples_per_second": 12.362, + "eval_steps_per_second": 0.194, + "step": 3400 + }, + { + "epoch": 1.24, + "grad_norm": 25.05732311627679, + "learning_rate": 5.7141830362262514e-08, + "logits/chosen": -1.7883466482162476, + "logits/rejected": -1.5041227340698242, + "logps/chosen": -86.22000885009766, + "logps/rejected": -157.66397094726562, + "loss": 0.1676, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7772796154022217, + "rewards/margins": 4.851300239562988, + "rewards/rejected": -3.0740203857421875, + "step": 3410 + }, + { + "epoch": 1.24, + "grad_norm": 44.30312099667292, + "learning_rate": 5.6680722418287674e-08, + "logits/chosen": -2.064953565597534, + "logits/rejected": -1.8630918264389038, + "logps/chosen": -60.13167190551758, + "logps/rejected": -247.2148895263672, + "loss": 0.1305, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1393609046936035, + "rewards/margins": 4.484022617340088, + "rewards/rejected": -2.3446614742279053, + "step": 3420 + }, + { + "epoch": 1.25, + "grad_norm": 48.64244001308947, + "learning_rate": 5.622034969618704e-08, + "logits/chosen": -1.976629614830017, + "logits/rejected": -1.4999226331710815, + "logps/chosen": -77.93717956542969, + "logps/rejected": -151.5218963623047, + "loss": 0.1785, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5172656774520874, + "rewards/margins": 3.7420005798339844, + "rewards/rejected": -2.2247347831726074, + "step": 3430 + }, + { + "epoch": 1.25, + "grad_norm": 35.51662255328931, + "learning_rate": 5.576073067245862e-08, + "logits/chosen": -1.8411935567855835, + "logits/rejected": -1.700330376625061, + "logps/chosen": -67.32380676269531, + "logps/rejected": -140.62051391601562, + "loss": 0.167, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6732587814331055, + "rewards/margins": 4.312959671020508, + "rewards/rejected": -2.6397011280059814, + "step": 3440 + }, + { + "epoch": 1.25, + "grad_norm": 42.40486506801471, + "learning_rate": 5.530188379335166e-08, + "logits/chosen": -1.5920283794403076, + "logits/rejected": -1.1166603565216064, + "logps/chosen": -70.99920654296875, + "logps/rejected": -169.20806884765625, + "loss": 0.1756, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8735821843147278, + "rewards/margins": 5.562444686889648, + "rewards/rejected": -4.6888628005981445, + "step": 3450 + }, + { + "epoch": 1.26, + "grad_norm": 26.913972282878095, + "learning_rate": 5.4843827474126274e-08, + "logits/chosen": -1.9140102863311768, + "logits/rejected": -1.400821328163147, + "logps/chosen": -74.26724243164062, + "logps/rejected": -132.2233123779297, + "loss": 0.1374, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5520453453063965, + "rewards/margins": 4.453476905822754, + "rewards/rejected": -2.90143084526062, + "step": 3460 + }, + { + "epoch": 1.26, + "grad_norm": 26.13373331897645, + "learning_rate": 5.438658009831448e-08, + "logits/chosen": -1.73845636844635, + "logits/rejected": -1.6531593799591064, + "logps/chosen": -96.46147918701172, + "logps/rejected": -160.9883575439453, + "loss": 0.1403, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2454642057418823, + "rewards/margins": 4.1239237785339355, + "rewards/rejected": -2.8784596920013428, + "step": 3470 + }, + { + "epoch": 1.26, + "grad_norm": 34.42235214848031, + "learning_rate": 5.39301600169823e-08, + "logits/chosen": -1.9057807922363281, + "logits/rejected": -1.927220344543457, + "logps/chosen": -115.20938873291016, + "logps/rejected": -181.9356231689453, + "loss": 0.1796, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7145345211029053, + "rewards/margins": 4.270758152008057, + "rewards/rejected": -3.5562233924865723, + "step": 3480 + }, + { + "epoch": 1.27, + "grad_norm": 20.971370688650854, + "learning_rate": 5.347458554799332e-08, + "logits/chosen": -1.615875005722046, + "logits/rejected": -1.6177091598510742, + "logps/chosen": -118.13121032714844, + "logps/rejected": -160.26910400390625, + "loss": 0.163, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0960075855255127, + "rewards/margins": 3.6775288581848145, + "rewards/rejected": -2.581521511077881, + "step": 3490 + }, + { + "epoch": 1.27, + "grad_norm": 27.13239221195734, + "learning_rate": 5.301987497527353e-08, + "logits/chosen": -2.2539658546447754, + "logits/rejected": -1.4924699068069458, + "logps/chosen": -91.34033203125, + "logps/rejected": -181.88430786132812, + "loss": 0.1526, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.448373556137085, + "rewards/margins": 4.606696128845215, + "rewards/rejected": -3.1583220958709717, + "step": 3500 + }, + { + "epoch": 1.27, + "eval_logits/chosen": -2.421151876449585, + "eval_logits/rejected": -2.2513325214385986, + "eval_logps/chosen": -91.84569549560547, + "eval_logps/rejected": -153.45870971679688, + "eval_loss": 0.2666407823562622, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": 0.8110561370849609, + "eval_rewards/margins": 3.6197829246520996, + "eval_rewards/rejected": -2.8087267875671387, + "eval_runtime": 71.9717, + "eval_samples_per_second": 12.366, + "eval_steps_per_second": 0.195, + "step": 3500 + }, + { + "epoch": 1.27, + "grad_norm": 29.49364758024353, + "learning_rate": 5.256604654807742e-08, + "logits/chosen": -1.7953096628189087, + "logits/rejected": -1.669303297996521, + "logps/chosen": -90.09492492675781, + "logps/rejected": -149.954833984375, + "loss": 0.1912, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0876281261444092, + "rewards/margins": 3.9230809211730957, + "rewards/rejected": -2.8354527950286865, + "step": 3510 + }, + { + "epoch": 1.28, + "grad_norm": 41.572390412456194, + "learning_rate": 5.21131184802557e-08, + "logits/chosen": -2.22542142868042, + "logits/rejected": -1.8966829776763916, + "logps/chosen": -83.38715362548828, + "logps/rejected": -141.80059814453125, + "loss": 0.1877, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1568256616592407, + "rewards/margins": 3.453406572341919, + "rewards/rejected": -2.2965807914733887, + "step": 3520 + }, + { + "epoch": 1.28, + "grad_norm": 24.565593605623125, + "learning_rate": 5.166110894952426e-08, + "logits/chosen": -1.9109958410263062, + "logits/rejected": -1.4489656686782837, + "logps/chosen": -58.008819580078125, + "logps/rejected": -191.261474609375, + "loss": 0.1478, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9802272319793701, + "rewards/margins": 6.276611804962158, + "rewards/rejected": -4.296384811401367, + "step": 3530 + }, + { + "epoch": 1.28, + "grad_norm": 53.61199506683095, + "learning_rate": 5.1210036096734595e-08, + "logits/chosen": -1.5364563465118408, + "logits/rejected": -1.3872387409210205, + "logps/chosen": -107.84410095214844, + "logps/rejected": -160.59361267089844, + "loss": 0.1401, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5507475137710571, + "rewards/margins": 3.1839842796325684, + "rewards/rejected": -2.633236885070801, + "step": 3540 + }, + { + "epoch": 1.29, + "grad_norm": 28.74282956678025, + "learning_rate": 5.0759918025145814e-08, + "logits/chosen": -2.2165632247924805, + "logits/rejected": -1.650313138961792, + "logps/chosen": -93.6681137084961, + "logps/rejected": -145.92111206054688, + "loss": 0.1582, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3355414867401123, + "rewards/margins": 4.038471698760986, + "rewards/rejected": -2.702929973602295, + "step": 3550 + }, + { + "epoch": 1.29, + "grad_norm": 26.165374741112064, + "learning_rate": 5.031077279969797e-08, + "logits/chosen": -1.9472557306289673, + "logits/rejected": -1.7647784948349, + "logps/chosen": -76.48893737792969, + "logps/rejected": -135.52386474609375, + "loss": 0.1404, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.397589921951294, + "rewards/margins": 4.449340343475342, + "rewards/rejected": -3.051750898361206, + "step": 3560 + }, + { + "epoch": 1.3, + "grad_norm": 36.82965132004594, + "learning_rate": 4.9862618446287206e-08, + "logits/chosen": -1.9602140188217163, + "logits/rejected": -1.6700010299682617, + "logps/chosen": -72.4864730834961, + "logps/rejected": -136.77584838867188, + "loss": 0.171, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6417019367218018, + "rewards/margins": 3.702420473098755, + "rewards/rejected": -2.060718536376953, + "step": 3570 + }, + { + "epoch": 1.3, + "grad_norm": 24.181445490171278, + "learning_rate": 4.9415472951042175e-08, + "logits/chosen": -1.9628846645355225, + "logits/rejected": -1.5444905757904053, + "logps/chosen": -81.23819732666016, + "logps/rejected": -182.43032836914062, + "loss": 0.121, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.530139684677124, + "rewards/margins": 4.815797328948975, + "rewards/rejected": -3.2856578826904297, + "step": 3580 + }, + { + "epoch": 1.3, + "grad_norm": 45.8781810453152, + "learning_rate": 4.8969354259602245e-08, + "logits/chosen": -1.7887170314788818, + "logits/rejected": -1.6762058734893799, + "logps/chosen": -87.67835998535156, + "logps/rejected": -148.01345825195312, + "loss": 0.1509, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3987677097320557, + "rewards/margins": 4.317151069641113, + "rewards/rejected": -2.9183835983276367, + "step": 3590 + }, + { + "epoch": 1.31, + "grad_norm": 46.13081539436112, + "learning_rate": 4.85242802763973e-08, + "logits/chosen": -1.9573310613632202, + "logits/rejected": -1.7241952419281006, + "logps/chosen": -68.08113861083984, + "logps/rejected": -126.60243225097656, + "loss": 0.1819, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9216814041137695, + "rewards/margins": 4.243165969848633, + "rewards/rejected": -2.321484327316284, + "step": 3600 + }, + { + "epoch": 1.31, + "eval_logits/chosen": -2.4482152462005615, + "eval_logits/rejected": -2.2676963806152344, + "eval_logps/chosen": -91.19036865234375, + "eval_logps/rejected": -152.58682250976562, + "eval_loss": 0.26565536856651306, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.8765901327133179, + "eval_rewards/margins": 3.5981290340423584, + "eval_rewards/rejected": -2.721538782119751, + "eval_runtime": 71.9607, + "eval_samples_per_second": 12.368, + "eval_steps_per_second": 0.195, + "step": 3600 + }, + { + "epoch": 1.31, + "grad_norm": 23.242650326252527, + "learning_rate": 4.808026886392907e-08, + "logits/chosen": -1.5748409032821655, + "logits/rejected": -1.6680151224136353, + "logps/chosen": -97.38771057128906, + "logps/rejected": -131.87599182128906, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.183221459388733, + "rewards/margins": 2.838090419769287, + "rewards/rejected": -1.654868721961975, + "step": 3610 + }, + { + "epoch": 1.31, + "grad_norm": 20.775972068583737, + "learning_rate": 4.763733784205434e-08, + "logits/chosen": -2.157090902328491, + "logits/rejected": -1.7526382207870483, + "logps/chosen": -83.79398345947266, + "logps/rejected": -138.4102020263672, + "loss": 0.1308, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2044012546539307, + "rewards/margins": 3.7157578468322754, + "rewards/rejected": -2.5113563537597656, + "step": 3620 + }, + { + "epoch": 1.32, + "grad_norm": 27.624631894217778, + "learning_rate": 4.7195504987269736e-08, + "logits/chosen": -1.8666884899139404, + "logits/rejected": -1.962099313735962, + "logps/chosen": -92.00392150878906, + "logps/rejected": -265.90936279296875, + "loss": 0.1808, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.662792444229126, + "rewards/margins": 4.868588924407959, + "rewards/rejected": -3.205796718597412, + "step": 3630 + }, + { + "epoch": 1.32, + "grad_norm": 35.3219065228648, + "learning_rate": 4.6754788031998294e-08, + "logits/chosen": -2.1062629222869873, + "logits/rejected": -1.9411357641220093, + "logps/chosen": -81.11627960205078, + "logps/rejected": -207.1051483154297, + "loss": 0.1747, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1783745288848877, + "rewards/margins": 4.6277360916137695, + "rewards/rejected": -3.4493613243103027, + "step": 3640 + }, + { + "epoch": 1.32, + "grad_norm": 15.460191475491241, + "learning_rate": 4.631520466387777e-08, + "logits/chosen": -1.867004632949829, + "logits/rejected": -1.6915152072906494, + "logps/chosen": -110.6289291381836, + "logps/rejected": -156.9150390625, + "loss": 0.1242, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.2745405435562134, + "rewards/margins": 3.7722878456115723, + "rewards/rejected": -2.4977474212646484, + "step": 3650 + }, + { + "epoch": 1.33, + "grad_norm": 35.15290230831595, + "learning_rate": 4.587677252505077e-08, + "logits/chosen": -1.8647207021713257, + "logits/rejected": -1.6314834356307983, + "logps/chosen": -82.48086547851562, + "logps/rejected": -158.58407592773438, + "loss": 0.1358, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5844959020614624, + "rewards/margins": 4.390726566314697, + "rewards/rejected": -2.806230068206787, + "step": 3660 + }, + { + "epoch": 1.33, + "grad_norm": 31.64353347707315, + "learning_rate": 4.5439509211456734e-08, + "logits/chosen": -2.0931248664855957, + "logits/rejected": -1.6264079809188843, + "logps/chosen": -62.890602111816406, + "logps/rejected": -150.46688842773438, + "loss": 0.1504, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1719433069229126, + "rewards/margins": 4.196671009063721, + "rewards/rejected": -3.0247273445129395, + "step": 3670 + }, + { + "epoch": 1.34, + "grad_norm": 14.71173714786521, + "learning_rate": 4.500343227212572e-08, + "logits/chosen": -1.7733246088027954, + "logits/rejected": -2.041614294052124, + "logps/chosen": -123.92191314697266, + "logps/rejected": -155.9501953125, + "loss": 0.1176, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6424921154975891, + "rewards/margins": 3.105193853378296, + "rewards/rejected": -2.4627020359039307, + "step": 3680 + }, + { + "epoch": 1.34, + "grad_norm": 24.247587972420103, + "learning_rate": 4.4568559208474127e-08, + "logits/chosen": -1.6494388580322266, + "logits/rejected": -1.4123470783233643, + "logps/chosen": -117.4607162475586, + "logps/rejected": -177.09315490722656, + "loss": 0.1495, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.41657859086990356, + "rewards/margins": 4.236758232116699, + "rewards/rejected": -3.8201801776885986, + "step": 3690 + }, + { + "epoch": 1.34, + "grad_norm": 26.64386939815121, + "learning_rate": 4.4134907473602205e-08, + "logits/chosen": -2.0543887615203857, + "logits/rejected": -1.703619360923767, + "logps/chosen": -76.43885803222656, + "logps/rejected": -196.35208129882812, + "loss": 0.1192, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2960059642791748, + "rewards/margins": 4.0435991287231445, + "rewards/rejected": -2.747593641281128, + "step": 3700 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.443880319595337, + "eval_logits/rejected": -2.264932155609131, + "eval_logps/chosen": -91.3131332397461, + "eval_logps/rejected": -153.05369567871094, + "eval_loss": 0.26342639327049255, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.8643126487731934, + "eval_rewards/margins": 3.6325364112854004, + "eval_rewards/rejected": -2.768224000930786, + "eval_runtime": 71.8523, + "eval_samples_per_second": 12.387, + "eval_steps_per_second": 0.195, + "step": 3700 + }, + { + "epoch": 1.35, + "grad_norm": 40.03721241840345, + "learning_rate": 4.370249447159372e-08, + "logits/chosen": -1.892690896987915, + "logits/rejected": -1.5442091226577759, + "logps/chosen": -105.28565979003906, + "logps/rejected": -149.9592742919922, + "loss": 0.1496, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7859503030776978, + "rewards/margins": 3.7989330291748047, + "rewards/rejected": -3.0129826068878174, + "step": 3710 + }, + { + "epoch": 1.35, + "grad_norm": 35.14173667461832, + "learning_rate": 4.32713375568174e-08, + "logits/chosen": -1.6806223392486572, + "logits/rejected": -1.7743151187896729, + "logps/chosen": -96.13143157958984, + "logps/rejected": -149.65716552734375, + "loss": 0.1656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.204106092453003, + "rewards/margins": 3.2453396320343018, + "rewards/rejected": -2.041234016418457, + "step": 3720 + }, + { + "epoch": 1.35, + "grad_norm": 18.483200235512868, + "learning_rate": 4.284145403323043e-08, + "logits/chosen": -1.508544683456421, + "logits/rejected": -1.6680357456207275, + "logps/chosen": -111.27012634277344, + "logps/rejected": -159.14892578125, + "loss": 0.1446, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.998451828956604, + "rewards/margins": 4.472312927246094, + "rewards/rejected": -3.4738609790802, + "step": 3730 + }, + { + "epoch": 1.36, + "grad_norm": 36.84141617747612, + "learning_rate": 4.2412861153684e-08, + "logits/chosen": -1.8184179067611694, + "logits/rejected": -1.6679880619049072, + "logps/chosen": -99.74363708496094, + "logps/rejected": -159.52243041992188, + "loss": 0.1462, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8888320922851562, + "rewards/margins": 3.3356125354766846, + "rewards/rejected": -2.4467806816101074, + "step": 3740 + }, + { + "epoch": 1.36, + "grad_norm": 20.65082234776276, + "learning_rate": 4.198557611923083e-08, + "logits/chosen": -1.5405653715133667, + "logits/rejected": -1.5137519836425781, + "logps/chosen": -83.92024230957031, + "logps/rejected": -171.1388702392578, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2103972434997559, + "rewards/margins": 5.250774383544922, + "rewards/rejected": -4.040377616882324, + "step": 3750 + }, + { + "epoch": 1.36, + "grad_norm": 25.88209046742063, + "learning_rate": 4.1559616078434946e-08, + "logits/chosen": -2.0902841091156006, + "logits/rejected": -1.6206400394439697, + "logps/chosen": -79.99602508544922, + "logps/rejected": -223.35891723632812, + "loss": 0.1586, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2605037689208984, + "rewards/margins": 4.679810047149658, + "rewards/rejected": -3.419306993484497, + "step": 3760 + }, + { + "epoch": 1.37, + "grad_norm": 31.138553957115032, + "learning_rate": 4.113499812668331e-08, + "logits/chosen": -2.0036697387695312, + "logits/rejected": -1.8283746242523193, + "logps/chosen": -75.56672668457031, + "logps/rejected": -130.79147338867188, + "loss": 0.1369, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.610881507396698, + "rewards/margins": 3.6262385845184326, + "rewards/rejected": -3.015357255935669, + "step": 3770 + }, + { + "epoch": 1.37, + "grad_norm": 22.960729088747243, + "learning_rate": 4.071173930549979e-08, + "logits/chosen": -1.7299280166625977, + "logits/rejected": -1.4976304769515991, + "logps/chosen": -94.44389343261719, + "logps/rejected": -154.29798889160156, + "loss": 0.1498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5378260612487793, + "rewards/margins": 3.7743492126464844, + "rewards/rejected": -3.236523151397705, + "step": 3780 + }, + { + "epoch": 1.38, + "grad_norm": 28.61684578895429, + "learning_rate": 4.0289856601861285e-08, + "logits/chosen": -1.8740425109863281, + "logits/rejected": -1.5999120473861694, + "logps/chosen": -87.98422241210938, + "logps/rejected": -153.87075805664062, + "loss": 0.1542, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.4116048216819763, + "rewards/margins": 3.741361141204834, + "rewards/rejected": -3.32975697517395, + "step": 3790 + }, + { + "epoch": 1.38, + "grad_norm": 35.76212329882805, + "learning_rate": 3.9869366947515747e-08, + "logits/chosen": -2.0777387619018555, + "logits/rejected": -1.9436269998550415, + "logps/chosen": -91.76081848144531, + "logps/rejected": -149.74957275390625, + "loss": 0.144, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5345284938812256, + "rewards/margins": 4.457833290100098, + "rewards/rejected": -2.923304557800293, + "step": 3800 + }, + { + "epoch": 1.38, + "eval_logits/chosen": -2.445099115371704, + "eval_logits/rejected": -2.2687337398529053, + "eval_logps/chosen": -92.02857971191406, + "eval_logps/rejected": -154.06690979003906, + "eval_loss": 0.263904869556427, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7927670478820801, + "eval_rewards/margins": 3.662313938140869, + "eval_rewards/rejected": -2.869547128677368, + "eval_runtime": 72.6671, + "eval_samples_per_second": 12.248, + "eval_steps_per_second": 0.193, + "step": 3800 + }, + { + "epoch": 1.38, + "grad_norm": 34.70092608089086, + "learning_rate": 3.945028721830289e-08, + "logits/chosen": -1.8025611639022827, + "logits/rejected": -1.4805386066436768, + "logps/chosen": -94.7345962524414, + "logps/rejected": -197.67295837402344, + "loss": 0.1849, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3399019241333008, + "rewards/margins": 4.401653289794922, + "rewards/rejected": -3.061751127243042, + "step": 3810 + }, + { + "epoch": 1.39, + "grad_norm": 35.29839166964984, + "learning_rate": 3.903263423347678e-08, + "logits/chosen": -1.9683891534805298, + "logits/rejected": -1.4604440927505493, + "logps/chosen": -89.45817565917969, + "logps/rejected": -148.0181121826172, + "loss": 0.1761, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6148315072059631, + "rewards/margins": 3.239147186279297, + "rewards/rejected": -2.624316453933716, + "step": 3820 + }, + { + "epoch": 1.39, + "grad_norm": 39.48429335160676, + "learning_rate": 3.8616424755030845e-08, + "logits/chosen": -1.5986571311950684, + "logits/rejected": -1.3868935108184814, + "logps/chosen": -101.1136474609375, + "logps/rejected": -143.46604919433594, + "loss": 0.1891, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.8768807649612427, + "rewards/margins": 3.3683598041534424, + "rewards/rejected": -2.4914793968200684, + "step": 3830 + }, + { + "epoch": 1.39, + "grad_norm": 49.23729367857185, + "learning_rate": 3.820167548702516e-08, + "logits/chosen": -1.5271122455596924, + "logits/rejected": -1.0103445053100586, + "logps/chosen": -68.83717346191406, + "logps/rejected": -165.7958984375, + "loss": 0.1597, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0218682289123535, + "rewards/margins": 5.000029563903809, + "rewards/rejected": -3.978161334991455, + "step": 3840 + }, + { + "epoch": 1.4, + "grad_norm": 24.868995408783775, + "learning_rate": 3.778840307491595e-08, + "logits/chosen": -1.700553297996521, + "logits/rejected": -1.1538689136505127, + "logps/chosen": -91.43379211425781, + "logps/rejected": -162.2164306640625, + "loss": 0.1497, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1438677310943604, + "rewards/margins": 4.128595352172852, + "rewards/rejected": -2.9847278594970703, + "step": 3850 + }, + { + "epoch": 1.4, + "grad_norm": 27.603218968201812, + "learning_rate": 3.737662410488772e-08, + "logits/chosen": -1.768294334411621, + "logits/rejected": -1.8469886779785156, + "logps/chosen": -95.86221313476562, + "logps/rejected": -161.03114318847656, + "loss": 0.1792, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.108917236328125, + "rewards/margins": 4.2280168533325195, + "rewards/rejected": -3.1190993785858154, + "step": 3860 + }, + { + "epoch": 1.4, + "grad_norm": 36.9707293925801, + "learning_rate": 3.696635510318747e-08, + "logits/chosen": -2.072791814804077, + "logits/rejected": -1.609438180923462, + "logps/chosen": -96.32298278808594, + "logps/rejected": -139.48504638671875, + "loss": 0.1769, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.095580816268921, + "rewards/margins": 2.998603105545044, + "rewards/rejected": -1.9030221700668335, + "step": 3870 + }, + { + "epoch": 1.41, + "grad_norm": 18.130530465583483, + "learning_rate": 3.655761253546142e-08, + "logits/chosen": -1.739894151687622, + "logits/rejected": -1.5710958242416382, + "logps/chosen": -73.37211608886719, + "logps/rejected": -147.71841430664062, + "loss": 0.1407, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7478740215301514, + "rewards/margins": 4.458775520324707, + "rewards/rejected": -2.7109017372131348, + "step": 3880 + }, + { + "epoch": 1.41, + "grad_norm": 41.20250396110067, + "learning_rate": 3.6150412806094344e-08, + "logits/chosen": -2.221762180328369, + "logits/rejected": -1.7284727096557617, + "logps/chosen": -87.04502868652344, + "logps/rejected": -178.15328979492188, + "loss": 0.1511, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5952017307281494, + "rewards/margins": 4.024810791015625, + "rewards/rejected": -3.4296088218688965, + "step": 3890 + }, + { + "epoch": 1.42, + "grad_norm": 30.898435410912892, + "learning_rate": 3.574477225755092e-08, + "logits/chosen": -1.9189503192901611, + "logits/rejected": -1.6017463207244873, + "logps/chosen": -91.14994049072266, + "logps/rejected": -140.59140014648438, + "loss": 0.1603, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5078619718551636, + "rewards/margins": 3.65142560005188, + "rewards/rejected": -2.143563747406006, + "step": 3900 + }, + { + "epoch": 1.42, + "eval_logits/chosen": -2.447599411010742, + "eval_logits/rejected": -2.271988868713379, + "eval_logps/chosen": -89.99532318115234, + "eval_logps/rejected": -152.36782836914062, + "eval_loss": 0.2630765736103058, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": 0.9960936903953552, + "eval_rewards/margins": 3.695732831954956, + "eval_rewards/rejected": -2.699639081954956, + "eval_runtime": 71.5499, + "eval_samples_per_second": 12.439, + "eval_steps_per_second": 0.196, + "step": 3900 + }, + { + "epoch": 1.42, + "grad_norm": 15.623201016392487, + "learning_rate": 3.534070716972011e-08, + "logits/chosen": -1.98562490940094, + "logits/rejected": -1.7183992862701416, + "logps/chosen": -67.69291687011719, + "logps/rejected": -132.70785522460938, + "loss": 0.1543, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.898485541343689, + "rewards/margins": 4.570878028869629, + "rewards/rejected": -2.6723921298980713, + "step": 3910 + }, + { + "epoch": 1.42, + "grad_norm": 29.502016145046106, + "learning_rate": 3.493823375926165e-08, + "logits/chosen": -2.040684938430786, + "logits/rejected": -1.617761254310608, + "logps/chosen": -94.72434997558594, + "logps/rejected": -180.46612548828125, + "loss": 0.1709, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4578778743743896, + "rewards/margins": 4.434561729431152, + "rewards/rejected": -2.9766833782196045, + "step": 3920 + }, + { + "epoch": 1.43, + "grad_norm": 43.61044884590936, + "learning_rate": 3.4537368178955237e-08, + "logits/chosen": -2.5114989280700684, + "logits/rejected": -1.929688811302185, + "logps/chosen": -63.439727783203125, + "logps/rejected": -167.50003051757812, + "loss": 0.1306, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4715431928634644, + "rewards/margins": 5.039919853210449, + "rewards/rejected": -3.5683765411376953, + "step": 3930 + }, + { + "epoch": 1.43, + "grad_norm": 31.861912930744854, + "learning_rate": 3.4138126517052315e-08, + "logits/chosen": -2.129574775695801, + "logits/rejected": -2.0275707244873047, + "logps/chosen": -91.8274917602539, + "logps/rejected": -152.502685546875, + "loss": 0.1992, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9415672421455383, + "rewards/margins": 3.8349013328552246, + "rewards/rejected": -2.893334150314331, + "step": 3940 + }, + { + "epoch": 1.43, + "grad_norm": 10.50472857948733, + "learning_rate": 3.374052479663024e-08, + "logits/chosen": -1.59491765499115, + "logits/rejected": -1.3649214506149292, + "logps/chosen": -104.72843933105469, + "logps/rejected": -193.97207641601562, + "loss": 0.1731, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1260311603546143, + "rewards/margins": 5.22696590423584, + "rewards/rejected": -4.1009345054626465, + "step": 3950 + }, + { + "epoch": 1.44, + "grad_norm": 23.4008383750431, + "learning_rate": 3.334457897494941e-08, + "logits/chosen": -1.9443241357803345, + "logits/rejected": -1.6959731578826904, + "logps/chosen": -85.87373352050781, + "logps/rejected": -158.98065185546875, + "loss": 0.1546, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3743432760238647, + "rewards/margins": 4.179642677307129, + "rewards/rejected": -2.8052992820739746, + "step": 3960 + }, + { + "epoch": 1.44, + "grad_norm": 33.938523290472, + "learning_rate": 3.29503049428127e-08, + "logits/chosen": -1.9471511840820312, + "logits/rejected": -1.7361555099487305, + "logps/chosen": -84.15934753417969, + "logps/rejected": -153.57553100585938, + "loss": 0.1711, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.410812497138977, + "rewards/margins": 4.902033805847168, + "rewards/rejected": -3.4912219047546387, + "step": 3970 + }, + { + "epoch": 1.44, + "grad_norm": 42.88422707114712, + "learning_rate": 3.255771852392775e-08, + "logits/chosen": -1.8716990947723389, + "logits/rejected": -1.7490049600601196, + "logps/chosen": -93.93757629394531, + "logps/rejected": -118.72102355957031, + "loss": 0.1712, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7341419458389282, + "rewards/margins": 2.5488927364349365, + "rewards/rejected": -1.8147506713867188, + "step": 3980 + }, + { + "epoch": 1.45, + "grad_norm": 31.96232509403753, + "learning_rate": 3.2166835474271995e-08, + "logits/chosen": -2.2499547004699707, + "logits/rejected": -1.7057373523712158, + "logps/chosen": -70.68228912353516, + "logps/rejected": -136.07669067382812, + "loss": 0.1381, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.8701726794242859, + "rewards/margins": 2.541351556777954, + "rewards/rejected": -1.671179175376892, + "step": 3990 + }, + { + "epoch": 1.45, + "grad_norm": 33.013579048970634, + "learning_rate": 3.177767148146004e-08, + "logits/chosen": -1.196561336517334, + "logits/rejected": -0.9995512962341309, + "logps/chosen": -89.14258575439453, + "logps/rejected": -131.5675048828125, + "loss": 0.2054, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6014581918716431, + "rewards/margins": 2.8417820930480957, + "rewards/rejected": -2.240324020385742, + "step": 4000 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -2.5068588256835938, + "eval_logits/rejected": -2.316378116607666, + "eval_logps/chosen": -92.20320129394531, + "eval_logps/rejected": -153.47879028320312, + "eval_loss": 0.25913339853286743, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7753064036369324, + "eval_rewards/margins": 3.5860416889190674, + "eval_rewards/rejected": -2.8107354640960693, + "eval_runtime": 71.5791, + "eval_samples_per_second": 12.434, + "eval_steps_per_second": 0.196, + "step": 4000 + }, + { + "epoch": 1.46, + "grad_norm": 29.18650004448433, + "learning_rate": 3.139024216411438e-08, + "logits/chosen": -1.9775199890136719, + "logits/rejected": -1.7694860696792603, + "logps/chosen": -74.32111358642578, + "logps/rejected": -114.33467102050781, + "loss": 0.137, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9357797503471375, + "rewards/margins": 3.2922019958496094, + "rewards/rejected": -2.356421947479248, + "step": 4010 + }, + { + "epoch": 1.46, + "grad_norm": 29.240217688028995, + "learning_rate": 3.100456307123838e-08, + "logits/chosen": -2.0903449058532715, + "logits/rejected": -1.7641003131866455, + "logps/chosen": -81.42185974121094, + "logps/rejected": -165.4561309814453, + "loss": 0.1658, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1881463527679443, + "rewards/margins": 4.836016654968262, + "rewards/rejected": -3.6478705406188965, + "step": 4020 + }, + { + "epoch": 1.46, + "grad_norm": 30.89971434639723, + "learning_rate": 3.062064968159231e-08, + "logits/chosen": -2.3161113262176514, + "logits/rejected": -1.780773401260376, + "logps/chosen": -84.45279693603516, + "logps/rejected": -165.2125701904297, + "loss": 0.1622, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.193415880203247, + "rewards/margins": 4.027709484100342, + "rewards/rejected": -2.834293842315674, + "step": 4030 + }, + { + "epoch": 1.47, + "grad_norm": 17.28427871490954, + "learning_rate": 3.023851740307201e-08, + "logits/chosen": -2.0731730461120605, + "logits/rejected": -1.6455036401748657, + "logps/chosen": -110.5876693725586, + "logps/rejected": -180.8351287841797, + "loss": 0.1633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24699239432811737, + "rewards/margins": 3.7656092643737793, + "rewards/rejected": -3.5186171531677246, + "step": 4040 + }, + { + "epoch": 1.47, + "grad_norm": 25.73309666117762, + "learning_rate": 2.9858181572090675e-08, + "logits/chosen": -1.7583844661712646, + "logits/rejected": -1.8522754907608032, + "logps/chosen": -94.708251953125, + "logps/rejected": -162.52867126464844, + "loss": 0.1325, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0252021551132202, + "rewards/margins": 3.6236395835876465, + "rewards/rejected": -2.5984373092651367, + "step": 4050 + }, + { + "epoch": 1.47, + "grad_norm": 31.111251255931403, + "learning_rate": 2.9479657452963256e-08, + "logits/chosen": -2.248737096786499, + "logits/rejected": -1.7086880207061768, + "logps/chosen": -104.32584381103516, + "logps/rejected": -170.23004150390625, + "loss": 0.151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08391883969306946, + "rewards/margins": 3.075791835784912, + "rewards/rejected": -2.991872787475586, + "step": 4060 + }, + { + "epoch": 1.48, + "grad_norm": 18.513737492370375, + "learning_rate": 2.910296023729384e-08, + "logits/chosen": -1.4177262783050537, + "logits/rejected": -1.364580750465393, + "logps/chosen": -89.30236053466797, + "logps/rejected": -147.00576782226562, + "loss": 0.1544, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0421960353851318, + "rewards/margins": 3.4464340209960938, + "rewards/rejected": -2.404238224029541, + "step": 4070 + }, + { + "epoch": 1.48, + "grad_norm": 45.87936243907442, + "learning_rate": 2.8728105043365984e-08, + "logits/chosen": -1.8459665775299072, + "logits/rejected": -1.5093791484832764, + "logps/chosen": -97.5422592163086, + "logps/rejected": -131.14620971679688, + "loss": 0.1398, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9450712203979492, + "rewards/margins": 3.339611530303955, + "rewards/rejected": -2.394540309906006, + "step": 4080 + }, + { + "epoch": 1.48, + "grad_norm": 27.364987131956532, + "learning_rate": 2.8355106915535932e-08, + "logits/chosen": -1.9753601551055908, + "logits/rejected": -1.554747462272644, + "logps/chosen": -72.57929229736328, + "logps/rejected": -136.12213134765625, + "loss": 0.135, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8972055315971375, + "rewards/margins": 4.053958415985107, + "rewards/rejected": -3.156752824783325, + "step": 4090 + }, + { + "epoch": 1.49, + "grad_norm": 32.389832751742276, + "learning_rate": 2.798398082362886e-08, + "logits/chosen": -1.6929162740707397, + "logits/rejected": -1.7215791940689087, + "logps/chosen": -115.38690185546875, + "logps/rejected": -146.203125, + "loss": 0.1413, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.22363097965717316, + "rewards/margins": 3.6949760913848877, + "rewards/rejected": -3.4713454246520996, + "step": 4100 + }, + { + "epoch": 1.49, + "eval_logits/chosen": -2.485478639602661, + "eval_logits/rejected": -2.2992606163024902, + "eval_logps/chosen": -91.98760223388672, + "eval_logps/rejected": -154.0671844482422, + "eval_loss": 0.26037731766700745, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.796866238117218, + "eval_rewards/margins": 3.6664421558380127, + "eval_rewards/rejected": -2.8695759773254395, + "eval_runtime": 71.6081, + "eval_samples_per_second": 12.429, + "eval_steps_per_second": 0.196, + "step": 4100 + }, + { + "epoch": 1.49, + "grad_norm": 14.2142969265953, + "learning_rate": 2.761474166233805e-08, + "logits/chosen": -1.9014062881469727, + "logits/rejected": -1.554809331893921, + "logps/chosen": -91.71360778808594, + "logps/rejected": -198.91380310058594, + "loss": 0.1107, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8375331163406372, + "rewards/margins": 5.089506149291992, + "rewards/rejected": -4.2519731521606445, + "step": 4110 + }, + { + "epoch": 1.5, + "grad_norm": 36.46497820122211, + "learning_rate": 2.724740425062714e-08, + "logits/chosen": -1.5665128231048584, + "logits/rejected": -1.6638376712799072, + "logps/chosen": -97.43113708496094, + "logps/rejected": -185.1346893310547, + "loss": 0.1309, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.911544680595398, + "rewards/margins": 4.338427543640137, + "rewards/rejected": -3.4268829822540283, + "step": 4120 + }, + { + "epoch": 1.5, + "grad_norm": 50.17754349555712, + "learning_rate": 2.6881983331135378e-08, + "logits/chosen": -1.887635588645935, + "logits/rejected": -1.7573896646499634, + "logps/chosen": -80.06953430175781, + "logps/rejected": -133.02284240722656, + "loss": 0.1722, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.585761547088623, + "rewards/margins": 3.1335628032684326, + "rewards/rejected": -2.5478012561798096, + "step": 4130 + }, + { + "epoch": 1.5, + "grad_norm": 22.44755921748364, + "learning_rate": 2.6518493569585857e-08, + "logits/chosen": -1.984948754310608, + "logits/rejected": -2.0342633724212646, + "logps/chosen": -114.79627990722656, + "logps/rejected": -142.2074432373047, + "loss": 0.199, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6468585729598999, + "rewards/margins": 2.1931614875793457, + "rewards/rejected": -1.546303153038025, + "step": 4140 + }, + { + "epoch": 1.51, + "grad_norm": 24.955293029439613, + "learning_rate": 2.6156949554197095e-08, + "logits/chosen": -2.307645797729492, + "logits/rejected": -1.599973201751709, + "logps/chosen": -90.35932922363281, + "logps/rejected": -148.82672119140625, + "loss": 0.1611, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5787138938903809, + "rewards/margins": 3.0710272789001465, + "rewards/rejected": -2.492313861846924, + "step": 4150 + }, + { + "epoch": 1.51, + "grad_norm": 36.71467993455758, + "learning_rate": 2.5797365795097407e-08, + "logits/chosen": -1.9828819036483765, + "logits/rejected": -1.6299062967300415, + "logps/chosen": -103.05985260009766, + "logps/rejected": -187.84640502929688, + "loss": 0.1561, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7202814817428589, + "rewards/margins": 4.248810768127441, + "rewards/rejected": -3.528529405593872, + "step": 4160 + }, + { + "epoch": 1.51, + "grad_norm": 25.40867584669409, + "learning_rate": 2.543975672374264e-08, + "logits/chosen": -1.7713234424591064, + "logits/rejected": -1.78484308719635, + "logps/chosen": -91.5001220703125, + "logps/rejected": -149.6800079345703, + "loss": 0.1647, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5808213353157043, + "rewards/margins": 3.1474668979644775, + "rewards/rejected": -2.566645383834839, + "step": 4170 + }, + { + "epoch": 1.52, + "grad_norm": 19.8252963506834, + "learning_rate": 2.5084136692336926e-08, + "logits/chosen": -1.3749628067016602, + "logits/rejected": -1.4281375408172607, + "logps/chosen": -70.6727066040039, + "logps/rejected": -277.53790283203125, + "loss": 0.1486, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1151177883148193, + "rewards/margins": 4.473038196563721, + "rewards/rejected": -3.3579201698303223, + "step": 4180 + }, + { + "epoch": 1.52, + "grad_norm": 22.788082275545953, + "learning_rate": 2.4730519973256725e-08, + "logits/chosen": -1.382116436958313, + "logits/rejected": -1.2883073091506958, + "logps/chosen": -77.42723083496094, + "logps/rejected": -168.14895629882812, + "loss": 0.1365, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9808204770088196, + "rewards/margins": 4.480748176574707, + "rewards/rejected": -3.499927520751953, + "step": 4190 + }, + { + "epoch": 1.52, + "grad_norm": 17.55083100932304, + "learning_rate": 2.4378920758477996e-08, + "logits/chosen": -2.1403841972351074, + "logits/rejected": -1.4080195426940918, + "logps/chosen": -69.2994384765625, + "logps/rejected": -133.26760864257812, + "loss": 0.1498, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.458009958267212, + "rewards/margins": 3.704465389251709, + "rewards/rejected": -2.246455669403076, + "step": 4200 + }, + { + "epoch": 1.52, + "eval_logits/chosen": -2.486311435699463, + "eval_logits/rejected": -2.3020877838134766, + "eval_logps/chosen": -91.95867919921875, + "eval_logps/rejected": -154.05789184570312, + "eval_loss": 0.2579483687877655, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7997574210166931, + "eval_rewards/margins": 3.6684017181396484, + "eval_rewards/rejected": -2.8686444759368896, + "eval_runtime": 71.505, + "eval_samples_per_second": 12.447, + "eval_steps_per_second": 0.196, + "step": 4200 + }, + { + "epoch": 1.53, + "grad_norm": 35.97397604275955, + "learning_rate": 2.4029353159006606e-08, + "logits/chosen": -1.889622688293457, + "logits/rejected": -1.7178815603256226, + "logps/chosen": -70.93376159667969, + "logps/rejected": -144.45614624023438, + "loss": 0.1486, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.563561201095581, + "rewards/margins": 4.030331611633301, + "rewards/rejected": -2.466770648956299, + "step": 4210 + }, + { + "epoch": 1.53, + "grad_norm": 25.472759515203542, + "learning_rate": 2.368183120431205e-08, + "logits/chosen": -1.6090589761734009, + "logits/rejected": -1.5134804248809814, + "logps/chosen": -93.47647857666016, + "logps/rejected": -154.41531372070312, + "loss": 0.1534, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2347394227981567, + "rewards/margins": 4.24044942855835, + "rewards/rejected": -3.0057103633880615, + "step": 4220 + }, + { + "epoch": 1.54, + "grad_norm": 26.748913628743217, + "learning_rate": 2.3336368841764356e-08, + "logits/chosen": -1.6902498006820679, + "logits/rejected": -1.2772510051727295, + "logps/chosen": -72.32806396484375, + "logps/rejected": -127.91917419433594, + "loss": 0.105, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5376206636428833, + "rewards/margins": 3.2143516540527344, + "rewards/rejected": -1.676730751991272, + "step": 4230 + }, + { + "epoch": 1.54, + "grad_norm": 38.70782633546678, + "learning_rate": 2.2992979936074264e-08, + "logits/chosen": -2.003537654876709, + "logits/rejected": -1.539896011352539, + "logps/chosen": -107.56465148925781, + "logps/rejected": -155.28176879882812, + "loss": 0.1745, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1425919532775879, + "rewards/margins": 2.318450927734375, + "rewards/rejected": -2.461042881011963, + "step": 4240 + }, + { + "epoch": 1.54, + "grad_norm": 29.839188839127143, + "learning_rate": 2.2651678268736942e-08, + "logits/chosen": -1.5676755905151367, + "logits/rejected": -1.5963351726531982, + "logps/chosen": -117.62342834472656, + "logps/rejected": -148.70346069335938, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9985088109970093, + "rewards/margins": 3.8390114307403564, + "rewards/rejected": -2.840503215789795, + "step": 4250 + }, + { + "epoch": 1.55, + "grad_norm": 19.326703363193936, + "learning_rate": 2.2312477537478763e-08, + "logits/chosen": -2.033979892730713, + "logits/rejected": -1.5905263423919678, + "logps/chosen": -78.28575134277344, + "logps/rejected": -167.78639221191406, + "loss": 0.1449, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2223570346832275, + "rewards/margins": 4.399596214294434, + "rewards/rejected": -3.1772396564483643, + "step": 4260 + }, + { + "epoch": 1.55, + "grad_norm": 34.56110655069492, + "learning_rate": 2.1975391355707567e-08, + "logits/chosen": -2.090172529220581, + "logits/rejected": -1.7063558101654053, + "logps/chosen": -69.6578369140625, + "logps/rejected": -146.59194946289062, + "loss": 0.1482, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4781410694122314, + "rewards/margins": 4.314321994781494, + "rewards/rejected": -2.836181163787842, + "step": 4270 + }, + { + "epoch": 1.55, + "grad_norm": 37.15442072437241, + "learning_rate": 2.164043325196635e-08, + "logits/chosen": -1.6971238851547241, + "logits/rejected": -1.8592636585235596, + "logps/chosen": -82.22840881347656, + "logps/rejected": -163.99058532714844, + "loss": 0.134, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3080239295959473, + "rewards/margins": 3.911048412322998, + "rewards/rejected": -2.60302472114563, + "step": 4280 + }, + { + "epoch": 1.56, + "grad_norm": 22.652340412985147, + "learning_rate": 2.13076166693903e-08, + "logits/chosen": -2.017364501953125, + "logits/rejected": -1.9596898555755615, + "logps/chosen": -100.14933776855469, + "logps/rejected": -186.43014526367188, + "loss": 0.1298, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3219308853149414, + "rewards/margins": 4.975711822509766, + "rewards/rejected": -3.653780698776245, + "step": 4290 + }, + { + "epoch": 1.56, + "grad_norm": 26.988430640272753, + "learning_rate": 2.0976954965167228e-08, + "logits/chosen": -2.23913311958313, + "logits/rejected": -2.0381526947021484, + "logps/chosen": -100.4576644897461, + "logps/rejected": -205.36434936523438, + "loss": 0.174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9682433009147644, + "rewards/margins": 4.4989728927612305, + "rewards/rejected": -3.5307304859161377, + "step": 4300 + }, + { + "epoch": 1.56, + "eval_logits/chosen": -2.39996600151062, + "eval_logits/rejected": -2.239955186843872, + "eval_logps/chosen": -92.15650939941406, + "eval_logps/rejected": -155.64747619628906, + "eval_loss": 0.26092293858528137, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7799752354621887, + "eval_rewards/margins": 3.8075778484344482, + "eval_rewards/rejected": -3.0276029109954834, + "eval_runtime": 71.3862, + "eval_samples_per_second": 12.467, + "eval_steps_per_second": 0.196, + "step": 4300 + }, + { + "epoch": 1.56, + "grad_norm": 20.13961405134126, + "learning_rate": 2.064846141000156e-08, + "logits/chosen": -1.785154104232788, + "logits/rejected": -1.782621145248413, + "logps/chosen": -100.51982116699219, + "logps/rejected": -170.2032012939453, + "loss": 0.1628, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1105843782424927, + "rewards/margins": 4.3706769943237305, + "rewards/rejected": -3.260092258453369, + "step": 4310 + }, + { + "epoch": 1.57, + "grad_norm": 26.57133171694419, + "learning_rate": 2.0322149187581696e-08, + "logits/chosen": -2.1262426376342773, + "logits/rejected": -1.7420053482055664, + "logps/chosen": -85.6638412475586, + "logps/rejected": -192.35430908203125, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9371218681335449, + "rewards/margins": 5.653896331787109, + "rewards/rejected": -4.7167744636535645, + "step": 4320 + }, + { + "epoch": 1.57, + "grad_norm": 30.486425514760217, + "learning_rate": 1.9998031394050925e-08, + "logits/chosen": -2.0862393379211426, + "logits/rejected": -1.73501718044281, + "logps/chosen": -110.37203216552734, + "logps/rejected": -184.22003173828125, + "loss": 0.1412, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.701205849647522, + "rewards/margins": 4.286178112030029, + "rewards/rejected": -3.584972381591797, + "step": 4330 + }, + { + "epoch": 1.58, + "grad_norm": 29.613748447060853, + "learning_rate": 1.9676121037481733e-08, + "logits/chosen": -1.8443056344985962, + "logits/rejected": -1.37973153591156, + "logps/chosen": -87.62504577636719, + "logps/rejected": -148.2349090576172, + "loss": 0.1641, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0517264604568481, + "rewards/margins": 3.312018632888794, + "rewards/rejected": -2.2602920532226562, + "step": 4340 + }, + { + "epoch": 1.58, + "grad_norm": 38.941503925465945, + "learning_rate": 1.935643103735389e-08, + "logits/chosen": -2.3647444248199463, + "logits/rejected": -1.6877624988555908, + "logps/chosen": -78.06304168701172, + "logps/rejected": -170.48617553710938, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1019824743270874, + "rewards/margins": 4.5098443031311035, + "rewards/rejected": -3.4078621864318848, + "step": 4350 + }, + { + "epoch": 1.58, + "grad_norm": 50.92032810215081, + "learning_rate": 1.9038974224035848e-08, + "logits/chosen": -1.945586919784546, + "logits/rejected": -1.535428762435913, + "logps/chosen": -93.87750244140625, + "logps/rejected": -185.80007934570312, + "loss": 0.1745, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.7316128611564636, + "rewards/margins": 3.479790210723877, + "rewards/rejected": -2.7481772899627686, + "step": 4360 + }, + { + "epoch": 1.59, + "grad_norm": 33.56498772715042, + "learning_rate": 1.8723763338269824e-08, + "logits/chosen": -2.1573054790496826, + "logits/rejected": -1.6914573907852173, + "logps/chosen": -97.86043548583984, + "logps/rejected": -163.97647094726562, + "loss": 0.1815, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.35831624269485474, + "rewards/margins": 3.7290472984313965, + "rewards/rejected": -3.3707308769226074, + "step": 4370 + }, + { + "epoch": 1.59, + "grad_norm": 26.65376761598817, + "learning_rate": 1.8410811030660466e-08, + "logits/chosen": -1.8529369831085205, + "logits/rejected": -1.8225460052490234, + "logps/chosen": -126.21983337402344, + "logps/rejected": -171.356689453125, + "loss": 0.1578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.27563539147377014, + "rewards/margins": 3.433450222015381, + "rewards/rejected": -3.1578147411346436, + "step": 4380 + }, + { + "epoch": 1.59, + "grad_norm": 41.658972724370116, + "learning_rate": 1.810012986116715e-08, + "logits/chosen": -2.1115715503692627, + "logits/rejected": -1.5005934238433838, + "logps/chosen": -78.10107421875, + "logps/rejected": -153.611572265625, + "loss": 0.1355, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1123895645141602, + "rewards/margins": 4.308642387390137, + "rewards/rejected": -3.1962530612945557, + "step": 4390 + }, + { + "epoch": 1.6, + "grad_norm": 23.104818947703635, + "learning_rate": 1.7791732298599888e-08, + "logits/chosen": -1.8663402795791626, + "logits/rejected": -1.7844841480255127, + "logps/chosen": -83.04045104980469, + "logps/rejected": -153.29029846191406, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5516669750213623, + "rewards/margins": 4.48703670501709, + "rewards/rejected": -2.9353702068328857, + "step": 4400 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -2.4757511615753174, + "eval_logits/rejected": -2.2932116985321045, + "eval_logps/chosen": -92.66739654541016, + "eval_logps/rejected": -155.24755859375, + "eval_loss": 0.2576039731502533, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7288867235183716, + "eval_rewards/margins": 3.716498851776123, + "eval_rewards/rejected": -2.987612009048462, + "eval_runtime": 71.4688, + "eval_samples_per_second": 12.453, + "eval_steps_per_second": 0.196, + "step": 4400 + }, + { + "epoch": 1.6, + "grad_norm": 30.602138050700304, + "learning_rate": 1.7485630720118904e-08, + "logits/chosen": -1.7544384002685547, + "logits/rejected": -1.5731135606765747, + "logps/chosen": -95.06303405761719, + "logps/rejected": -155.60504150390625, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6706847548484802, + "rewards/margins": 3.354102373123169, + "rewards/rejected": -2.683417320251465, + "step": 4410 + }, + { + "epoch": 1.6, + "grad_norm": 24.591287815914352, + "learning_rate": 1.7181837410737932e-08, + "logits/chosen": -1.6455243825912476, + "logits/rejected": -1.8114473819732666, + "logps/chosen": -91.33721923828125, + "logps/rejected": -158.2550811767578, + "loss": 0.1183, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6706674098968506, + "rewards/margins": 3.6940701007843018, + "rewards/rejected": -2.023402690887451, + "step": 4420 + }, + { + "epoch": 1.61, + "grad_norm": 28.285812502384303, + "learning_rate": 1.688036456283108e-08, + "logits/chosen": -1.4960906505584717, + "logits/rejected": -1.562246322631836, + "logps/chosen": -92.74504089355469, + "logps/rejected": -129.95484924316406, + "loss": 0.1344, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7566136121749878, + "rewards/margins": 3.488935947418213, + "rewards/rejected": -2.7323222160339355, + "step": 4430 + }, + { + "epoch": 1.61, + "grad_norm": 21.99612078068548, + "learning_rate": 1.6581224275643602e-08, + "logits/chosen": -2.4308037757873535, + "logits/rejected": -2.031317710876465, + "logps/chosen": -81.02774810791016, + "logps/rejected": -149.41329956054688, + "loss": 0.124, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0930092334747314, + "rewards/margins": 3.704552173614502, + "rewards/rejected": -2.6115427017211914, + "step": 4440 + }, + { + "epoch": 1.62, + "grad_norm": 37.76234741898126, + "learning_rate": 1.6284428554806282e-08, + "logits/chosen": -1.8159290552139282, + "logits/rejected": -1.7262006998062134, + "logps/chosen": -84.27841186523438, + "logps/rejected": -133.88829040527344, + "loss": 0.1195, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7679897546768188, + "rewards/margins": 3.5734684467315674, + "rewards/rejected": -2.805478572845459, + "step": 4450 + }, + { + "epoch": 1.62, + "grad_norm": 20.751714992592504, + "learning_rate": 1.598998931185358e-08, + "logits/chosen": -1.6911875009536743, + "logits/rejected": -1.4998903274536133, + "logps/chosen": -85.60092163085938, + "logps/rejected": -157.67355346679688, + "loss": 0.1644, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.2675038576126099, + "rewards/margins": 3.786600112915039, + "rewards/rejected": -2.5190961360931396, + "step": 4460 + }, + { + "epoch": 1.62, + "grad_norm": 31.961939419295213, + "learning_rate": 1.5697918363745567e-08, + "logits/chosen": -1.8280389308929443, + "logits/rejected": -1.5867749452590942, + "logps/chosen": -108.87247467041016, + "logps/rejected": -214.6339111328125, + "loss": 0.1564, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9745191335678101, + "rewards/margins": 5.607847213745117, + "rewards/rejected": -4.633328437805176, + "step": 4470 + }, + { + "epoch": 1.63, + "grad_norm": 38.17110938660031, + "learning_rate": 1.5408227432393714e-08, + "logits/chosen": -2.4660305976867676, + "logits/rejected": -1.5764108896255493, + "logps/chosen": -89.4808120727539, + "logps/rejected": -156.589111328125, + "loss": 0.1159, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1624079942703247, + "rewards/margins": 3.693568706512451, + "rewards/rejected": -2.531160593032837, + "step": 4480 + }, + { + "epoch": 1.63, + "grad_norm": 36.10544360692532, + "learning_rate": 1.5120928144190412e-08, + "logits/chosen": -1.9956271648406982, + "logits/rejected": -1.7658560276031494, + "logps/chosen": -117.41062927246094, + "logps/rejected": -145.37120056152344, + "loss": 0.1571, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.006312882993370295, + "rewards/margins": 2.9415173530578613, + "rewards/rejected": -2.93520450592041, + "step": 4490 + }, + { + "epoch": 1.63, + "grad_norm": 27.729025187636683, + "learning_rate": 1.483603202954238e-08, + "logits/chosen": -1.9484901428222656, + "logits/rejected": -1.8669170141220093, + "logps/chosen": -92.9903335571289, + "logps/rejected": -148.825927734375, + "loss": 0.1424, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7940378189086914, + "rewards/margins": 3.46022367477417, + "rewards/rejected": -2.6661858558654785, + "step": 4500 + }, + { + "epoch": 1.63, + "eval_logits/chosen": -2.4859211444854736, + "eval_logits/rejected": -2.302330732345581, + "eval_logps/chosen": -92.06935119628906, + "eval_logps/rejected": -154.83358764648438, + "eval_loss": 0.2585107982158661, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7886915802955627, + "eval_rewards/margins": 3.734905242919922, + "eval_rewards/rejected": -2.9462132453918457, + "eval_runtime": 71.5035, + "eval_samples_per_second": 12.447, + "eval_steps_per_second": 0.196, + "step": 4500 + }, + { + "epoch": 1.64, + "grad_norm": 20.03094242415938, + "learning_rate": 1.4553550522407868e-08, + "logits/chosen": -1.8428974151611328, + "logits/rejected": -1.440744161605835, + "logps/chosen": -84.1056137084961, + "logps/rejected": -152.34756469726562, + "loss": 0.1426, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6048812866210938, + "rewards/margins": 4.184465408325195, + "rewards/rejected": -2.5795845985412598, + "step": 4510 + }, + { + "epoch": 1.64, + "grad_norm": 34.20593083265495, + "learning_rate": 1.4273494959837854e-08, + "logits/chosen": -1.979557752609253, + "logits/rejected": -1.8085553646087646, + "logps/chosen": -87.48872375488281, + "logps/rejected": -144.94422912597656, + "loss": 0.1596, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7250168323516846, + "rewards/margins": 3.676483154296875, + "rewards/rejected": -1.9514662027359009, + "step": 4520 + }, + { + "epoch": 1.64, + "grad_norm": 15.481458090321123, + "learning_rate": 1.3995876581520893e-08, + "logits/chosen": -1.636747121810913, + "logits/rejected": -1.7493568658828735, + "logps/chosen": -104.45011901855469, + "logps/rejected": -178.22219848632812, + "loss": 0.1425, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.7096387147903442, + "rewards/margins": 4.242130756378174, + "rewards/rejected": -3.5324923992156982, + "step": 4530 + }, + { + "epoch": 1.65, + "grad_norm": 26.973245248163447, + "learning_rate": 1.3720706529332202e-08, + "logits/chosen": -2.262129545211792, + "logits/rejected": -1.6040267944335938, + "logps/chosen": -77.4686279296875, + "logps/rejected": -172.17333984375, + "loss": 0.1379, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.497057318687439, + "rewards/margins": 4.911464691162109, + "rewards/rejected": -3.414407253265381, + "step": 4540 + }, + { + "epoch": 1.65, + "grad_norm": 24.948661161132325, + "learning_rate": 1.3447995846886393e-08, + "logits/chosen": -1.9659942388534546, + "logits/rejected": -1.7948821783065796, + "logps/chosen": -74.55500793457031, + "logps/rejected": -173.8816680908203, + "loss": 0.149, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5743433237075806, + "rewards/margins": 5.000136375427246, + "rewards/rejected": -3.425793409347534, + "step": 4550 + }, + { + "epoch": 1.66, + "grad_norm": 27.776188946775548, + "learning_rate": 1.317775547909426e-08, + "logits/chosen": -1.6119670867919922, + "logits/rejected": -1.385824203491211, + "logps/chosen": -96.07972717285156, + "logps/rejected": -175.02867126464844, + "loss": 0.1963, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7563056945800781, + "rewards/margins": 3.682525634765625, + "rewards/rejected": -2.926219940185547, + "step": 4560 + }, + { + "epoch": 1.66, + "grad_norm": 43.370687625677974, + "learning_rate": 1.2909996271723539e-08, + "logits/chosen": -2.330936908721924, + "logits/rejected": -1.7249195575714111, + "logps/chosen": -81.22795104980469, + "logps/rejected": -180.45004272460938, + "loss": 0.1118, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1179533004760742, + "rewards/margins": 4.484683990478516, + "rewards/rejected": -3.3667304515838623, + "step": 4570 + }, + { + "epoch": 1.66, + "grad_norm": 52.192177435639636, + "learning_rate": 1.2644728970963616e-08, + "logits/chosen": -2.1745333671569824, + "logits/rejected": -1.5888748168945312, + "logps/chosen": -76.57296752929688, + "logps/rejected": -189.40585327148438, + "loss": 0.1714, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.506654143333435, + "rewards/margins": 5.129171371459961, + "rewards/rejected": -3.6225173473358154, + "step": 4580 + }, + { + "epoch": 1.67, + "grad_norm": 15.461194155069908, + "learning_rate": 1.2381964222994248e-08, + "logits/chosen": -1.6822162866592407, + "logits/rejected": -1.4808707237243652, + "logps/chosen": -88.69744110107422, + "logps/rejected": -152.0595245361328, + "loss": 0.1573, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7915393710136414, + "rewards/margins": 4.33894157409668, + "rewards/rejected": -3.5474014282226562, + "step": 4590 + }, + { + "epoch": 1.67, + "grad_norm": 30.84506660543241, + "learning_rate": 1.2121712573558262e-08, + "logits/chosen": -1.8852580785751343, + "logits/rejected": -1.933457374572754, + "logps/chosen": -88.162841796875, + "logps/rejected": -145.27273559570312, + "loss": 0.1531, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.409010410308838, + "rewards/margins": 4.220501899719238, + "rewards/rejected": -2.8114914894104004, + "step": 4600 + }, + { + "epoch": 1.67, + "eval_logits/chosen": -2.4970266819000244, + "eval_logits/rejected": -2.308969020843506, + "eval_logps/chosen": -92.29559326171875, + "eval_logps/rejected": -154.68136596679688, + "eval_loss": 0.257046639919281, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7660664319992065, + "eval_rewards/margins": 3.6970560550689697, + "eval_rewards/rejected": -2.930989980697632, + "eval_runtime": 71.5124, + "eval_samples_per_second": 12.445, + "eval_steps_per_second": 0.196, + "step": 4600 + }, + { + "epoch": 1.67, + "grad_norm": 32.065023239868225, + "learning_rate": 1.1863984467538368e-08, + "logits/chosen": -1.5617568492889404, + "logits/rejected": -1.5484387874603271, + "logps/chosen": -84.5037612915039, + "logps/rejected": -140.86459350585938, + "loss": 0.1778, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0827562808990479, + "rewards/margins": 4.16703987121582, + "rewards/rejected": -3.0842833518981934, + "step": 4610 + }, + { + "epoch": 1.68, + "grad_norm": 40.53992535432375, + "learning_rate": 1.1608790248537947e-08, + "logits/chosen": -1.849543809890747, + "logits/rejected": -1.3818111419677734, + "logps/chosen": -98.61241912841797, + "logps/rejected": -204.64102172851562, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39084136486053467, + "rewards/margins": 4.119144916534424, + "rewards/rejected": -3.7283034324645996, + "step": 4620 + }, + { + "epoch": 1.68, + "grad_norm": 44.60677557796461, + "learning_rate": 1.1356140158465846e-08, + "logits/chosen": -1.6492226123809814, + "logits/rejected": -1.5845674276351929, + "logps/chosen": -99.52052307128906, + "logps/rejected": -152.729248046875, + "loss": 0.152, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.465355783700943, + "rewards/margins": 3.311537981033325, + "rewards/rejected": -2.846181631088257, + "step": 4630 + }, + { + "epoch": 1.68, + "grad_norm": 38.975584748722106, + "learning_rate": 1.1106044337125478e-08, + "logits/chosen": -1.9782928228378296, + "logits/rejected": -1.9134845733642578, + "logps/chosen": -93.96483612060547, + "logps/rejected": -147.70205688476562, + "loss": 0.1498, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3415434956550598, + "rewards/margins": 3.8670287132263184, + "rewards/rejected": -3.5254852771759033, + "step": 4640 + }, + { + "epoch": 1.69, + "grad_norm": 30.98909384785237, + "learning_rate": 1.0858512821807742e-08, + "logits/chosen": -1.6544148921966553, + "logits/rejected": -1.434407114982605, + "logps/chosen": -97.20882415771484, + "logps/rejected": -143.4949188232422, + "loss": 0.184, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9800602793693542, + "rewards/margins": 2.9634454250335693, + "rewards/rejected": -1.9833850860595703, + "step": 4650 + }, + { + "epoch": 1.69, + "grad_norm": 37.51557850755165, + "learning_rate": 1.0613555546888275e-08, + "logits/chosen": -1.5016518831253052, + "logits/rejected": -1.68484628200531, + "logps/chosen": -88.76266479492188, + "logps/rejected": -165.8483123779297, + "loss": 0.1375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8447409868240356, + "rewards/margins": 4.642016887664795, + "rewards/rejected": -3.797276020050049, + "step": 4660 + }, + { + "epoch": 1.7, + "grad_norm": 38.74297206743728, + "learning_rate": 1.0371182343428694e-08, + "logits/chosen": -1.381477952003479, + "logits/rejected": -2.027268171310425, + "logps/chosen": -98.62785339355469, + "logps/rejected": -132.6935577392578, + "loss": 0.1417, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7803923487663269, + "rewards/margins": 3.797145366668701, + "rewards/rejected": -3.0167534351348877, + "step": 4670 + }, + { + "epoch": 1.7, + "grad_norm": 37.59159294477782, + "learning_rate": 1.0131402938782063e-08, + "logits/chosen": -1.829708456993103, + "logits/rejected": -1.5681079626083374, + "logps/chosen": -89.08270263671875, + "logps/rejected": -157.6560821533203, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0118838548660278, + "rewards/margins": 4.21151065826416, + "rewards/rejected": -3.19962739944458, + "step": 4680 + }, + { + "epoch": 1.7, + "grad_norm": 18.933900384534923, + "learning_rate": 9.894226956202484e-09, + "logits/chosen": -1.6854585409164429, + "logits/rejected": -1.6041696071624756, + "logps/chosen": -87.95982360839844, + "logps/rejected": -164.80075073242188, + "loss": 0.1541, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1225395202636719, + "rewards/margins": 4.531619071960449, + "rewards/rejected": -3.4090800285339355, + "step": 4690 + }, + { + "epoch": 1.71, + "grad_norm": 25.177344401658505, + "learning_rate": 9.659663914458913e-09, + "logits/chosen": -1.7834850549697876, + "logits/rejected": -1.7646923065185547, + "logps/chosen": -72.14763641357422, + "logps/rejected": -126.3357162475586, + "loss": 0.1295, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4011167287826538, + "rewards/margins": 3.857720136642456, + "rewards/rejected": -2.4566032886505127, + "step": 4700 + }, + { + "epoch": 1.71, + "eval_logits/chosen": -2.4951701164245605, + "eval_logits/rejected": -2.3085877895355225, + "eval_logps/chosen": -93.2258071899414, + "eval_logps/rejected": -155.58546447753906, + "eval_loss": 0.25638025999069214, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.6730464100837708, + "eval_rewards/margins": 3.6944470405578613, + "eval_rewards/rejected": -3.0214006900787354, + "eval_runtime": 71.5377, + "eval_samples_per_second": 12.441, + "eval_steps_per_second": 0.196, + "step": 4700 + }, + { + "epoch": 1.71, + "grad_norm": 36.30094639660896, + "learning_rate": 9.427723227453092e-09, + "logits/chosen": -2.1383352279663086, + "logits/rejected": -1.7759078741073608, + "logps/chosen": -97.42940521240234, + "logps/rejected": -208.8566131591797, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45860522985458374, + "rewards/margins": 4.734647274017334, + "rewards/rejected": -4.276042461395264, + "step": 4710 + }, + { + "epoch": 1.71, + "grad_norm": 46.07295378551623, + "learning_rate": 9.198414203841732e-09, + "logits/chosen": -2.0975241661071777, + "logits/rejected": -1.4306576251983643, + "logps/chosen": -75.3366928100586, + "logps/rejected": -164.50413513183594, + "loss": 0.148, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5581697225570679, + "rewards/margins": 5.427392482757568, + "rewards/rejected": -3.869223117828369, + "step": 4720 + }, + { + "epoch": 1.72, + "grad_norm": 42.391432305833874, + "learning_rate": 8.971746046662982e-09, + "logits/chosen": -2.13923978805542, + "logits/rejected": -1.8475595712661743, + "logps/chosen": -85.93409729003906, + "logps/rejected": -238.55447387695312, + "loss": 0.1795, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9917472004890442, + "rewards/margins": 4.265665054321289, + "rewards/rejected": -3.2739181518554688, + "step": 4730 + }, + { + "epoch": 1.72, + "grad_norm": 41.9386180805472, + "learning_rate": 8.747727852967013e-09, + "logits/chosen": -1.9976091384887695, + "logits/rejected": -1.6085163354873657, + "logps/chosen": -85.17452239990234, + "logps/rejected": -167.8629608154297, + "loss": 0.1809, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7708709239959717, + "rewards/margins": 4.317328929901123, + "rewards/rejected": -3.5464580059051514, + "step": 4740 + }, + { + "epoch": 1.72, + "grad_norm": 27.55797286092099, + "learning_rate": 8.526368613450938e-09, + "logits/chosen": -1.8709558248519897, + "logits/rejected": -1.3762298822402954, + "logps/chosen": -77.7394790649414, + "logps/rejected": -249.26089477539062, + "loss": 0.137, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0649783611297607, + "rewards/margins": 5.4183220863342285, + "rewards/rejected": -4.353343963623047, + "step": 4750 + }, + { + "epoch": 1.73, + "grad_norm": 37.03238003499655, + "learning_rate": 8.307677212098013e-09, + "logits/chosen": -2.055666446685791, + "logits/rejected": -1.7216275930404663, + "logps/chosen": -78.11017608642578, + "logps/rejected": -145.96435546875, + "loss": 0.1323, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.7488200664520264, + "rewards/margins": 4.145888328552246, + "rewards/rejected": -2.3970682621002197, + "step": 4760 + }, + { + "epoch": 1.73, + "grad_norm": 14.394014565941422, + "learning_rate": 8.091662425821027e-09, + "logits/chosen": -2.1242330074310303, + "logits/rejected": -1.989148736000061, + "logps/chosen": -96.9617691040039, + "logps/rejected": -156.48365783691406, + "loss": 0.146, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1661460399627686, + "rewards/margins": 4.207326889038086, + "rewards/rejected": -3.0411806106567383, + "step": 4770 + }, + { + "epoch": 1.74, + "grad_norm": 45.068535611865215, + "learning_rate": 7.878332924110114e-09, + "logits/chosen": -1.6985795497894287, + "logits/rejected": -1.4704097509384155, + "logps/chosen": -105.20362854003906, + "logps/rejected": -165.98757934570312, + "loss": 0.1946, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.41264334321022034, + "rewards/margins": 3.4486331939697266, + "rewards/rejected": -3.035989761352539, + "step": 4780 + }, + { + "epoch": 1.74, + "grad_norm": 32.28722287206916, + "learning_rate": 7.66769726868476e-09, + "logits/chosen": -2.389949321746826, + "logits/rejected": -1.8705673217773438, + "logps/chosen": -78.36787414550781, + "logps/rejected": -172.9830780029297, + "loss": 0.1543, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4516584873199463, + "rewards/margins": 4.765595436096191, + "rewards/rejected": -3.313936948776245, + "step": 4790 + }, + { + "epoch": 1.74, + "grad_norm": 27.305664645978823, + "learning_rate": 7.459763913150232e-09, + "logits/chosen": -1.8216984272003174, + "logits/rejected": -1.5844495296478271, + "logps/chosen": -94.18133544921875, + "logps/rejected": -148.17559814453125, + "loss": 0.1277, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8979905843734741, + "rewards/margins": 4.38170051574707, + "rewards/rejected": -3.483710527420044, + "step": 4800 + }, + { + "epoch": 1.74, + "eval_logits/chosen": -2.5054409503936768, + "eval_logits/rejected": -2.3161699771881104, + "eval_logps/chosen": -93.1070327758789, + "eval_logps/rejected": -155.18020629882812, + "eval_loss": 0.2575148642063141, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.6849234700202942, + "eval_rewards/margins": 3.665799856185913, + "eval_rewards/rejected": -2.9808764457702637, + "eval_runtime": 71.4935, + "eval_samples_per_second": 12.449, + "eval_steps_per_second": 0.196, + "step": 4800 + }, + { + "epoch": 1.75, + "grad_norm": 33.21624232483828, + "learning_rate": 7.254541202658298e-09, + "logits/chosen": -1.866506576538086, + "logits/rejected": -1.7012319564819336, + "logps/chosen": -72.63794708251953, + "logps/rejected": -140.51022338867188, + "loss": 0.1671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9564884305000305, + "rewards/margins": 3.6187520027160645, + "rewards/rejected": -2.6622631549835205, + "step": 4810 + }, + { + "epoch": 1.75, + "grad_norm": 33.30055773008314, + "learning_rate": 7.052037373572247e-09, + "logits/chosen": -2.050233840942383, + "logits/rejected": -1.8784773349761963, + "logps/chosen": -76.64759826660156, + "logps/rejected": -121.65924072265625, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1043304204940796, + "rewards/margins": 3.284348249435425, + "rewards/rejected": -2.1800179481506348, + "step": 4820 + }, + { + "epoch": 1.75, + "grad_norm": 30.226579991502298, + "learning_rate": 6.8522605531363995e-09, + "logits/chosen": -2.1329455375671387, + "logits/rejected": -1.5537135601043701, + "logps/chosen": -81.99913787841797, + "logps/rejected": -166.19219970703125, + "loss": 0.1238, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3520516157150269, + "rewards/margins": 4.49784517288208, + "rewards/rejected": -3.1457934379577637, + "step": 4830 + }, + { + "epoch": 1.76, + "grad_norm": 20.76586446781195, + "learning_rate": 6.655218759149936e-09, + "logits/chosen": -1.8627560138702393, + "logits/rejected": -1.4420944452285767, + "logps/chosen": -81.84294128417969, + "logps/rejected": -161.7127227783203, + "loss": 0.1383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9598122835159302, + "rewards/margins": 3.9863998889923096, + "rewards/rejected": -3.026587963104248, + "step": 4840 + }, + { + "epoch": 1.76, + "grad_norm": 39.39440151277521, + "learning_rate": 6.460919899645045e-09, + "logits/chosen": -1.4853651523590088, + "logits/rejected": -1.0775649547576904, + "logps/chosen": -79.50499725341797, + "logps/rejected": -160.72036743164062, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1241073608398438, + "rewards/margins": 4.072818756103516, + "rewards/rejected": -2.948711633682251, + "step": 4850 + }, + { + "epoch": 1.76, + "grad_norm": 28.117092220468667, + "learning_rate": 6.2693717725696064e-09, + "logits/chosen": -2.3298912048339844, + "logits/rejected": -1.7818387746810913, + "logps/chosen": -81.07538604736328, + "logps/rejected": -149.2225341796875, + "loss": 0.1653, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2655856609344482, + "rewards/margins": 4.464354038238525, + "rewards/rejected": -3.198768138885498, + "step": 4860 + }, + { + "epoch": 1.77, + "grad_norm": 27.103505885216574, + "learning_rate": 6.080582065474191e-09, + "logits/chosen": -1.792157769203186, + "logits/rejected": -1.6782718896865845, + "logps/chosen": -82.19720458984375, + "logps/rejected": -142.2840118408203, + "loss": 0.181, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.406231164932251, + "rewards/margins": 3.7717814445495605, + "rewards/rejected": -2.365550994873047, + "step": 4870 + }, + { + "epoch": 1.77, + "grad_norm": 30.313103414178183, + "learning_rate": 5.8945583552035664e-09, + "logits/chosen": -2.15095853805542, + "logits/rejected": -1.6400353908538818, + "logps/chosen": -76.02684783935547, + "logps/rejected": -157.7095947265625, + "loss": 0.1545, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7171390056610107, + "rewards/margins": 4.5480499267578125, + "rewards/rejected": -2.8309109210968018, + "step": 4880 + }, + { + "epoch": 1.77, + "grad_norm": 21.326128946676743, + "learning_rate": 5.711308107592575e-09, + "logits/chosen": -2.1534788608551025, + "logits/rejected": -1.7716478109359741, + "logps/chosen": -78.25006103515625, + "logps/rejected": -139.49354553222656, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0134289264678955, + "rewards/margins": 3.7863471508026123, + "rewards/rejected": -2.7729179859161377, + "step": 4890 + }, + { + "epoch": 1.78, + "grad_norm": 23.675706025969955, + "learning_rate": 5.530838677166514e-09, + "logits/chosen": -2.062622308731079, + "logits/rejected": -1.7914386987686157, + "logps/chosen": -71.51246643066406, + "logps/rejected": -151.47512817382812, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6398112773895264, + "rewards/margins": 4.577850341796875, + "rewards/rejected": -2.9380390644073486, + "step": 4900 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -2.498281478881836, + "eval_logits/rejected": -2.3098886013031006, + "eval_logps/chosen": -92.86518859863281, + "eval_logps/rejected": -155.0869140625, + "eval_loss": 0.25684645771980286, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7091068625450134, + "eval_rewards/margins": 3.680652618408203, + "eval_rewards/rejected": -2.971545934677124, + "eval_runtime": 71.3383, + "eval_samples_per_second": 12.476, + "eval_steps_per_second": 0.196, + "step": 4900 + }, + { + "epoch": 1.78, + "grad_norm": 38.69506032631391, + "learning_rate": 5.353157306845987e-09, + "logits/chosen": -1.927911400794983, + "logits/rejected": -1.6266618967056274, + "logps/chosen": -102.5804443359375, + "logps/rejected": -156.56808471679688, + "loss": 0.1485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6019805669784546, + "rewards/margins": 3.804718017578125, + "rewards/rejected": -3.202737331390381, + "step": 4910 + }, + { + "epoch": 1.79, + "grad_norm": 13.185460602842793, + "learning_rate": 5.178271127656184e-09, + "logits/chosen": -1.6339209079742432, + "logits/rejected": -1.4754202365875244, + "logps/chosen": -77.33897399902344, + "logps/rejected": -137.8895721435547, + "loss": 0.1252, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3045456409454346, + "rewards/margins": 4.217949867248535, + "rewards/rejected": -2.9134037494659424, + "step": 4920 + }, + { + "epoch": 1.79, + "grad_norm": 19.230753940442828, + "learning_rate": 5.006187158440717e-09, + "logits/chosen": -1.7411714792251587, + "logits/rejected": -1.4912729263305664, + "logps/chosen": -79.78264617919922, + "logps/rejected": -136.29086303710938, + "loss": 0.1605, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1097663640975952, + "rewards/margins": 3.9273438453674316, + "rewards/rejected": -2.817577838897705, + "step": 4930 + }, + { + "epoch": 1.79, + "grad_norm": 46.07395788437603, + "learning_rate": 4.8369123055799295e-09, + "logits/chosen": -1.8338695764541626, + "logits/rejected": -1.4250398874282837, + "logps/chosen": -88.02108001708984, + "logps/rejected": -194.01132202148438, + "loss": 0.1362, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0556516647338867, + "rewards/margins": 5.144927024841309, + "rewards/rejected": -4.0892744064331055, + "step": 4940 + }, + { + "epoch": 1.8, + "grad_norm": 13.001369713867522, + "learning_rate": 4.67045336271368e-09, + "logits/chosen": -1.945007085800171, + "logits/rejected": -1.674750566482544, + "logps/chosen": -85.85972595214844, + "logps/rejected": -164.24606323242188, + "loss": 0.1525, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.063456654548645, + "rewards/margins": 4.070924758911133, + "rewards/rejected": -3.0074684619903564, + "step": 4950 + }, + { + "epoch": 1.8, + "grad_norm": 36.531934974949316, + "learning_rate": 4.506817010468731e-09, + "logits/chosen": -1.7169277667999268, + "logits/rejected": -1.929560899734497, + "logps/chosen": -106.58872985839844, + "logps/rejected": -153.10482788085938, + "loss": 0.1546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9283292889595032, + "rewards/margins": 3.2342593669891357, + "rewards/rejected": -2.3059301376342773, + "step": 4960 + }, + { + "epoch": 1.8, + "grad_norm": 21.121804389411686, + "learning_rate": 4.346009816190596e-09, + "logits/chosen": -1.7104358673095703, + "logits/rejected": -1.638135313987732, + "logps/chosen": -96.46279907226562, + "logps/rejected": -162.83309936523438, + "loss": 0.1416, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8155636787414551, + "rewards/margins": 4.107938766479492, + "rewards/rejected": -3.292375087738037, + "step": 4970 + }, + { + "epoch": 1.81, + "grad_norm": 25.432800607703953, + "learning_rate": 4.188038233680005e-09, + "logits/chosen": -2.3670763969421387, + "logits/rejected": -1.915074348449707, + "logps/chosen": -76.34284210205078, + "logps/rejected": -146.34146118164062, + "loss": 0.1577, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.011473536491394, + "rewards/margins": 3.7747600078582764, + "rewards/rejected": -2.763286590576172, + "step": 4980 + }, + { + "epoch": 1.81, + "grad_norm": 31.52925521872735, + "learning_rate": 4.032908602933835e-09, + "logits/chosen": -1.9845958948135376, + "logits/rejected": -1.6368509531021118, + "logps/chosen": -78.15166473388672, + "logps/rejected": -183.4367218017578, + "loss": 0.1362, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.296852946281433, + "rewards/margins": 5.180371284484863, + "rewards/rejected": -3.8835182189941406, + "step": 4990 + }, + { + "epoch": 1.81, + "grad_norm": 24.267438377007352, + "learning_rate": 3.880627149890725e-09, + "logits/chosen": -1.933571219444275, + "logits/rejected": -1.8695943355560303, + "logps/chosen": -86.22135925292969, + "logps/rejected": -128.43087768554688, + "loss": 0.1273, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3141534328460693, + "rewards/margins": 3.2799339294433594, + "rewards/rejected": -1.9657806158065796, + "step": 5000 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.4975826740264893, + "eval_logits/rejected": -2.3100476264953613, + "eval_logps/chosen": -92.64019775390625, + "eval_logps/rejected": -155.10391235351562, + "eval_loss": 0.25643154978752136, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7316060066223145, + "eval_rewards/margins": 3.70485258102417, + "eval_rewards/rejected": -2.9732463359832764, + "eval_runtime": 71.5276, + "eval_samples_per_second": 12.443, + "eval_steps_per_second": 0.196, + "step": 5000 + }, + { + "epoch": 1.82, + "grad_norm": 23.454697098867037, + "learning_rate": 3.731199986181161e-09, + "logits/chosen": -1.693359375, + "logits/rejected": -1.7281681299209595, + "logps/chosen": -86.59770202636719, + "logps/rejected": -154.07138061523438, + "loss": 0.1429, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8508288264274597, + "rewards/margins": 3.944756031036377, + "rewards/rejected": -3.0939269065856934, + "step": 5010 + }, + { + "epoch": 1.82, + "grad_norm": 21.363949418831528, + "learning_rate": 3.5846331088821848e-09, + "logits/chosen": -1.7737480401992798, + "logits/rejected": -1.9281337261199951, + "logps/chosen": -82.63226318359375, + "logps/rejected": -151.45726013183594, + "loss": 0.1787, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7113445401191711, + "rewards/margins": 3.4898841381073, + "rewards/rejected": -2.7785401344299316, + "step": 5020 + }, + { + "epoch": 1.83, + "grad_norm": 49.695952302649566, + "learning_rate": 3.440932400276758e-09, + "logits/chosen": -2.308803081512451, + "logits/rejected": -2.2135608196258545, + "logps/chosen": -83.8191909790039, + "logps/rejected": -141.08892822265625, + "loss": 0.1696, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9245914220809937, + "rewards/margins": 3.2115063667297363, + "rewards/rejected": -2.286914825439453, + "step": 5030 + }, + { + "epoch": 1.83, + "grad_norm": 40.81068471888173, + "learning_rate": 3.300103627617656e-09, + "logits/chosen": -1.7834829092025757, + "logits/rejected": -1.9567530155181885, + "logps/chosen": -70.00810241699219, + "logps/rejected": -122.92684173583984, + "loss": 0.1558, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0573456287384033, + "rewards/margins": 4.120889186859131, + "rewards/rejected": -3.0635437965393066, + "step": 5040 + }, + { + "epoch": 1.83, + "grad_norm": 21.938277111569853, + "learning_rate": 3.162152442895996e-09, + "logits/chosen": -1.793460488319397, + "logits/rejected": -1.8442881107330322, + "logps/chosen": -101.73194885253906, + "logps/rejected": -148.40234375, + "loss": 0.1295, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0204694271087646, + "rewards/margins": 3.785799503326416, + "rewards/rejected": -2.7653300762176514, + "step": 5050 + }, + { + "epoch": 1.84, + "grad_norm": 37.435753201875194, + "learning_rate": 3.0270843826143837e-09, + "logits/chosen": -1.8540923595428467, + "logits/rejected": -1.9534356594085693, + "logps/chosen": -107.13368225097656, + "logps/rejected": -207.95645141601562, + "loss": 0.1731, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3544046878814697, + "rewards/margins": 4.78790807723999, + "rewards/rejected": -3.4335033893585205, + "step": 5060 + }, + { + "epoch": 1.84, + "grad_norm": 19.208193674331728, + "learning_rate": 2.894904867564793e-09, + "logits/chosen": -1.6729189157485962, + "logits/rejected": -1.791587471961975, + "logps/chosen": -92.18028259277344, + "logps/rejected": -181.7095947265625, + "loss": 0.1094, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4680941104888916, + "rewards/margins": 5.646910190582275, + "rewards/rejected": -4.178815841674805, + "step": 5070 + }, + { + "epoch": 1.84, + "grad_norm": 29.97158214233071, + "learning_rate": 2.765619202610939e-09, + "logits/chosen": -1.8277852535247803, + "logits/rejected": -1.64214289188385, + "logps/chosen": -70.78485870361328, + "logps/rejected": -141.9372100830078, + "loss": 0.1758, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2488759756088257, + "rewards/margins": 3.880674362182617, + "rewards/rejected": -2.631798505783081, + "step": 5080 + }, + { + "epoch": 1.85, + "grad_norm": 17.74181700068005, + "learning_rate": 2.639232576475364e-09, + "logits/chosen": -1.552199125289917, + "logits/rejected": -1.4841490983963013, + "logps/chosen": -81.1534652709961, + "logps/rejected": -125.95042419433594, + "loss": 0.1427, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3184212446212769, + "rewards/margins": 3.3990256786346436, + "rewards/rejected": -2.080604314804077, + "step": 5090 + }, + { + "epoch": 1.85, + "grad_norm": 22.318813582652908, + "learning_rate": 2.5157500615312577e-09, + "logits/chosen": -1.8579572439193726, + "logits/rejected": -2.0184388160705566, + "logps/chosen": -89.17039489746094, + "logps/rejected": -150.26077270507812, + "loss": 0.1221, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.099916934967041, + "rewards/margins": 4.67440938949585, + "rewards/rejected": -3.5744922161102295, + "step": 5100 + }, + { + "epoch": 1.85, + "eval_logits/chosen": -2.479732036590576, + "eval_logits/rejected": -2.2966747283935547, + "eval_logps/chosen": -92.56282806396484, + "eval_logps/rejected": -155.0851593017578, + "eval_loss": 0.2575416564941406, + "eval_rewards/accuracies": 0.9107142686843872, + "eval_rewards/chosen": 0.7393438220024109, + "eval_rewards/margins": 3.7107155323028564, + "eval_rewards/rejected": -2.9713714122772217, + "eval_runtime": 71.474, + "eval_samples_per_second": 12.452, + "eval_steps_per_second": 0.196, + "step": 5100 + }, + { + "epoch": 1.85, + "grad_norm": 44.029689486297656, + "learning_rate": 2.395176613598815e-09, + "logits/chosen": -1.909277319908142, + "logits/rejected": -1.5398352146148682, + "logps/chosen": -100.70805358886719, + "logps/rejected": -156.3821563720703, + "loss": 0.1731, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.70123291015625, + "rewards/margins": 3.1760170459747314, + "rewards/rejected": -2.4747843742370605, + "step": 5110 + }, + { + "epoch": 1.86, + "grad_norm": 29.264329504378058, + "learning_rate": 2.2775170717463902e-09, + "logits/chosen": -2.2539279460906982, + "logits/rejected": -2.0020687580108643, + "logps/chosen": -96.94004821777344, + "logps/rejected": -158.09829711914062, + "loss": 0.1387, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7452301383018494, + "rewards/margins": 4.1389360427856445, + "rewards/rejected": -3.3937058448791504, + "step": 5120 + }, + { + "epoch": 1.86, + "grad_norm": 25.34055019816174, + "learning_rate": 2.1627761580962687e-09, + "logits/chosen": -2.1744818687438965, + "logits/rejected": -1.969853162765503, + "logps/chosen": -87.83367156982422, + "logps/rejected": -131.83323669433594, + "loss": 0.1646, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.7479078769683838, + "rewards/margins": 2.210872173309326, + "rewards/rejected": -1.4629642963409424, + "step": 5130 + }, + { + "epoch": 1.87, + "grad_norm": 28.764305762206437, + "learning_rate": 2.0509584776351506e-09, + "logits/chosen": -1.656760573387146, + "logits/rejected": -1.4122374057769775, + "logps/chosen": -72.59513092041016, + "logps/rejected": -148.3843994140625, + "loss": 0.1104, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1340370178222656, + "rewards/margins": 4.139061450958252, + "rewards/rejected": -3.0050246715545654, + "step": 5140 + }, + { + "epoch": 1.87, + "grad_norm": 25.193191046247687, + "learning_rate": 1.942068518029333e-09, + "logits/chosen": -2.1100473403930664, + "logits/rejected": -1.478046178817749, + "logps/chosen": -74.53004455566406, + "logps/rejected": -263.0050964355469, + "loss": 0.1563, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2791024446487427, + "rewards/margins": 4.334400177001953, + "rewards/rejected": -3.055298089981079, + "step": 5150 + }, + { + "epoch": 1.87, + "grad_norm": 38.53224961606162, + "learning_rate": 1.8361106494445943e-09, + "logits/chosen": -2.0669713020324707, + "logits/rejected": -1.703884482383728, + "logps/chosen": -94.36878967285156, + "logps/rejected": -157.6041717529297, + "loss": 0.1447, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8586713671684265, + "rewards/margins": 3.7407805919647217, + "rewards/rejected": -2.8821091651916504, + "step": 5160 + }, + { + "epoch": 1.88, + "grad_norm": 43.063554524576524, + "learning_rate": 1.7330891243708456e-09, + "logits/chosen": -1.6464707851409912, + "logits/rejected": -1.8101832866668701, + "logps/chosen": -121.72802734375, + "logps/rejected": -158.669677734375, + "loss": 0.1641, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5665308237075806, + "rewards/margins": 3.349714756011963, + "rewards/rejected": -2.7831835746765137, + "step": 5170 + }, + { + "epoch": 1.88, + "grad_norm": 40.989160241743704, + "learning_rate": 1.633008077451406e-09, + "logits/chosen": -2.263235569000244, + "logits/rejected": -2.035400867462158, + "logps/chosen": -78.44254302978516, + "logps/rejected": -152.73580932617188, + "loss": 0.1752, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6941092014312744, + "rewards/margins": 4.51024055480957, + "rewards/rejected": -2.816131353378296, + "step": 5180 + }, + { + "epoch": 1.88, + "grad_norm": 35.64059838178094, + "learning_rate": 1.5358715253170785e-09, + "logits/chosen": -1.6528642177581787, + "logits/rejected": -1.7316625118255615, + "logps/chosen": -97.46934509277344, + "logps/rejected": -128.89291381835938, + "loss": 0.1489, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1307111978530884, + "rewards/margins": 3.048318386077881, + "rewards/rejected": -1.917607069015503, + "step": 5190 + }, + { + "epoch": 1.89, + "grad_norm": 21.32011780611291, + "learning_rate": 1.4416833664249867e-09, + "logits/chosen": -1.8180415630340576, + "logits/rejected": -1.7365614175796509, + "logps/chosen": -99.86917877197266, + "logps/rejected": -146.1660919189453, + "loss": 0.1752, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.311732530593872, + "rewards/margins": 3.4762260913848877, + "rewards/rejected": -2.1644930839538574, + "step": 5200 + }, + { + "epoch": 1.89, + "eval_logits/chosen": -2.4827630519866943, + "eval_logits/rejected": -2.2978460788726807, + "eval_logps/chosen": -92.51727294921875, + "eval_logps/rejected": -155.1031036376953, + "eval_loss": 0.2567766308784485, + "eval_rewards/accuracies": 0.9107142686843872, + "eval_rewards/chosen": 0.7438983917236328, + "eval_rewards/margins": 3.717064380645752, + "eval_rewards/rejected": -2.973165512084961, + "eval_runtime": 71.4535, + "eval_samples_per_second": 12.456, + "eval_steps_per_second": 0.196, + "step": 5200 + }, + { + "epoch": 1.89, + "grad_norm": 14.906342368048502, + "learning_rate": 1.3504473809020673e-09, + "logits/chosen": -1.907004714012146, + "logits/rejected": -1.4068089723587036, + "logps/chosen": -86.4648208618164, + "logps/rejected": -206.89736938476562, + "loss": 0.1684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7831238508224487, + "rewards/margins": 4.615767478942871, + "rewards/rejected": -3.8326430320739746, + "step": 5210 + }, + { + "epoch": 1.89, + "grad_norm": 34.29482897768453, + "learning_rate": 1.2621672303933739e-09, + "logits/chosen": -1.811163306236267, + "logits/rejected": -1.6804473400115967, + "logps/chosen": -95.26229858398438, + "logps/rejected": -157.65988159179688, + "loss": 0.1628, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.0265977382659912, + "rewards/margins": 3.7031357288360596, + "rewards/rejected": -2.6765377521514893, + "step": 5220 + }, + { + "epoch": 1.9, + "grad_norm": 28.11537513046596, + "learning_rate": 1.1768464579151373e-09, + "logits/chosen": -1.8990215063095093, + "logits/rejected": -1.9127355813980103, + "logps/chosen": -82.57646179199219, + "logps/rejected": -136.97134399414062, + "loss": 0.1398, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.8750249743461609, + "rewards/margins": 3.5186927318573, + "rewards/rejected": -2.643667697906494, + "step": 5230 + }, + { + "epoch": 1.9, + "grad_norm": 9.966571219874533, + "learning_rate": 1.0944884877125527e-09, + "logits/chosen": -1.402052640914917, + "logits/rejected": -1.1487385034561157, + "logps/chosen": -104.62020111083984, + "logps/rejected": -200.74160766601562, + "loss": 0.1199, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.388352632522583, + "rewards/margins": 4.531402111053467, + "rewards/rejected": -4.143049716949463, + "step": 5240 + }, + { + "epoch": 1.91, + "grad_norm": 20.5792937534979, + "learning_rate": 1.0150966251223664e-09, + "logits/chosen": -2.0096936225891113, + "logits/rejected": -1.805040717124939, + "logps/chosen": -64.17274475097656, + "logps/rejected": -122.17179870605469, + "loss": 0.1306, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.0267283916473389, + "rewards/margins": 4.174807071685791, + "rewards/rejected": -3.1480789184570312, + "step": 5250 + }, + { + "epoch": 1.91, + "grad_norm": 20.585526219504107, + "learning_rate": 9.386740564401808e-10, + "logits/chosen": -2.2588255405426025, + "logits/rejected": -1.8842235803604126, + "logps/chosen": -115.33465576171875, + "logps/rejected": -176.01412963867188, + "loss": 0.1458, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2555707693099976, + "rewards/margins": 4.751716613769531, + "rewards/rejected": -3.4961459636688232, + "step": 5260 + }, + { + "epoch": 1.91, + "grad_norm": 34.226305800359405, + "learning_rate": 8.652238487926566e-10, + "logits/chosen": -1.9869095087051392, + "logits/rejected": -2.1987595558166504, + "logps/chosen": -92.55632781982422, + "logps/rejected": -212.2284393310547, + "loss": 0.1584, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0699260234832764, + "rewards/margins": 3.5520572662353516, + "rewards/rejected": -2.4821314811706543, + "step": 5270 + }, + { + "epoch": 1.92, + "grad_norm": 36.554661458368045, + "learning_rate": 7.947489500143206e-10, + "logits/chosen": -2.079451560974121, + "logits/rejected": -1.7273361682891846, + "logps/chosen": -59.18944549560547, + "logps/rejected": -125.39189147949219, + "loss": 0.1414, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8779332637786865, + "rewards/margins": 3.9232311248779297, + "rewards/rejected": -2.045297622680664, + "step": 5280 + }, + { + "epoch": 1.92, + "grad_norm": 29.3453328036471, + "learning_rate": 7.272521885293343e-10, + "logits/chosen": -1.7185804843902588, + "logits/rejected": -1.2968146800994873, + "logps/chosen": -69.44091033935547, + "logps/rejected": -128.18624877929688, + "loss": 0.1573, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9322481155395508, + "rewards/margins": 3.3514316082000732, + "rewards/rejected": -2.4191837310791016, + "step": 5290 + }, + { + "epoch": 1.92, + "grad_norm": 19.154161212924226, + "learning_rate": 6.627362732379433e-10, + "logits/chosen": -1.8239357471466064, + "logits/rejected": -1.7603832483291626, + "logps/chosen": -96.85991668701172, + "logps/rejected": -174.81370544433594, + "loss": 0.1235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8014510869979858, + "rewards/margins": 3.704523801803589, + "rewards/rejected": -2.9030728340148926, + "step": 5300 + }, + { + "epoch": 1.92, + "eval_logits/chosen": -2.494673490524292, + "eval_logits/rejected": -2.307727336883545, + "eval_logps/chosen": -92.54339599609375, + "eval_logps/rejected": -155.1482696533203, + "eval_loss": 0.25662314891815186, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7412868142127991, + "eval_rewards/margins": 3.718968629837036, + "eval_rewards/rejected": -2.97768235206604, + "eval_runtime": 71.3827, + "eval_samples_per_second": 12.468, + "eval_steps_per_second": 0.196, + "step": 5300 + }, + { + "epoch": 1.93, + "grad_norm": 37.14417488124076, + "learning_rate": 6.012037934077563e-10, + "logits/chosen": -1.4998286962509155, + "logits/rejected": -1.2366199493408203, + "logps/chosen": -93.3680648803711, + "logps/rejected": -167.04367065429688, + "loss": 0.148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9477823972702026, + "rewards/margins": 3.8279433250427246, + "rewards/rejected": -2.8801610469818115, + "step": 5310 + }, + { + "epoch": 1.93, + "grad_norm": 36.70052172928171, + "learning_rate": 5.426572185698363e-10, + "logits/chosen": -2.037949323654175, + "logits/rejected": -1.7177932262420654, + "logps/chosen": -88.7237548828125, + "logps/rejected": -163.99179077148438, + "loss": 0.1393, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0409705638885498, + "rewards/margins": 3.7800567150115967, + "rewards/rejected": -2.7390859127044678, + "step": 5320 + }, + { + "epoch": 1.93, + "grad_norm": 57.51895033093402, + "learning_rate": 4.870988984196134e-10, + "logits/chosen": -1.5953149795532227, + "logits/rejected": -1.3189541101455688, + "logps/chosen": -92.99564361572266, + "logps/rejected": -165.5485382080078, + "loss": 0.1624, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.016343355178833, + "rewards/margins": 4.139420509338379, + "rewards/rejected": -3.123077392578125, + "step": 5330 + }, + { + "epoch": 1.94, + "grad_norm": 29.612269350826534, + "learning_rate": 4.345310627225179e-10, + "logits/chosen": -2.3673906326293945, + "logits/rejected": -1.5509955883026123, + "logps/chosen": -72.61958312988281, + "logps/rejected": -196.7791290283203, + "loss": 0.1208, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.651362419128418, + "rewards/margins": 6.749650478363037, + "rewards/rejected": -5.098288536071777, + "step": 5340 + }, + { + "epoch": 1.94, + "grad_norm": 13.368844959539826, + "learning_rate": 3.849558212245696e-10, + "logits/chosen": -1.5998425483703613, + "logits/rejected": -1.4031832218170166, + "logps/chosen": -85.34275817871094, + "logps/rejected": -154.41502380371094, + "loss": 0.1364, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0656564235687256, + "rewards/margins": 3.6739463806152344, + "rewards/rejected": -2.608290195465088, + "step": 5350 + }, + { + "epoch": 1.95, + "grad_norm": 32.464438755302695, + "learning_rate": 3.3837516356764464e-10, + "logits/chosen": -2.0724587440490723, + "logits/rejected": -1.6080551147460938, + "logps/chosen": -91.76570129394531, + "logps/rejected": -153.9037628173828, + "loss": 0.1588, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.46443551778793335, + "rewards/margins": 2.967500925064087, + "rewards/rejected": -2.503065586090088, + "step": 5360 + }, + { + "epoch": 1.95, + "grad_norm": 17.335570454742644, + "learning_rate": 2.947909592096648e-10, + "logits/chosen": -1.7819137573242188, + "logits/rejected": -1.6901859045028687, + "logps/chosen": -83.9917221069336, + "logps/rejected": -155.39439392089844, + "loss": 0.17, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1832668781280518, + "rewards/margins": 4.525895118713379, + "rewards/rejected": -3.3426289558410645, + "step": 5370 + }, + { + "epoch": 1.95, + "grad_norm": 24.93213979968397, + "learning_rate": 2.542049573495325e-10, + "logits/chosen": -1.923814058303833, + "logits/rejected": -1.7243621349334717, + "logps/chosen": -109.0640869140625, + "logps/rejected": -164.2493438720703, + "loss": 0.1342, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.23761026561260223, + "rewards/margins": 3.5649254322052, + "rewards/rejected": -3.327315092086792, + "step": 5380 + }, + { + "epoch": 1.96, + "grad_norm": 25.680467090159638, + "learning_rate": 2.166187868569619e-10, + "logits/chosen": -1.646674394607544, + "logits/rejected": -1.8654903173446655, + "logps/chosen": -107.28630065917969, + "logps/rejected": -148.73915100097656, + "loss": 0.1411, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.48937278985977173, + "rewards/margins": 3.2133946418762207, + "rewards/rejected": -2.7240214347839355, + "step": 5390 + }, + { + "epoch": 1.96, + "grad_norm": 26.62093298807946, + "learning_rate": 1.8203395620708107e-10, + "logits/chosen": -2.068347215652466, + "logits/rejected": -1.635754942893982, + "logps/chosen": -80.6448745727539, + "logps/rejected": -183.50900268554688, + "loss": 0.1288, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8882523775100708, + "rewards/margins": 4.448118209838867, + "rewards/rejected": -3.559866428375244, + "step": 5400 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.488870143890381, + "eval_logits/rejected": -2.304009199142456, + "eval_logps/chosen": -92.57149505615234, + "eval_logps/rejected": -155.0608673095703, + "eval_loss": 0.2569134533405304, + "eval_rewards/accuracies": 0.9107142686843872, + "eval_rewards/chosen": 0.738475501537323, + "eval_rewards/margins": 3.7074153423309326, + "eval_rewards/rejected": -2.968940258026123, + "eval_runtime": 71.568, + "eval_samples_per_second": 12.436, + "eval_steps_per_second": 0.196, + "step": 5400 + }, + { + "epoch": 1.96, + "grad_norm": 60.89983951450628, + "learning_rate": 1.5045185341992228e-10, + "logits/chosen": -2.275635242462158, + "logits/rejected": -2.2029271125793457, + "logps/chosen": -95.09009552001953, + "logps/rejected": -132.654296875, + "loss": 0.1563, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.0345149040222168, + "rewards/margins": 3.2276673316955566, + "rewards/rejected": -2.193152666091919, + "step": 5410 + }, + { + "epoch": 1.97, + "grad_norm": 42.30577726329947, + "learning_rate": 1.218737460046748e-10, + "logits/chosen": -1.8863006830215454, + "logits/rejected": -1.45746910572052, + "logps/chosen": -86.98088836669922, + "logps/rejected": -162.14358520507812, + "loss": 0.1412, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4029314517974854, + "rewards/margins": 4.1309075355529785, + "rewards/rejected": -2.727975368499756, + "step": 5420 + }, + { + "epoch": 1.97, + "grad_norm": 22.851908976691202, + "learning_rate": 9.630078090883398e-11, + "logits/chosen": -2.069901943206787, + "logits/rejected": -1.7926028966903687, + "logps/chosen": -121.98948669433594, + "logps/rejected": -309.41168212890625, + "loss": 0.1431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005540943238884211, + "rewards/margins": 4.165543556213379, + "rewards/rejected": -4.171084403991699, + "step": 5430 + }, + { + "epoch": 1.97, + "grad_norm": 24.450065176253204, + "learning_rate": 7.373398447218792e-11, + "logits/chosen": -1.7372915744781494, + "logits/rejected": -1.5503891706466675, + "logps/chosen": -108.78375244140625, + "logps/rejected": -175.95468139648438, + "loss": 0.1728, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1438463926315308, + "rewards/margins": 4.212247848510742, + "rewards/rejected": -3.06840181350708, + "step": 5440 + }, + { + "epoch": 1.98, + "grad_norm": 13.790530961269235, + "learning_rate": 5.417426238560896e-11, + "logits/chosen": -1.820669412612915, + "logits/rejected": -1.7920053005218506, + "logps/chosen": -83.18289184570312, + "logps/rejected": -164.08082580566406, + "loss": 0.1263, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9669841527938843, + "rewards/margins": 4.896847724914551, + "rewards/rejected": -3.9298641681671143, + "step": 5450 + }, + { + "epoch": 1.98, + "grad_norm": 21.370222843228014, + "learning_rate": 3.7622399654682614e-11, + "logits/chosen": -2.2546284198760986, + "logits/rejected": -1.9654285907745361, + "logps/chosen": -91.28435516357422, + "logps/rejected": -227.45193481445312, + "loss": 0.1813, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5948358774185181, + "rewards/margins": 4.2998552322387695, + "rewards/rejected": -3.705019474029541, + "step": 5460 + }, + { + "epoch": 1.99, + "grad_norm": 19.65088303473795, + "learning_rate": 2.4079060568257813e-11, + "logits/chosen": -1.8808691501617432, + "logits/rejected": -1.6000111103057861, + "logps/chosen": -88.05937957763672, + "logps/rejected": -188.5884552001953, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3033487796783447, + "rewards/margins": 4.862802982330322, + "rewards/rejected": -3.5594539642333984, + "step": 5470 + }, + { + "epoch": 1.99, + "grad_norm": 27.89489697549713, + "learning_rate": 1.354478867173492e-11, + "logits/chosen": -1.8578226566314697, + "logits/rejected": -1.4207289218902588, + "logps/chosen": -113.3355941772461, + "logps/rejected": -168.69332885742188, + "loss": 0.1488, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.764236330986023, + "rewards/margins": 3.6643853187561035, + "rewards/rejected": -2.900148868560791, + "step": 5480 + }, + { + "epoch": 1.99, + "grad_norm": 20.734274017785214, + "learning_rate": 6.020006745274808e-12, + "logits/chosen": -2.122195243835449, + "logits/rejected": -1.6640615463256836, + "logps/chosen": -59.822662353515625, + "logps/rejected": -172.18661499023438, + "loss": 0.1118, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6453478336334229, + "rewards/margins": 4.6514892578125, + "rewards/rejected": -3.006141185760498, + "step": 5490 + }, + { + "epoch": 2.0, + "grad_norm": 45.552319616907596, + "learning_rate": 1.5050167868208009e-12, + "logits/chosen": -1.6916072368621826, + "logits/rejected": -1.7415540218353271, + "logps/chosen": -121.29073333740234, + "logps/rejected": -157.93690490722656, + "loss": 0.145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7984371185302734, + "rewards/margins": 4.016904354095459, + "rewards/rejected": -3.2184669971466064, + "step": 5500 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -2.4879519939422607, + "eval_logits/rejected": -2.303224802017212, + "eval_logps/chosen": -92.59040069580078, + "eval_logps/rejected": -155.1884002685547, + "eval_loss": 0.2571623623371124, + "eval_rewards/accuracies": 0.8928571343421936, + "eval_rewards/chosen": 0.7365875840187073, + "eval_rewards/margins": 3.7182834148406982, + "eval_rewards/rejected": -2.9816958904266357, + "eval_runtime": 71.3635, + "eval_samples_per_second": 12.471, + "eval_steps_per_second": 0.196, + "step": 5500 + }, + { + "epoch": 2.0, + "grad_norm": 18.200273756614273, + "learning_rate": 0.0, + "logits/chosen": -1.8841540813446045, + "logits/rejected": -1.5108808279037476, + "logps/chosen": -88.10948181152344, + "logps/rejected": -193.3212127685547, + "loss": 0.1231, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3105828762054443, + "rewards/margins": 4.1531171798706055, + "rewards/rejected": -2.842533826828003, + "step": 5510 + }, + { + "epoch": 2.0, + "step": 5510, + "total_flos": 0.0, + "train_loss": 0.2711078498627443, + "train_runtime": 49070.973, + "train_samples_per_second": 3.592, + "train_steps_per_second": 0.112 + } + ], + "logging_steps": 10, + "max_steps": 5510, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}