diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3886 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 2476, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.0161290322580643e-09, + "logits/chosen": -2.8790624141693115, + "logits/rejected": -2.726853847503662, + "logps/chosen": -133.1189727783203, + "logps/rejected": -82.92158508300781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 2.0161290322580644e-08, + "logits/chosen": -2.8126273155212402, + "logits/rejected": -2.793696165084839, + "logps/chosen": -165.1085968017578, + "logps/rejected": -99.26773834228516, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008918720297515392, + "rewards/margins": 0.006136136595159769, + "rewards/rejected": 0.0027825830038636923, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 4.032258064516129e-08, + "logits/chosen": -2.9156007766723633, + "logits/rejected": -2.7959983348846436, + "logps/chosen": -156.39736938476562, + "logps/rejected": -82.81832122802734, + "loss": 0.6749, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02916492521762848, + "rewards/margins": 0.026107853278517723, + "rewards/rejected": 0.003057069843634963, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 6.048387096774194e-08, + "logits/chosen": -3.0353140830993652, + "logits/rejected": -2.9207911491394043, + "logps/chosen": -143.68771362304688, + "logps/rejected": -88.21602630615234, + "loss": 0.6045, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.08668029308319092, + "rewards/margins": 0.16722653806209564, + "rewards/rejected": -0.08054624497890472, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 8.064516129032257e-08, + "logits/chosen": -2.872088670730591, + "logits/rejected": -2.754939556121826, + "logps/chosen": -156.96128845214844, + "logps/rejected": -94.48432922363281, + "loss": 0.4921, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.31132370233535767, + "rewards/margins": 0.49964460730552673, + "rewards/rejected": -0.18832087516784668, + "step": 40 + }, + { + "epoch": 0.04, + "learning_rate": 1.0080645161290321e-07, + "logits/chosen": -2.892703056335449, + "logits/rejected": -2.8270375728607178, + "logps/chosen": -163.22244262695312, + "logps/rejected": -99.32060241699219, + "loss": 0.3446, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5208563208580017, + "rewards/margins": 1.1299794912338257, + "rewards/rejected": -0.609123170375824, + "step": 50 + }, + { + "epoch": 0.05, + "learning_rate": 1.2096774193548387e-07, + "logits/chosen": -2.9103317260742188, + "logits/rejected": -2.7615671157836914, + "logps/chosen": -149.80177307128906, + "logps/rejected": -103.58976745605469, + "loss": 0.2715, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.14492809772491455, + "rewards/margins": 1.546500563621521, + "rewards/rejected": -1.4015724658966064, + "step": 60 + }, + { + "epoch": 0.06, + "learning_rate": 1.4112903225806453e-07, + "logits/chosen": -2.9492242336273193, + "logits/rejected": -2.7814266681671143, + "logps/chosen": -155.3502960205078, + "logps/rejected": -108.44856262207031, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4613724648952484, + "rewards/margins": 2.1766598224639893, + "rewards/rejected": -1.7152869701385498, + "step": 70 + }, + { + "epoch": 0.06, + "learning_rate": 1.6129032258064515e-07, + "logits/chosen": -2.925982713699341, + "logits/rejected": -2.812129497528076, + "logps/chosen": -148.31051635742188, + "logps/rejected": -101.76717376708984, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6585937142372131, + "rewards/margins": 2.0880322456359863, + "rewards/rejected": -1.4294384717941284, + "step": 80 + }, + { + "epoch": 0.07, + "learning_rate": 1.814516129032258e-07, + "logits/chosen": -2.8553051948547363, + "logits/rejected": -2.752856731414795, + "logps/chosen": -135.9008026123047, + "logps/rejected": -98.0966567993164, + "loss": 0.1354, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5049271583557129, + "rewards/margins": 3.050931453704834, + "rewards/rejected": -2.546004056930542, + "step": 90 + }, + { + "epoch": 0.08, + "learning_rate": 2.0161290322580642e-07, + "logits/chosen": -2.776674747467041, + "logits/rejected": -2.6762099266052246, + "logps/chosen": -143.11875915527344, + "logps/rejected": -110.66424560546875, + "loss": 0.1657, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5857507586479187, + "rewards/margins": 2.9393153190612793, + "rewards/rejected": -2.353564739227295, + "step": 100 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.7206850051879883, + "eval_logits/rejected": -2.5952277183532715, + "eval_logps/chosen": -171.36961364746094, + "eval_logps/rejected": -130.100830078125, + "eval_loss": 0.17399193346500397, + "eval_rewards/accuracies": 0.9801136255264282, + "eval_rewards/chosen": -1.4537159204483032, + "eval_rewards/margins": 2.7167909145355225, + "eval_rewards/rejected": -4.170506954193115, + "eval_runtime": 781.3781, + "eval_samples_per_second": 5.367, + "eval_steps_per_second": 0.169, + "step": 100 + }, + { + "epoch": 0.09, + "learning_rate": 2.2177419354838707e-07, + "logits/chosen": -2.8722500801086426, + "logits/rejected": -2.6607398986816406, + "logps/chosen": -171.62054443359375, + "logps/rejected": -124.04454040527344, + "loss": 0.1121, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.037531446665525436, + "rewards/margins": 3.501634120941162, + "rewards/rejected": -3.4641032218933105, + "step": 110 + }, + { + "epoch": 0.1, + "learning_rate": 2.4193548387096775e-07, + "logits/chosen": -2.8588368892669678, + "logits/rejected": -2.7413594722747803, + "logps/chosen": -155.94931030273438, + "logps/rejected": -128.44300842285156, + "loss": 0.1322, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.024041980504989624, + "rewards/margins": 3.836723804473877, + "rewards/rejected": -3.8607661724090576, + "step": 120 + }, + { + "epoch": 0.11, + "learning_rate": 2.6209677419354835e-07, + "logits/chosen": -2.904231071472168, + "logits/rejected": -2.7524828910827637, + "logps/chosen": -172.7019500732422, + "logps/rejected": -139.36602783203125, + "loss": 0.0942, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3851674497127533, + "rewards/margins": 4.335513114929199, + "rewards/rejected": -3.9503464698791504, + "step": 130 + }, + { + "epoch": 0.11, + "learning_rate": 2.8225806451612905e-07, + "logits/chosen": -2.768216609954834, + "logits/rejected": -2.5636191368103027, + "logps/chosen": -140.77847290039062, + "logps/rejected": -119.4648208618164, + "loss": 0.1081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3905811309814453, + "rewards/margins": 4.004052639007568, + "rewards/rejected": -3.613471508026123, + "step": 140 + }, + { + "epoch": 0.12, + "learning_rate": 3.0241935483870965e-07, + "logits/chosen": -2.818434000015259, + "logits/rejected": -2.687941312789917, + "logps/chosen": -168.5465545654297, + "logps/rejected": -148.0998992919922, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11472310870885849, + "rewards/margins": 4.6449689865112305, + "rewards/rejected": -4.759692192077637, + "step": 150 + }, + { + "epoch": 0.13, + "learning_rate": 3.225806451612903e-07, + "logits/chosen": -2.620514392852783, + "logits/rejected": -2.5738983154296875, + "logps/chosen": -137.00265502929688, + "logps/rejected": -124.59332275390625, + "loss": 0.0943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20515938103199005, + "rewards/margins": 4.401058673858643, + "rewards/rejected": -4.6062188148498535, + "step": 160 + }, + { + "epoch": 0.14, + "learning_rate": 3.4274193548387095e-07, + "logits/chosen": -2.7410850524902344, + "logits/rejected": -2.579488754272461, + "logps/chosen": -159.1106414794922, + "logps/rejected": -131.1763153076172, + "loss": 0.0549, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02330092154443264, + "rewards/margins": 4.570059776306152, + "rewards/rejected": -4.546759128570557, + "step": 170 + }, + { + "epoch": 0.15, + "learning_rate": 3.629032258064516e-07, + "logits/chosen": -2.783935308456421, + "logits/rejected": -2.5747122764587402, + "logps/chosen": -163.29624938964844, + "logps/rejected": -138.75405883789062, + "loss": 0.0857, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5057327151298523, + "rewards/margins": 4.796816825866699, + "rewards/rejected": -5.302549839019775, + "step": 180 + }, + { + "epoch": 0.15, + "learning_rate": 3.8306451612903225e-07, + "logits/chosen": -2.7054946422576904, + "logits/rejected": -2.531750440597534, + "logps/chosen": -159.70993041992188, + "logps/rejected": -145.06936645507812, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03374135494232178, + "rewards/margins": 5.383774757385254, + "rewards/rejected": -5.417515754699707, + "step": 190 + }, + { + "epoch": 0.16, + "learning_rate": 4.0322580645161285e-07, + "logits/chosen": -2.6880927085876465, + "logits/rejected": -2.574934482574463, + "logps/chosen": -147.19931030273438, + "logps/rejected": -138.7591094970703, + "loss": 0.0782, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.06928863376379013, + "rewards/margins": 4.963142395019531, + "rewards/rejected": -5.032431602478027, + "step": 200 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.586061954498291, + "eval_logits/rejected": -2.4356651306152344, + "eval_logps/chosen": -161.23614501953125, + "eval_logps/rejected": -145.42860412597656, + "eval_loss": 0.06267153471708298, + "eval_rewards/accuracies": 0.9867424368858337, + "eval_rewards/chosen": -0.4403703808784485, + "eval_rewards/margins": 5.262913227081299, + "eval_rewards/rejected": -5.703283786773682, + "eval_runtime": 773.4519, + "eval_samples_per_second": 5.422, + "eval_steps_per_second": 0.171, + "step": 200 + }, + { + "epoch": 0.17, + "learning_rate": 4.2338709677419355e-07, + "logits/chosen": -2.745858669281006, + "logits/rejected": -2.536473035812378, + "logps/chosen": -170.9766387939453, + "logps/rejected": -152.20152282714844, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35801127552986145, + "rewards/margins": 5.608736038208008, + "rewards/rejected": -5.966746807098389, + "step": 210 + }, + { + "epoch": 0.18, + "learning_rate": 4.4354838709677415e-07, + "logits/chosen": -2.8358755111694336, + "logits/rejected": -2.5442118644714355, + "logps/chosen": -182.1422882080078, + "logps/rejected": -157.044189453125, + "loss": 0.0481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2051059901714325, + "rewards/margins": 6.3389997482299805, + "rewards/rejected": -6.54410457611084, + "step": 220 + }, + { + "epoch": 0.19, + "learning_rate": 4.637096774193548e-07, + "logits/chosen": -2.6918585300445557, + "logits/rejected": -2.5473952293395996, + "logps/chosen": -150.92379760742188, + "logps/rejected": -136.5950469970703, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0055077215656638145, + "rewards/margins": 5.303520202636719, + "rewards/rejected": -5.309027671813965, + "step": 230 + }, + { + "epoch": 0.19, + "learning_rate": 4.838709677419355e-07, + "logits/chosen": -2.566920042037964, + "logits/rejected": -2.4885470867156982, + "logps/chosen": -150.51084899902344, + "logps/rejected": -159.2104949951172, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8494253158569336, + "rewards/margins": 6.480669975280762, + "rewards/rejected": -7.330096244812012, + "step": 240 + }, + { + "epoch": 0.2, + "learning_rate": 4.995511669658887e-07, + "logits/chosen": -2.7524497509002686, + "logits/rejected": -2.5501372814178467, + "logps/chosen": -167.72291564941406, + "logps/rejected": -157.83465576171875, + "loss": 0.0794, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1402127742767334, + "rewards/margins": 5.685840129852295, + "rewards/rejected": -6.826052665710449, + "step": 250 + }, + { + "epoch": 0.21, + "learning_rate": 4.973070017953321e-07, + "logits/chosen": -2.6455769538879395, + "logits/rejected": -2.458173990249634, + "logps/chosen": -159.10935974121094, + "logps/rejected": -158.1671600341797, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4249328672885895, + "rewards/margins": 6.448081970214844, + "rewards/rejected": -6.8730149269104, + "step": 260 + }, + { + "epoch": 0.22, + "learning_rate": 4.950628366247755e-07, + "logits/chosen": -2.6962273120880127, + "logits/rejected": -2.456935405731201, + "logps/chosen": -153.63746643066406, + "logps/rejected": -153.63490295410156, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9769867062568665, + "rewards/margins": 6.403193473815918, + "rewards/rejected": -7.380180358886719, + "step": 270 + }, + { + "epoch": 0.23, + "learning_rate": 4.92818671454219e-07, + "logits/chosen": -2.798576831817627, + "logits/rejected": -2.623798370361328, + "logps/chosen": -171.66201782226562, + "logps/rejected": -155.81637573242188, + "loss": 0.0538, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.07002735137939453, + "rewards/margins": 6.355216979980469, + "rewards/rejected": -6.285189151763916, + "step": 280 + }, + { + "epoch": 0.23, + "learning_rate": 4.905745062836625e-07, + "logits/chosen": -2.5673298835754395, + "logits/rejected": -2.409681558609009, + "logps/chosen": -161.04302978515625, + "logps/rejected": -153.73336791992188, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8906537294387817, + "rewards/margins": 5.4741902351379395, + "rewards/rejected": -6.364843845367432, + "step": 290 + }, + { + "epoch": 0.24, + "learning_rate": 4.883303411131059e-07, + "logits/chosen": -2.5682172775268555, + "logits/rejected": -2.4165453910827637, + "logps/chosen": -151.08840942382812, + "logps/rejected": -148.3418426513672, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8436424136161804, + "rewards/margins": 6.011213302612305, + "rewards/rejected": -6.854855537414551, + "step": 300 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.546065330505371, + "eval_logits/rejected": -2.3642754554748535, + "eval_logps/chosen": -166.13868713378906, + "eval_logps/rejected": -169.60226440429688, + "eval_loss": 0.03670450299978256, + "eval_rewards/accuracies": 0.9886363744735718, + "eval_rewards/chosen": -0.9306213855743408, + "eval_rewards/margins": 7.190025806427002, + "eval_rewards/rejected": -8.120647430419922, + "eval_runtime": 786.1067, + "eval_samples_per_second": 5.335, + "eval_steps_per_second": 0.168, + "step": 300 + }, + { + "epoch": 0.25, + "learning_rate": 4.860861759425494e-07, + "logits/chosen": -2.6268093585968018, + "logits/rejected": -2.3745460510253906, + "logps/chosen": -155.4282989501953, + "logps/rejected": -154.62167358398438, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.217402696609497, + "rewards/margins": 6.246805667877197, + "rewards/rejected": -7.464208126068115, + "step": 310 + }, + { + "epoch": 0.26, + "learning_rate": 4.838420107719928e-07, + "logits/chosen": -2.5372049808502197, + "logits/rejected": -2.342886447906494, + "logps/chosen": -149.39791870117188, + "logps/rejected": -156.192626953125, + "loss": 0.0606, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.4715734720230103, + "rewards/margins": 6.226667881011963, + "rewards/rejected": -7.698241233825684, + "step": 320 + }, + { + "epoch": 0.27, + "learning_rate": 4.815978456014363e-07, + "logits/chosen": -2.605268955230713, + "logits/rejected": -2.4892284870147705, + "logps/chosen": -176.8946075439453, + "logps/rejected": -188.3580780029297, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0982472896575928, + "rewards/margins": 7.327727317810059, + "rewards/rejected": -8.42597484588623, + "step": 330 + }, + { + "epoch": 0.27, + "learning_rate": 4.793536804308798e-07, + "logits/chosen": -2.639091968536377, + "logits/rejected": -2.4117789268493652, + "logps/chosen": -175.70474243164062, + "logps/rejected": -185.2564239501953, + "loss": 0.0348, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5422905683517456, + "rewards/margins": 7.561958312988281, + "rewards/rejected": -9.104249954223633, + "step": 340 + }, + { + "epoch": 0.28, + "learning_rate": 4.771095152603231e-07, + "logits/chosen": -2.607506275177002, + "logits/rejected": -2.3423047065734863, + "logps/chosen": -156.63760375976562, + "logps/rejected": -168.81532287597656, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4262692332267761, + "rewards/margins": 8.089990615844727, + "rewards/rejected": -8.51625919342041, + "step": 350 + }, + { + "epoch": 0.29, + "learning_rate": 4.748653500897666e-07, + "logits/chosen": -2.640336275100708, + "logits/rejected": -2.4621763229370117, + "logps/chosen": -182.67959594726562, + "logps/rejected": -192.45523071289062, + "loss": 0.0264, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.4861513376235962, + "rewards/margins": 7.722003936767578, + "rewards/rejected": -9.208155632019043, + "step": 360 + }, + { + "epoch": 0.3, + "learning_rate": 4.7262118491921007e-07, + "logits/chosen": -2.663761615753174, + "logits/rejected": -2.471992015838623, + "logps/chosen": -153.07244873046875, + "logps/rejected": -164.763427734375, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.455026775598526, + "rewards/margins": 7.994199275970459, + "rewards/rejected": -8.449226379394531, + "step": 370 + }, + { + "epoch": 0.31, + "learning_rate": 4.7037701974865345e-07, + "logits/chosen": -2.6942331790924072, + "logits/rejected": -2.403059244155884, + "logps/chosen": -164.22671508789062, + "logps/rejected": -168.85794067382812, + "loss": 0.0338, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5185848474502563, + "rewards/margins": 7.680899143218994, + "rewards/rejected": -8.199483871459961, + "step": 380 + }, + { + "epoch": 0.32, + "learning_rate": 4.6813285457809694e-07, + "logits/chosen": -2.577643871307373, + "logits/rejected": -2.4389777183532715, + "logps/chosen": -165.57760620117188, + "logps/rejected": -183.7444610595703, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7765599489212036, + "rewards/margins": 8.763890266418457, + "rewards/rejected": -9.540449142456055, + "step": 390 + }, + { + "epoch": 0.32, + "learning_rate": 4.658886894075404e-07, + "logits/chosen": -2.6918811798095703, + "logits/rejected": -2.4850518703460693, + "logps/chosen": -176.52276611328125, + "logps/rejected": -199.3242950439453, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4871985912322998, + "rewards/margins": 9.156364440917969, + "rewards/rejected": -10.643563270568848, + "step": 400 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -2.5608372688293457, + "eval_logits/rejected": -2.335174560546875, + "eval_logps/chosen": -160.62420654296875, + "eval_logps/rejected": -161.9944305419922, + "eval_loss": 0.0351564958691597, + "eval_rewards/accuracies": 0.9905303120613098, + "eval_rewards/chosen": -0.37917277216911316, + "eval_rewards/margins": 6.980693817138672, + "eval_rewards/rejected": -7.359867095947266, + "eval_runtime": 1110.4682, + "eval_samples_per_second": 3.777, + "eval_steps_per_second": 0.119, + "step": 400 + }, + { + "epoch": 0.33, + "learning_rate": 4.636445242369838e-07, + "logits/chosen": -2.692749500274658, + "logits/rejected": -2.426208972930908, + "logps/chosen": -170.6109619140625, + "logps/rejected": -174.5077362060547, + "loss": 0.038, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5429098010063171, + "rewards/margins": 7.455277919769287, + "rewards/rejected": -7.9981889724731445, + "step": 410 + }, + { + "epoch": 0.34, + "learning_rate": 4.6140035906642726e-07, + "logits/chosen": -2.6611812114715576, + "logits/rejected": -2.518275737762451, + "logps/chosen": -176.6471710205078, + "logps/rejected": -179.11354064941406, + "loss": 0.0429, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.369283676147461, + "rewards/margins": 7.647196292877197, + "rewards/rejected": -9.0164794921875, + "step": 420 + }, + { + "epoch": 0.35, + "learning_rate": 4.5915619389587075e-07, + "logits/chosen": -2.7353851795196533, + "logits/rejected": -2.507570505142212, + "logps/chosen": -163.0355224609375, + "logps/rejected": -174.5540771484375, + "loss": 0.047, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3989851474761963, + "rewards/margins": 7.921950340270996, + "rewards/rejected": -9.32093620300293, + "step": 430 + }, + { + "epoch": 0.36, + "learning_rate": 4.5691202872531413e-07, + "logits/chosen": -2.784371852874756, + "logits/rejected": -2.4980039596557617, + "logps/chosen": -177.0889129638672, + "logps/rejected": -178.2031707763672, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0966353639960289, + "rewards/margins": 9.405332565307617, + "rewards/rejected": -9.308697700500488, + "step": 440 + }, + { + "epoch": 0.36, + "learning_rate": 4.546678635547576e-07, + "logits/chosen": -2.695643663406372, + "logits/rejected": -2.4506072998046875, + "logps/chosen": -182.611083984375, + "logps/rejected": -188.56802368164062, + "loss": 0.0387, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.810103416442871, + "rewards/margins": 7.231103420257568, + "rewards/rejected": -10.041206359863281, + "step": 450 + }, + { + "epoch": 0.37, + "learning_rate": 4.5242369838420106e-07, + "logits/chosen": -2.569213390350342, + "logits/rejected": -2.4080493450164795, + "logps/chosen": -169.8690948486328, + "logps/rejected": -176.78543090820312, + "loss": 0.0481, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5461719036102295, + "rewards/margins": 6.8780083656311035, + "rewards/rejected": -8.42418098449707, + "step": 460 + }, + { + "epoch": 0.38, + "learning_rate": 4.501795332136445e-07, + "logits/chosen": -2.7197229862213135, + "logits/rejected": -2.463156223297119, + "logps/chosen": -171.43092346191406, + "logps/rejected": -185.08908081054688, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1758825778961182, + "rewards/margins": 8.558847427368164, + "rewards/rejected": -9.734728813171387, + "step": 470 + }, + { + "epoch": 0.39, + "learning_rate": 4.4793536804308794e-07, + "logits/chosen": -2.6020545959472656, + "logits/rejected": -2.419907808303833, + "logps/chosen": -177.44053649902344, + "logps/rejected": -183.7080078125, + "loss": 0.0592, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.427366852760315, + "rewards/margins": 8.426870346069336, + "rewards/rejected": -9.85423755645752, + "step": 480 + }, + { + "epoch": 0.4, + "learning_rate": 4.4569120287253143e-07, + "logits/chosen": -2.6287271976470947, + "logits/rejected": -2.4881701469421387, + "logps/chosen": -170.64492797851562, + "logps/rejected": -179.0166015625, + "loss": 0.0244, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0986562967300415, + "rewards/margins": 7.85208797454834, + "rewards/rejected": -8.950745582580566, + "step": 490 + }, + { + "epoch": 0.4, + "learning_rate": 4.434470377019748e-07, + "logits/chosen": -2.6637444496154785, + "logits/rejected": -2.4109792709350586, + "logps/chosen": -163.22930908203125, + "logps/rejected": -184.43386840820312, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8302538394927979, + "rewards/margins": 8.593484878540039, + "rewards/rejected": -10.423739433288574, + "step": 500 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -2.5164012908935547, + "eval_logits/rejected": -2.3514113426208496, + "eval_logps/chosen": -177.7890167236328, + "eval_logps/rejected": -199.7706756591797, + "eval_loss": 0.025920618325471878, + "eval_rewards/accuracies": 0.9895833134651184, + "eval_rewards/chosen": -2.0956544876098633, + "eval_rewards/margins": 9.041834831237793, + "eval_rewards/rejected": -11.137490272521973, + "eval_runtime": 1115.6923, + "eval_samples_per_second": 3.759, + "eval_steps_per_second": 0.118, + "step": 500 + }, + { + "epoch": 0.41, + "learning_rate": 4.412028725314183e-07, + "logits/chosen": -2.606102466583252, + "logits/rejected": -2.4682023525238037, + "logps/chosen": -166.85171508789062, + "logps/rejected": -192.0852508544922, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.542297840118408, + "rewards/margins": 7.499887943267822, + "rewards/rejected": -10.04218578338623, + "step": 510 + }, + { + "epoch": 0.42, + "learning_rate": 4.389587073608617e-07, + "logits/chosen": -2.7541720867156982, + "logits/rejected": -2.5523972511291504, + "logps/chosen": -155.0984344482422, + "logps/rejected": -166.35794067382812, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9780011177062988, + "rewards/margins": 7.406359672546387, + "rewards/rejected": -8.384361267089844, + "step": 520 + }, + { + "epoch": 0.43, + "learning_rate": 4.367145421903052e-07, + "logits/chosen": -2.80572247505188, + "logits/rejected": -2.562246799468994, + "logps/chosen": -195.3736572265625, + "logps/rejected": -210.16983032226562, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.055255174636841, + "rewards/margins": 9.52314567565918, + "rewards/rejected": -11.578401565551758, + "step": 530 + }, + { + "epoch": 0.44, + "learning_rate": 4.3447037701974867e-07, + "logits/chosen": -2.819672107696533, + "logits/rejected": -2.564779758453369, + "logps/chosen": -199.82110595703125, + "logps/rejected": -214.34976196289062, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3769965171813965, + "rewards/margins": 9.320544242858887, + "rewards/rejected": -11.697539329528809, + "step": 540 + }, + { + "epoch": 0.44, + "learning_rate": 4.3222621184919205e-07, + "logits/chosen": -2.578565835952759, + "logits/rejected": -2.3880248069763184, + "logps/chosen": -159.0529022216797, + "logps/rejected": -190.7191925048828, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1098084449768066, + "rewards/margins": 9.689680099487305, + "rewards/rejected": -10.799488067626953, + "step": 550 + }, + { + "epoch": 0.45, + "learning_rate": 4.2998204667863554e-07, + "logits/chosen": -2.5756804943084717, + "logits/rejected": -2.3731701374053955, + "logps/chosen": -168.4532470703125, + "logps/rejected": -193.602294921875, + "loss": 0.0308, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.453439235687256, + "rewards/margins": 8.603952407836914, + "rewards/rejected": -11.057391166687012, + "step": 560 + }, + { + "epoch": 0.46, + "learning_rate": 4.27737881508079e-07, + "logits/chosen": -2.7255911827087402, + "logits/rejected": -2.4435410499572754, + "logps/chosen": -179.69967651367188, + "logps/rejected": -202.15634155273438, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1704384088516235, + "rewards/margins": 9.947944641113281, + "rewards/rejected": -11.118382453918457, + "step": 570 + }, + { + "epoch": 0.47, + "learning_rate": 4.254937163375224e-07, + "logits/chosen": -2.703125, + "logits/rejected": -2.330751419067383, + "logps/chosen": -186.79486083984375, + "logps/rejected": -182.8419189453125, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.196533679962158, + "rewards/margins": 7.0288496017456055, + "rewards/rejected": -9.225382804870605, + "step": 580 + }, + { + "epoch": 0.48, + "learning_rate": 4.2324955116696586e-07, + "logits/chosen": -2.641286611557007, + "logits/rejected": -2.3214941024780273, + "logps/chosen": -163.7178955078125, + "logps/rejected": -178.2230224609375, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9554588198661804, + "rewards/margins": 8.69752311706543, + "rewards/rejected": -9.652982711791992, + "step": 590 + }, + { + "epoch": 0.48, + "learning_rate": 4.2100538599640935e-07, + "logits/chosen": -2.5387253761291504, + "logits/rejected": -2.4063801765441895, + "logps/chosen": -175.10360717773438, + "logps/rejected": -205.7406005859375, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.580308437347412, + "rewards/margins": 9.250917434692383, + "rewards/rejected": -10.831225395202637, + "step": 600 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -2.514103412628174, + "eval_logits/rejected": -2.2953569889068604, + "eval_logps/chosen": -172.6114959716797, + "eval_logps/rejected": -197.78921508789062, + "eval_loss": 0.022758163511753082, + "eval_rewards/accuracies": 0.9905303120613098, + "eval_rewards/chosen": -1.5779017210006714, + "eval_rewards/margins": 9.361441612243652, + "eval_rewards/rejected": -10.939343452453613, + "eval_runtime": 1103.8243, + "eval_samples_per_second": 3.8, + "eval_steps_per_second": 0.12, + "step": 600 + }, + { + "epoch": 0.49, + "learning_rate": 4.1876122082585273e-07, + "logits/chosen": -2.6256027221679688, + "logits/rejected": -2.385557174682617, + "logps/chosen": -161.47174072265625, + "logps/rejected": -192.36395263671875, + "loss": 0.0355, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.3544869422912598, + "rewards/margins": 8.635656356811523, + "rewards/rejected": -10.990143775939941, + "step": 610 + }, + { + "epoch": 0.5, + "learning_rate": 4.165170556552962e-07, + "logits/chosen": -2.6323723793029785, + "logits/rejected": -2.335218906402588, + "logps/chosen": -172.14358520507812, + "logps/rejected": -205.28488159179688, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5935888290405273, + "rewards/margins": 10.57735824584961, + "rewards/rejected": -12.170948028564453, + "step": 620 + }, + { + "epoch": 0.51, + "learning_rate": 4.1427289048473966e-07, + "logits/chosen": -2.689697742462158, + "logits/rejected": -2.5054707527160645, + "logps/chosen": -179.0841064453125, + "logps/rejected": -193.1809844970703, + "loss": 0.0763, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.2709579467773438, + "rewards/margins": 7.9121809005737305, + "rewards/rejected": -10.18313980102539, + "step": 630 + }, + { + "epoch": 0.52, + "learning_rate": 4.120287253141831e-07, + "logits/chosen": -2.5570006370544434, + "logits/rejected": -2.4946446418762207, + "logps/chosen": -222.3629150390625, + "logps/rejected": -252.6125030517578, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.802462100982666, + "rewards/margins": 10.474245071411133, + "rewards/rejected": -14.276705741882324, + "step": 640 + }, + { + "epoch": 0.53, + "learning_rate": 4.0978456014362654e-07, + "logits/chosen": -2.652097225189209, + "logits/rejected": -2.424015760421753, + "logps/chosen": -206.3131561279297, + "logps/rejected": -218.7412109375, + "loss": 0.034, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.642483234405518, + "rewards/margins": 7.916609287261963, + "rewards/rejected": -12.55909252166748, + "step": 650 + }, + { + "epoch": 0.53, + "learning_rate": 4.0754039497307003e-07, + "logits/chosen": -2.6322474479675293, + "logits/rejected": -2.464550495147705, + "logps/chosen": -189.62890625, + "logps/rejected": -222.5862274169922, + "loss": 0.0301, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.360064744949341, + "rewards/margins": 9.689141273498535, + "rewards/rejected": -13.049206733703613, + "step": 660 + }, + { + "epoch": 0.54, + "learning_rate": 4.052962298025134e-07, + "logits/chosen": -2.746980905532837, + "logits/rejected": -2.6040847301483154, + "logps/chosen": -158.00064086914062, + "logps/rejected": -176.2129364013672, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.534571886062622, + "rewards/margins": 7.206589698791504, + "rewards/rejected": -8.741161346435547, + "step": 670 + }, + { + "epoch": 0.55, + "learning_rate": 4.030520646319569e-07, + "logits/chosen": -2.8348677158355713, + "logits/rejected": -2.590057373046875, + "logps/chosen": -154.6415252685547, + "logps/rejected": -144.91162109375, + "loss": 0.0441, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3402763605117798, + "rewards/margins": 5.1438798904418945, + "rewards/rejected": -6.484156608581543, + "step": 680 + }, + { + "epoch": 0.56, + "learning_rate": 4.0080789946140034e-07, + "logits/chosen": -2.893965244293213, + "logits/rejected": -2.5879921913146973, + "logps/chosen": -167.04054260253906, + "logps/rejected": -160.55148315429688, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9482299089431763, + "rewards/margins": 6.818035125732422, + "rewards/rejected": -7.766264915466309, + "step": 690 + }, + { + "epoch": 0.57, + "learning_rate": 3.985637342908438e-07, + "logits/chosen": -2.8103203773498535, + "logits/rejected": -2.6285464763641357, + "logps/chosen": -174.45706176757812, + "logps/rejected": -195.96969604492188, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.97511887550354, + "rewards/margins": 7.949165344238281, + "rewards/rejected": -9.924284934997559, + "step": 700 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -2.650165319442749, + "eval_logits/rejected": -2.414780616760254, + "eval_logps/chosen": -175.13243103027344, + "eval_logps/rejected": -193.57337951660156, + "eval_loss": 0.02759966067969799, + "eval_rewards/accuracies": 0.9886363744735718, + "eval_rewards/chosen": -1.8299955129623413, + "eval_rewards/margins": 8.687764167785645, + "eval_rewards/rejected": -10.517760276794434, + "eval_runtime": 1093.7599, + "eval_samples_per_second": 3.834, + "eval_steps_per_second": 0.121, + "step": 700 + }, + { + "epoch": 0.57, + "learning_rate": 3.9631956912028727e-07, + "logits/chosen": -2.7256417274475098, + "logits/rejected": -2.494997501373291, + "logps/chosen": -162.87600708007812, + "logps/rejected": -191.05235290527344, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5426220893859863, + "rewards/margins": 8.404081344604492, + "rewards/rejected": -10.94670295715332, + "step": 710 + }, + { + "epoch": 0.58, + "learning_rate": 3.9407540394973066e-07, + "logits/chosen": -2.7863926887512207, + "logits/rejected": -2.5455687046051025, + "logps/chosen": -186.17837524414062, + "logps/rejected": -199.7962646484375, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5034596920013428, + "rewards/margins": 8.251537322998047, + "rewards/rejected": -10.754997253417969, + "step": 720 + }, + { + "epoch": 0.59, + "learning_rate": 3.9183123877917415e-07, + "logits/chosen": -2.8568665981292725, + "logits/rejected": -2.4928183555603027, + "logps/chosen": -181.79220581054688, + "logps/rejected": -211.6440887451172, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0168057680130005, + "rewards/margins": 11.163439750671387, + "rewards/rejected": -12.180245399475098, + "step": 730 + }, + { + "epoch": 0.6, + "learning_rate": 3.895870736086176e-07, + "logits/chosen": -2.6492223739624023, + "logits/rejected": -2.3800158500671387, + "logps/chosen": -229.8067626953125, + "logps/rejected": -424.43109130859375, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.747796058654785, + "rewards/margins": 26.524486541748047, + "rewards/rejected": -33.27228546142578, + "step": 740 + }, + { + "epoch": 0.61, + "learning_rate": 3.87342908438061e-07, + "logits/chosen": -2.7311275005340576, + "logits/rejected": -2.4212193489074707, + "logps/chosen": -181.1782989501953, + "logps/rejected": -215.73434448242188, + "loss": 0.1274, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0061917304992676, + "rewards/margins": 11.330682754516602, + "rewards/rejected": -13.336874008178711, + "step": 750 + }, + { + "epoch": 0.61, + "learning_rate": 3.8509874326750446e-07, + "logits/chosen": -2.58239483833313, + "logits/rejected": -2.254823923110962, + "logps/chosen": -166.7842559814453, + "logps/rejected": -219.7112274169922, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5383646488189697, + "rewards/margins": 11.430071830749512, + "rewards/rejected": -13.96843433380127, + "step": 760 + }, + { + "epoch": 0.62, + "learning_rate": 3.8285457809694795e-07, + "logits/chosen": -2.6240200996398926, + "logits/rejected": -2.343017578125, + "logps/chosen": -171.8400115966797, + "logps/rejected": -203.49620056152344, + "loss": 0.0122, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.0811808109283447, + "rewards/margins": 10.63041877746582, + "rewards/rejected": -12.711600303649902, + "step": 770 + }, + { + "epoch": 0.63, + "learning_rate": 3.8061041292639134e-07, + "logits/chosen": -2.600430965423584, + "logits/rejected": -2.368278980255127, + "logps/chosen": -211.08767700195312, + "logps/rejected": -263.0861511230469, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8089599609375, + "rewards/margins": 12.368398666381836, + "rewards/rejected": -16.177358627319336, + "step": 780 + }, + { + "epoch": 0.64, + "learning_rate": 3.783662477558348e-07, + "logits/chosen": -2.756709337234497, + "logits/rejected": -2.4216301441192627, + "logps/chosen": -179.80264282226562, + "logps/rejected": -212.94100952148438, + "loss": 0.0295, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8555870056152344, + "rewards/margins": 10.147234916687012, + "rewards/rejected": -13.002822875976562, + "step": 790 + }, + { + "epoch": 0.65, + "learning_rate": 3.7612208258527826e-07, + "logits/chosen": -2.5451369285583496, + "logits/rejected": -2.2309484481811523, + "logps/chosen": -184.3829345703125, + "logps/rejected": -254.4412384033203, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3452372550964355, + "rewards/margins": 12.825090408325195, + "rewards/rejected": -16.17032814025879, + "step": 800 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.4265408515930176, + "eval_logits/rejected": -2.0531392097473145, + "eval_logps/chosen": -204.18695068359375, + "eval_logps/rejected": -254.88363647460938, + "eval_loss": 0.02958463504910469, + "eval_rewards/accuracies": 0.9867424368858337, + "eval_rewards/chosen": -4.735447883605957, + "eval_rewards/margins": 11.91334056854248, + "eval_rewards/rejected": -16.648788452148438, + "eval_runtime": 1122.0741, + "eval_samples_per_second": 3.738, + "eval_steps_per_second": 0.118, + "step": 800 + }, + { + "epoch": 0.65, + "learning_rate": 3.738779174147217e-07, + "logits/chosen": -2.440091371536255, + "logits/rejected": -1.9208959341049194, + "logps/chosen": -216.06692504882812, + "logps/rejected": -271.1013488769531, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.1133646965026855, + "rewards/margins": 12.381640434265137, + "rewards/rejected": -18.495006561279297, + "step": 810 + }, + { + "epoch": 0.66, + "learning_rate": 3.7163375224416514e-07, + "logits/chosen": -2.5130116939544678, + "logits/rejected": -1.959911584854126, + "logps/chosen": -210.6549835205078, + "logps/rejected": -271.46600341796875, + "loss": 0.0176, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.4466447830200195, + "rewards/margins": 13.212496757507324, + "rewards/rejected": -18.659141540527344, + "step": 820 + }, + { + "epoch": 0.67, + "learning_rate": 3.6938958707360863e-07, + "logits/chosen": -2.5360050201416016, + "logits/rejected": -2.075317859649658, + "logps/chosen": -215.08023071289062, + "logps/rejected": -279.94720458984375, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.763272762298584, + "rewards/margins": 13.919160842895508, + "rewards/rejected": -18.682432174682617, + "step": 830 + }, + { + "epoch": 0.68, + "learning_rate": 3.67145421903052e-07, + "logits/chosen": -2.427487850189209, + "logits/rejected": -1.9396463632583618, + "logps/chosen": -195.2036590576172, + "logps/rejected": -267.75909423828125, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.064001560211182, + "rewards/margins": 13.078373908996582, + "rewards/rejected": -19.14237403869629, + "step": 840 + }, + { + "epoch": 0.69, + "learning_rate": 3.649012567324955e-07, + "logits/chosen": -2.5623385906219482, + "logits/rejected": -1.9093387126922607, + "logps/chosen": -212.22177124023438, + "logps/rejected": -274.71875, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.153492450714111, + "rewards/margins": 13.469499588012695, + "rewards/rejected": -18.62299346923828, + "step": 850 + }, + { + "epoch": 0.69, + "learning_rate": 3.6265709156193894e-07, + "logits/chosen": -2.5318610668182373, + "logits/rejected": -2.047170877456665, + "logps/chosen": -205.8806610107422, + "logps/rejected": -268.97137451171875, + "loss": 0.0273, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.7065229415893555, + "rewards/margins": 13.470039367675781, + "rewards/rejected": -18.176563262939453, + "step": 860 + }, + { + "epoch": 0.7, + "learning_rate": 3.604129263913824e-07, + "logits/chosen": -2.607186794281006, + "logits/rejected": -2.169048309326172, + "logps/chosen": -199.68447875976562, + "logps/rejected": -276.8939514160156, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.01786470413208, + "rewards/margins": 13.837753295898438, + "rewards/rejected": -17.85561752319336, + "step": 870 + }, + { + "epoch": 0.71, + "learning_rate": 3.581687612208258e-07, + "logits/chosen": -2.4413928985595703, + "logits/rejected": -1.9049670696258545, + "logps/chosen": -183.354248046875, + "logps/rejected": -263.49395751953125, + "loss": 0.025, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.0026044845581055, + "rewards/margins": 13.259096145629883, + "rewards/rejected": -18.261699676513672, + "step": 880 + }, + { + "epoch": 0.72, + "learning_rate": 3.559245960502693e-07, + "logits/chosen": -2.545003890991211, + "logits/rejected": -1.9900169372558594, + "logps/chosen": -225.3459014892578, + "logps/rejected": -285.1943054199219, + "loss": 0.0151, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.646821022033691, + "rewards/margins": 12.7666015625, + "rewards/rejected": -18.413423538208008, + "step": 890 + }, + { + "epoch": 0.73, + "learning_rate": 3.536804308797127e-07, + "logits/chosen": -2.368264675140381, + "logits/rejected": -1.9987428188323975, + "logps/chosen": -220.85879516601562, + "logps/rejected": -274.87713623046875, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.6578688621521, + "rewards/margins": 12.028022766113281, + "rewards/rejected": -18.685894012451172, + "step": 900 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.330415725708008, + "eval_logits/rejected": -1.6415473222732544, + "eval_logps/chosen": -207.5311279296875, + "eval_logps/rejected": -273.0435485839844, + "eval_loss": 0.018391458317637444, + "eval_rewards/accuracies": 0.9924242496490479, + "eval_rewards/chosen": -5.069868087768555, + "eval_rewards/margins": 13.394913673400879, + "eval_rewards/rejected": -18.46478271484375, + "eval_runtime": 1239.5719, + "eval_samples_per_second": 3.383, + "eval_steps_per_second": 0.106, + "step": 900 + }, + { + "epoch": 0.74, + "learning_rate": 3.514362657091562e-07, + "logits/chosen": -2.637467384338379, + "logits/rejected": -1.9471591711044312, + "logps/chosen": -205.8500213623047, + "logps/rejected": -268.252685546875, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.260426044464111, + "rewards/margins": 13.376129150390625, + "rewards/rejected": -17.63655662536621, + "step": 910 + }, + { + "epoch": 0.74, + "learning_rate": 3.491921005385996e-07, + "logits/chosen": -2.294602394104004, + "logits/rejected": -1.6903289556503296, + "logps/chosen": -222.7003173828125, + "logps/rejected": -283.40966796875, + "loss": 0.0333, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.2451629638671875, + "rewards/margins": 11.498380661010742, + "rewards/rejected": -18.743541717529297, + "step": 920 + }, + { + "epoch": 0.75, + "learning_rate": 3.4694793536804306e-07, + "logits/chosen": -2.3132457733154297, + "logits/rejected": -1.8328949213027954, + "logps/chosen": -227.30868530273438, + "logps/rejected": -301.00238037109375, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.731290340423584, + "rewards/margins": 14.241165161132812, + "rewards/rejected": -20.972454071044922, + "step": 930 + }, + { + "epoch": 0.76, + "learning_rate": 3.4470377019748655e-07, + "logits/chosen": -2.52744722366333, + "logits/rejected": -1.9447847604751587, + "logps/chosen": -225.65087890625, + "logps/rejected": -288.70111083984375, + "loss": 0.0373, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.149026870727539, + "rewards/margins": 13.226213455200195, + "rewards/rejected": -19.375240325927734, + "step": 940 + }, + { + "epoch": 0.77, + "learning_rate": 3.4245960502692994e-07, + "logits/chosen": -2.3694093227386475, + "logits/rejected": -1.8809788227081299, + "logps/chosen": -205.0959930419922, + "logps/rejected": -277.66571044921875, + "loss": 0.0182, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.043319225311279, + "rewards/margins": 13.691876411437988, + "rewards/rejected": -18.735193252563477, + "step": 950 + }, + { + "epoch": 0.78, + "learning_rate": 3.4021543985637343e-07, + "logits/chosen": -2.309321165084839, + "logits/rejected": -1.5435558557510376, + "logps/chosen": -208.56015014648438, + "logps/rejected": -278.6729736328125, + "loss": 0.0158, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.573729515075684, + "rewards/margins": 13.340641975402832, + "rewards/rejected": -18.914371490478516, + "step": 960 + }, + { + "epoch": 0.78, + "learning_rate": 3.3797127468581687e-07, + "logits/chosen": -2.4142823219299316, + "logits/rejected": -1.4831030368804932, + "logps/chosen": -193.0516815185547, + "logps/rejected": -278.08673095703125, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.008880615234375, + "rewards/margins": 15.484170913696289, + "rewards/rejected": -19.493053436279297, + "step": 970 + }, + { + "epoch": 0.79, + "learning_rate": 3.357271095152603e-07, + "logits/chosen": -2.3693246841430664, + "logits/rejected": -1.7226059436798096, + "logps/chosen": -190.80052185058594, + "logps/rejected": -271.7615661621094, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3757426738739014, + "rewards/margins": 14.809103012084961, + "rewards/rejected": -18.184844970703125, + "step": 980 + }, + { + "epoch": 0.8, + "learning_rate": 3.3348294434470374e-07, + "logits/chosen": -2.428826332092285, + "logits/rejected": -1.9628509283065796, + "logps/chosen": -186.20932006835938, + "logps/rejected": -277.74127197265625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.140962600708008, + "rewards/margins": 15.251138687133789, + "rewards/rejected": -18.392099380493164, + "step": 990 + }, + { + "epoch": 0.81, + "learning_rate": 3.3123877917414723e-07, + "logits/chosen": -2.4653186798095703, + "logits/rejected": -1.6098353862762451, + "logps/chosen": -202.1477813720703, + "logps/rejected": -290.29681396484375, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.702713966369629, + "rewards/margins": 17.114429473876953, + "rewards/rejected": -19.817142486572266, + "step": 1000 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.2844483852386475, + "eval_logits/rejected": -1.5669403076171875, + "eval_logps/chosen": -189.7405242919922, + "eval_logps/rejected": -284.7117004394531, + "eval_loss": 0.01894140988588333, + "eval_rewards/accuracies": 0.9905303120613098, + "eval_rewards/chosen": -3.290806293487549, + "eval_rewards/margins": 16.340787887573242, + "eval_rewards/rejected": -19.631591796875, + "eval_runtime": 1250.5756, + "eval_samples_per_second": 3.354, + "eval_steps_per_second": 0.106, + "step": 1000 + }, + { + "epoch": 0.82, + "learning_rate": 3.289946140035906e-07, + "logits/chosen": -2.492069721221924, + "logits/rejected": -1.884073257446289, + "logps/chosen": -191.72055053710938, + "logps/rejected": -280.27008056640625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.344417095184326, + "rewards/margins": 16.0303955078125, + "rewards/rejected": -18.374813079833984, + "step": 1010 + }, + { + "epoch": 0.82, + "learning_rate": 3.267504488330341e-07, + "logits/chosen": -2.4207677841186523, + "logits/rejected": -1.779359221458435, + "logps/chosen": -210.0410919189453, + "logps/rejected": -304.53704833984375, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7098872661590576, + "rewards/margins": 16.950992584228516, + "rewards/rejected": -20.660879135131836, + "step": 1020 + }, + { + "epoch": 0.83, + "learning_rate": 3.2450628366247755e-07, + "logits/chosen": -2.462038516998291, + "logits/rejected": -1.9092416763305664, + "logps/chosen": -201.73321533203125, + "logps/rejected": -265.5564270019531, + "loss": 0.0339, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.756110191345215, + "rewards/margins": 12.757057189941406, + "rewards/rejected": -17.513164520263672, + "step": 1030 + }, + { + "epoch": 0.84, + "learning_rate": 3.22262118491921e-07, + "logits/chosen": -2.5407001972198486, + "logits/rejected": -2.0681283473968506, + "logps/chosen": -194.4741973876953, + "logps/rejected": -259.650390625, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3851146697998047, + "rewards/margins": 13.322988510131836, + "rewards/rejected": -16.70810317993164, + "step": 1040 + }, + { + "epoch": 0.85, + "learning_rate": 3.200179533213644e-07, + "logits/chosen": -2.6250674724578857, + "logits/rejected": -2.136307716369629, + "logps/chosen": -179.30255126953125, + "logps/rejected": -254.7200927734375, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8000853061676025, + "rewards/margins": 14.223075866699219, + "rewards/rejected": -17.02315902709961, + "step": 1050 + }, + { + "epoch": 0.86, + "learning_rate": 3.177737881508079e-07, + "logits/chosen": -2.6075491905212402, + "logits/rejected": -2.009011745452881, + "logps/chosen": -177.03890991210938, + "logps/rejected": -264.70501708984375, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.44179368019104, + "rewards/margins": 15.438023567199707, + "rewards/rejected": -17.879817962646484, + "step": 1060 + }, + { + "epoch": 0.86, + "learning_rate": 3.155296229802513e-07, + "logits/chosen": -2.6745896339416504, + "logits/rejected": -2.205416679382324, + "logps/chosen": -202.7510223388672, + "logps/rejected": -274.425048828125, + "loss": 0.016, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.217827320098877, + "rewards/margins": 13.918096542358398, + "rewards/rejected": -18.13592529296875, + "step": 1070 + }, + { + "epoch": 0.87, + "learning_rate": 3.132854578096948e-07, + "logits/chosen": -2.418736696243286, + "logits/rejected": -2.0540010929107666, + "logps/chosen": -188.3748779296875, + "logps/rejected": -281.0201416015625, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.675929069519043, + "rewards/margins": 15.002309799194336, + "rewards/rejected": -19.678237915039062, + "step": 1080 + }, + { + "epoch": 0.88, + "learning_rate": 3.110412926391383e-07, + "logits/chosen": -2.4094793796539307, + "logits/rejected": -2.089980363845825, + "logps/chosen": -203.20724487304688, + "logps/rejected": -278.23931884765625, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.738194465637207, + "rewards/margins": 13.724352836608887, + "rewards/rejected": -19.46254539489746, + "step": 1090 + }, + { + "epoch": 0.89, + "learning_rate": 3.0879712746858166e-07, + "logits/chosen": -2.7008748054504395, + "logits/rejected": -2.070899724960327, + "logps/chosen": -228.24008178710938, + "logps/rejected": -286.18505859375, + "loss": 0.025, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.831046104431152, + "rewards/margins": 13.471226692199707, + "rewards/rejected": -19.302270889282227, + "step": 1100 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.4808616638183594, + "eval_logits/rejected": -1.9588054418563843, + "eval_logps/chosen": -206.55545043945312, + "eval_logps/rejected": -287.3612976074219, + "eval_loss": 0.01608540490269661, + "eval_rewards/accuracies": 0.9924242496490479, + "eval_rewards/chosen": -4.972299575805664, + "eval_rewards/margins": 14.924251556396484, + "eval_rewards/rejected": -19.89655113220215, + "eval_runtime": 1244.6943, + "eval_samples_per_second": 3.37, + "eval_steps_per_second": 0.106, + "step": 1100 + }, + { + "epoch": 0.9, + "learning_rate": 3.0655296229802515e-07, + "logits/chosen": -2.30277681350708, + "logits/rejected": -1.6856743097305298, + "logps/chosen": -202.29348754882812, + "logps/rejected": -263.99066162109375, + "loss": 0.0617, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.638394832611084, + "rewards/margins": 12.62812614440918, + "rewards/rejected": -19.266521453857422, + "step": 1110 + }, + { + "epoch": 0.9, + "learning_rate": 3.0430879712746854e-07, + "logits/chosen": -2.0628719329833984, + "logits/rejected": -1.4895120859146118, + "logps/chosen": -248.9067840576172, + "logps/rejected": -317.18109130859375, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.054488182067871, + "rewards/margins": 13.580743789672852, + "rewards/rejected": -22.635234832763672, + "step": 1120 + }, + { + "epoch": 0.91, + "learning_rate": 3.0206463195691203e-07, + "logits/chosen": -2.159860134124756, + "logits/rejected": -1.505788803100586, + "logps/chosen": -225.930908203125, + "logps/rejected": -311.3579406738281, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.442475318908691, + "rewards/margins": 15.45228385925293, + "rewards/rejected": -22.894760131835938, + "step": 1130 + }, + { + "epoch": 0.92, + "learning_rate": 2.9982046678635547e-07, + "logits/chosen": -2.0184290409088135, + "logits/rejected": -1.3578821420669556, + "logps/chosen": -203.81130981445312, + "logps/rejected": -260.977783203125, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.375730991363525, + "rewards/margins": 11.906469345092773, + "rewards/rejected": -19.28219985961914, + "step": 1140 + }, + { + "epoch": 0.93, + "learning_rate": 2.975763016157989e-07, + "logits/chosen": -2.028810501098633, + "logits/rejected": -1.5329904556274414, + "logps/chosen": -253.272216796875, + "logps/rejected": -338.29351806640625, + "loss": 0.1369, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.697281837463379, + "rewards/margins": 15.825586318969727, + "rewards/rejected": -24.52286720275879, + "step": 1150 + }, + { + "epoch": 0.94, + "learning_rate": 2.9533213644524234e-07, + "logits/chosen": -2.2891311645507812, + "logits/rejected": -1.6473753452301025, + "logps/chosen": -268.4525451660156, + "logps/rejected": -307.92486572265625, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.149336814880371, + "rewards/margins": 12.22728443145752, + "rewards/rejected": -21.376623153686523, + "step": 1160 + }, + { + "epoch": 0.95, + "learning_rate": 2.9308797127468583e-07, + "logits/chosen": -2.2586817741394043, + "logits/rejected": -1.9194952249526978, + "logps/chosen": -252.7531280517578, + "logps/rejected": -307.14959716796875, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.0676908493042, + "rewards/margins": 11.876630783081055, + "rewards/rejected": -21.944324493408203, + "step": 1170 + }, + { + "epoch": 0.95, + "learning_rate": 2.908438061041292e-07, + "logits/chosen": -2.1825761795043945, + "logits/rejected": -1.6960197687149048, + "logps/chosen": -244.33456420898438, + "logps/rejected": -325.9845886230469, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.559610366821289, + "rewards/margins": 15.206143379211426, + "rewards/rejected": -23.765756607055664, + "step": 1180 + }, + { + "epoch": 0.96, + "learning_rate": 2.885996409335727e-07, + "logits/chosen": -2.1046993732452393, + "logits/rejected": -1.3575652837753296, + "logps/chosen": -246.7059326171875, + "logps/rejected": -306.18218994140625, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.574252128601074, + "rewards/margins": 13.792829513549805, + "rewards/rejected": -22.367084503173828, + "step": 1190 + }, + { + "epoch": 0.97, + "learning_rate": 2.8635547576301615e-07, + "logits/chosen": -2.3686792850494385, + "logits/rejected": -2.0101513862609863, + "logps/chosen": -229.9496612548828, + "logps/rejected": -290.2330322265625, + "loss": 0.0299, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.17085075378418, + "rewards/margins": 11.523832321166992, + "rewards/rejected": -19.694683074951172, + "step": 1200 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.334228992462158, + "eval_logits/rejected": -1.7745356559753418, + "eval_logps/chosen": -228.568603515625, + "eval_logps/rejected": -300.77685546875, + "eval_loss": 0.019226718693971634, + "eval_rewards/accuracies": 0.9914772510528564, + "eval_rewards/chosen": -7.173612594604492, + "eval_rewards/margins": 14.064496040344238, + "eval_rewards/rejected": -21.238107681274414, + "eval_runtime": 1235.2274, + "eval_samples_per_second": 3.395, + "eval_steps_per_second": 0.107, + "step": 1200 + }, + { + "epoch": 0.98, + "learning_rate": 2.841113105924596e-07, + "logits/chosen": -2.4672248363494873, + "logits/rejected": -1.998246431350708, + "logps/chosen": -210.8152313232422, + "logps/rejected": -304.40692138671875, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.575493812561035, + "rewards/margins": 15.416450500488281, + "rewards/rejected": -21.991945266723633, + "step": 1210 + }, + { + "epoch": 0.99, + "learning_rate": 2.81867145421903e-07, + "logits/chosen": -2.5835328102111816, + "logits/rejected": -2.1223435401916504, + "logps/chosen": -219.01547241210938, + "logps/rejected": -297.76397705078125, + "loss": 0.0208, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.9382243156433105, + "rewards/margins": 14.4376802444458, + "rewards/rejected": -20.375904083251953, + "step": 1220 + }, + { + "epoch": 0.99, + "learning_rate": 2.796229802513465e-07, + "logits/chosen": -2.5194222927093506, + "logits/rejected": -1.838174819946289, + "logps/chosen": -206.17758178710938, + "logps/rejected": -275.22369384765625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.673586368560791, + "rewards/margins": 15.025799751281738, + "rewards/rejected": -19.699384689331055, + "step": 1230 + }, + { + "epoch": 1.0, + "learning_rate": 2.773788150807899e-07, + "logits/chosen": -2.473797559738159, + "logits/rejected": -1.9282045364379883, + "logps/chosen": -218.5982208251953, + "logps/rejected": -317.567138671875, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5297956466674805, + "rewards/margins": 17.271570205688477, + "rewards/rejected": -22.80136489868164, + "step": 1240 + }, + { + "epoch": 1.01, + "learning_rate": 2.751346499102334e-07, + "logits/chosen": -2.434544086456299, + "logits/rejected": -1.733507752418518, + "logps/chosen": -195.1844482421875, + "logps/rejected": -293.07415771484375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.768883228302002, + "rewards/margins": 17.48321533203125, + "rewards/rejected": -21.252099990844727, + "step": 1250 + }, + { + "epoch": 1.02, + "learning_rate": 2.7289048473967683e-07, + "logits/chosen": -2.548470973968506, + "logits/rejected": -1.947098970413208, + "logps/chosen": -217.6623077392578, + "logps/rejected": -387.7384033203125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.762509822845459, + "rewards/margins": 22.999313354492188, + "rewards/rejected": -28.761821746826172, + "step": 1260 + }, + { + "epoch": 1.03, + "learning_rate": 2.7064631956912027e-07, + "logits/chosen": -2.5031285285949707, + "logits/rejected": -1.7813794612884521, + "logps/chosen": -199.9820556640625, + "logps/rejected": -301.0137634277344, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.086377143859863, + "rewards/margins": 17.737918853759766, + "rewards/rejected": -21.82429313659668, + "step": 1270 + }, + { + "epoch": 1.03, + "learning_rate": 2.684021543985637e-07, + "logits/chosen": -2.382187843322754, + "logits/rejected": -1.5963821411132812, + "logps/chosen": -187.2418212890625, + "logps/rejected": -264.92486572265625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.647365570068359, + "rewards/margins": 14.429998397827148, + "rewards/rejected": -19.07736587524414, + "step": 1280 + }, + { + "epoch": 1.04, + "learning_rate": 2.661579892280072e-07, + "logits/chosen": -2.4244916439056396, + "logits/rejected": -1.8657194375991821, + "logps/chosen": -174.408935546875, + "logps/rejected": -270.9884033203125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.466989517211914, + "rewards/margins": 16.325471878051758, + "rewards/rejected": -19.792461395263672, + "step": 1290 + }, + { + "epoch": 1.05, + "learning_rate": 2.6391382405745063e-07, + "logits/chosen": -2.4733824729919434, + "logits/rejected": -1.8393371105194092, + "logps/chosen": -205.424560546875, + "logps/rejected": -316.391357421875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.587893486022949, + "rewards/margins": 17.766637802124023, + "rewards/rejected": -22.354534149169922, + "step": 1300 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.3594300746917725, + "eval_logits/rejected": -1.614759922027588, + "eval_logps/chosen": -194.88035583496094, + "eval_logps/rejected": -305.9737243652344, + "eval_loss": 0.018366051837801933, + "eval_rewards/accuracies": 0.9914772510528564, + "eval_rewards/chosen": -3.8047878742218018, + "eval_rewards/margins": 17.9530086517334, + "eval_rewards/rejected": -21.757795333862305, + "eval_runtime": 1239.8518, + "eval_samples_per_second": 3.383, + "eval_steps_per_second": 0.106, + "step": 1300 + }, + { + "epoch": 1.06, + "learning_rate": 2.6166965888689407e-07, + "logits/chosen": -2.5149292945861816, + "logits/rejected": -1.7752535343170166, + "logps/chosen": -203.27230834960938, + "logps/rejected": -290.1390380859375, + "loss": 0.0076, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.055700778961182, + "rewards/margins": 15.912434577941895, + "rewards/rejected": -19.968135833740234, + "step": 1310 + }, + { + "epoch": 1.07, + "learning_rate": 2.594254937163375e-07, + "logits/chosen": -2.505140781402588, + "logits/rejected": -1.835667610168457, + "logps/chosen": -190.5753936767578, + "logps/rejected": -304.220703125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.295541763305664, + "rewards/margins": 17.55819320678711, + "rewards/rejected": -20.853734970092773, + "step": 1320 + }, + { + "epoch": 1.07, + "learning_rate": 2.5718132854578095e-07, + "logits/chosen": -2.5665476322174072, + "logits/rejected": -2.0295228958129883, + "logps/chosen": -188.3972625732422, + "logps/rejected": -289.68792724609375, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.316233158111572, + "rewards/margins": 15.678323745727539, + "rewards/rejected": -19.99455451965332, + "step": 1330 + }, + { + "epoch": 1.08, + "learning_rate": 2.5493716337522444e-07, + "logits/chosen": -2.3629024028778076, + "logits/rejected": -1.591646432876587, + "logps/chosen": -200.7397918701172, + "logps/rejected": -300.5146179199219, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.494296550750732, + "rewards/margins": 16.3477840423584, + "rewards/rejected": -21.842082977294922, + "step": 1340 + }, + { + "epoch": 1.09, + "learning_rate": 2.526929982046678e-07, + "logits/chosen": -2.3346006870269775, + "logits/rejected": -1.5865734815597534, + "logps/chosen": -213.29776000976562, + "logps/rejected": -310.63690185546875, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.321459770202637, + "rewards/margins": 16.042346954345703, + "rewards/rejected": -22.363807678222656, + "step": 1350 + }, + { + "epoch": 1.1, + "learning_rate": 2.504488330341113e-07, + "logits/chosen": -2.5068516731262207, + "logits/rejected": -1.9767701625823975, + "logps/chosen": -222.1580352783203, + "logps/rejected": -315.2189636230469, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.667050838470459, + "rewards/margins": 16.236698150634766, + "rewards/rejected": -22.903751373291016, + "step": 1360 + }, + { + "epoch": 1.11, + "learning_rate": 2.4820466786355475e-07, + "logits/chosen": -2.4731431007385254, + "logits/rejected": -1.8346115350723267, + "logps/chosen": -233.3339080810547, + "logps/rejected": -343.2152404785156, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.260836601257324, + "rewards/margins": 18.331096649169922, + "rewards/rejected": -25.591934204101562, + "step": 1370 + }, + { + "epoch": 1.11, + "learning_rate": 2.459605026929982e-07, + "logits/chosen": -2.3692972660064697, + "logits/rejected": -1.6732375621795654, + "logps/chosen": -191.49008178710938, + "logps/rejected": -344.28155517578125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.415878772735596, + "rewards/margins": 22.362245559692383, + "rewards/rejected": -26.778125762939453, + "step": 1380 + }, + { + "epoch": 1.12, + "learning_rate": 2.437163375224416e-07, + "logits/chosen": -2.1915714740753174, + "logits/rejected": -1.4661176204681396, + "logps/chosen": -204.90560913085938, + "logps/rejected": -348.9564208984375, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.140778064727783, + "rewards/margins": 20.88172149658203, + "rewards/rejected": -27.022502899169922, + "step": 1390 + }, + { + "epoch": 1.13, + "learning_rate": 2.4147217235188506e-07, + "logits/chosen": -2.450209140777588, + "logits/rejected": -1.7828893661499023, + "logps/chosen": -212.88232421875, + "logps/rejected": -305.83319091796875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.971795558929443, + "rewards/margins": 16.09402847290039, + "rewards/rejected": -22.06582260131836, + "step": 1400 + }, + { + "epoch": 1.13, + "eval_logits/chosen": -2.330862283706665, + "eval_logits/rejected": -1.6351375579833984, + "eval_logps/chosen": -216.05340576171875, + "eval_logps/rejected": -327.8304138183594, + "eval_loss": 0.015285570174455643, + "eval_rewards/accuracies": 0.9905303120613098, + "eval_rewards/chosen": -5.922094345092773, + "eval_rewards/margins": 18.021373748779297, + "eval_rewards/rejected": -23.943464279174805, + "eval_runtime": 1271.2137, + "eval_samples_per_second": 3.299, + "eval_steps_per_second": 0.104, + "step": 1400 + }, + { + "epoch": 1.14, + "learning_rate": 2.3922800718132855e-07, + "logits/chosen": -2.440065622329712, + "logits/rejected": -1.7403312921524048, + "logps/chosen": -210.8866729736328, + "logps/rejected": -344.93450927734375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.708631992340088, + "rewards/margins": 20.705875396728516, + "rewards/rejected": -25.414505004882812, + "step": 1410 + }, + { + "epoch": 1.15, + "learning_rate": 2.36983842010772e-07, + "logits/chosen": -2.418907403945923, + "logits/rejected": -1.7396522760391235, + "logps/chosen": -207.55419921875, + "logps/rejected": -323.6063537597656, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0609307289123535, + "rewards/margins": 18.32230567932129, + "rewards/rejected": -23.383235931396484, + "step": 1420 + }, + { + "epoch": 1.16, + "learning_rate": 2.3473967684021543e-07, + "logits/chosen": -2.5781989097595215, + "logits/rejected": -1.9527013301849365, + "logps/chosen": -209.34481811523438, + "logps/rejected": -323.6206970214844, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.36617374420166, + "rewards/margins": 17.80306053161621, + "rewards/rejected": -23.169233322143555, + "step": 1430 + }, + { + "epoch": 1.16, + "learning_rate": 2.324955116696589e-07, + "logits/chosen": -2.358539342880249, + "logits/rejected": -1.7326939105987549, + "logps/chosen": -214.9184112548828, + "logps/rejected": -316.4329528808594, + "loss": 0.0047, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.85650634765625, + "rewards/margins": 17.366939544677734, + "rewards/rejected": -23.223445892333984, + "step": 1440 + }, + { + "epoch": 1.17, + "learning_rate": 2.3025134649910233e-07, + "logits/chosen": -2.378484010696411, + "logits/rejected": -1.59342622756958, + "logps/chosen": -201.44468688964844, + "logps/rejected": -304.1047058105469, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.023573875427246, + "rewards/margins": 18.577011108398438, + "rewards/rejected": -22.6005859375, + "step": 1450 + }, + { + "epoch": 1.18, + "learning_rate": 2.2800718132854577e-07, + "logits/chosen": -2.356367349624634, + "logits/rejected": -1.489546537399292, + "logps/chosen": -197.93316650390625, + "logps/rejected": -334.6156311035156, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.114386081695557, + "rewards/margins": 20.04515266418457, + "rewards/rejected": -25.15953826904297, + "step": 1460 + }, + { + "epoch": 1.19, + "learning_rate": 2.257630161579892e-07, + "logits/chosen": -2.608461856842041, + "logits/rejected": -1.9460254907608032, + "logps/chosen": -228.8014373779297, + "logps/rejected": -356.54974365234375, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.495268821716309, + "rewards/margins": 21.43979835510254, + "rewards/rejected": -25.935068130493164, + "step": 1470 + }, + { + "epoch": 1.2, + "learning_rate": 2.2351885098743267e-07, + "logits/chosen": -2.4536807537078857, + "logits/rejected": -1.6814321279525757, + "logps/chosen": -202.04776000976562, + "logps/rejected": -324.0910339355469, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.8585405349731445, + "rewards/margins": 18.24228286743164, + "rewards/rejected": -24.10082244873047, + "step": 1480 + }, + { + "epoch": 1.2, + "learning_rate": 2.212746858168761e-07, + "logits/chosen": -2.608203172683716, + "logits/rejected": -2.0539937019348145, + "logps/chosen": -193.1971893310547, + "logps/rejected": -306.14447021484375, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.94258189201355, + "rewards/margins": 17.872577667236328, + "rewards/rejected": -21.81515884399414, + "step": 1490 + }, + { + "epoch": 1.21, + "learning_rate": 2.1903052064631955e-07, + "logits/chosen": -2.5761284828186035, + "logits/rejected": -1.8783142566680908, + "logps/chosen": -175.76541137695312, + "logps/rejected": -371.50335693359375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3101706504821777, + "rewards/margins": 26.184829711914062, + "rewards/rejected": -28.4950008392334, + "step": 1500 + }, + { + "epoch": 1.21, + "eval_logits/chosen": -2.4214093685150146, + "eval_logits/rejected": -1.5711694955825806, + "eval_logps/chosen": -191.88214111328125, + "eval_logps/rejected": -320.6025085449219, + "eval_loss": 0.01659109815955162, + "eval_rewards/accuracies": 0.9905303120613098, + "eval_rewards/chosen": -3.5049686431884766, + "eval_rewards/margins": 19.71570587158203, + "eval_rewards/rejected": -23.220672607421875, + "eval_runtime": 1256.2696, + "eval_samples_per_second": 3.338, + "eval_steps_per_second": 0.105, + "step": 1500 + }, + { + "epoch": 1.22, + "learning_rate": 2.16786355475763e-07, + "logits/chosen": -2.416098117828369, + "logits/rejected": -1.2447493076324463, + "logps/chosen": -192.24453735351562, + "logps/rejected": -302.9978942871094, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.075081825256348, + "rewards/margins": 17.91281509399414, + "rewards/rejected": -21.987897872924805, + "step": 1510 + }, + { + "epoch": 1.23, + "learning_rate": 2.1454219030520645e-07, + "logits/chosen": -2.4774982929229736, + "logits/rejected": -1.5303045511245728, + "logps/chosen": -205.1507110595703, + "logps/rejected": -321.3560791015625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.174956798553467, + "rewards/margins": 18.381664276123047, + "rewards/rejected": -22.556623458862305, + "step": 1520 + }, + { + "epoch": 1.24, + "learning_rate": 2.122980251346499e-07, + "logits/chosen": -2.4611382484436035, + "logits/rejected": -1.158782720565796, + "logps/chosen": -192.5150909423828, + "logps/rejected": -322.80426025390625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.329662799835205, + "rewards/margins": 19.829822540283203, + "rewards/rejected": -24.159486770629883, + "step": 1530 + }, + { + "epoch": 1.24, + "learning_rate": 2.1005385996409335e-07, + "logits/chosen": -2.4676074981689453, + "logits/rejected": -1.0838234424591064, + "logps/chosen": -189.53555297851562, + "logps/rejected": -301.4139404296875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8323395252227783, + "rewards/margins": 18.586456298828125, + "rewards/rejected": -22.41879653930664, + "step": 1540 + }, + { + "epoch": 1.25, + "learning_rate": 2.078096947935368e-07, + "logits/chosen": -2.3849031925201416, + "logits/rejected": -1.349827527999878, + "logps/chosen": -196.45498657226562, + "logps/rejected": -310.68255615234375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.405018329620361, + "rewards/margins": 17.249290466308594, + "rewards/rejected": -22.654308319091797, + "step": 1550 + }, + { + "epoch": 1.26, + "learning_rate": 2.0556552962298023e-07, + "logits/chosen": -2.3526644706726074, + "logits/rejected": -1.3514336347579956, + "logps/chosen": -216.0058135986328, + "logps/rejected": -314.02386474609375, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.721417427062988, + "rewards/margins": 17.36629867553711, + "rewards/rejected": -23.087717056274414, + "step": 1560 + }, + { + "epoch": 1.27, + "learning_rate": 2.0332136445242366e-07, + "logits/chosen": -2.366161346435547, + "logits/rejected": -1.2069361209869385, + "logps/chosen": -198.00723266601562, + "logps/rejected": -331.432373046875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.333505153656006, + "rewards/margins": 20.263362884521484, + "rewards/rejected": -24.596866607666016, + "step": 1570 + }, + { + "epoch": 1.28, + "learning_rate": 2.0107719928186716e-07, + "logits/chosen": -2.53715443611145, + "logits/rejected": -1.3027998208999634, + "logps/chosen": -199.01109313964844, + "logps/rejected": -300.29583740234375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.91579008102417, + "rewards/margins": 17.468521118164062, + "rewards/rejected": -21.38431167602539, + "step": 1580 + }, + { + "epoch": 1.28, + "learning_rate": 1.988330341113106e-07, + "logits/chosen": -2.4570298194885254, + "logits/rejected": -1.2619421482086182, + "logps/chosen": -186.6202850341797, + "logps/rejected": -309.71478271484375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8402743339538574, + "rewards/margins": 19.227161407470703, + "rewards/rejected": -23.06743621826172, + "step": 1590 + }, + { + "epoch": 1.29, + "learning_rate": 1.9658886894075403e-07, + "logits/chosen": -2.170544385910034, + "logits/rejected": -1.1056536436080933, + "logps/chosen": -184.45266723632812, + "logps/rejected": -331.6368713378906, + "loss": 0.0055, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9242756366729736, + "rewards/margins": 21.46536636352539, + "rewards/rejected": -25.389638900756836, + "step": 1600 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -2.2401058673858643, + "eval_logits/rejected": -1.0331358909606934, + "eval_logps/chosen": -206.0149383544922, + "eval_logps/rejected": -337.9579162597656, + "eval_loss": 0.015002009458839893, + "eval_rewards/accuracies": 0.9895833134651184, + "eval_rewards/chosen": -4.918245792388916, + "eval_rewards/margins": 20.03797149658203, + "eval_rewards/rejected": -24.956214904785156, + "eval_runtime": 1271.1267, + "eval_samples_per_second": 3.299, + "eval_steps_per_second": 0.104, + "step": 1600 + }, + { + "epoch": 1.3, + "learning_rate": 1.943447037701975e-07, + "logits/chosen": -2.2028968334198, + "logits/rejected": -1.331644892692566, + "logps/chosen": -199.48641967773438, + "logps/rejected": -402.6153259277344, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.753117561340332, + "rewards/margins": 25.99526023864746, + "rewards/rejected": -30.748376846313477, + "step": 1610 + }, + { + "epoch": 1.31, + "learning_rate": 1.9210053859964093e-07, + "logits/chosen": -2.521888017654419, + "logits/rejected": -1.4896118640899658, + "logps/chosen": -214.9086151123047, + "logps/rejected": -331.5408020019531, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.080290794372559, + "rewards/margins": 18.44835090637207, + "rewards/rejected": -23.528640747070312, + "step": 1620 + }, + { + "epoch": 1.32, + "learning_rate": 1.8985637342908437e-07, + "logits/chosen": -2.550525665283203, + "logits/rejected": -1.4970852136611938, + "logps/chosen": -215.11196899414062, + "logps/rejected": -330.1084899902344, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6528236865997314, + "rewards/margins": 19.67332649230957, + "rewards/rejected": -23.32615089416504, + "step": 1630 + }, + { + "epoch": 1.32, + "learning_rate": 1.8761220825852784e-07, + "logits/chosen": -2.4502251148223877, + "logits/rejected": -1.2628605365753174, + "logps/chosen": -170.15115356445312, + "logps/rejected": -309.0227355957031, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3570239543914795, + "rewards/margins": 19.349599838256836, + "rewards/rejected": -22.706623077392578, + "step": 1640 + }, + { + "epoch": 1.33, + "learning_rate": 1.8536804308797127e-07, + "logits/chosen": -2.4148476123809814, + "logits/rejected": -1.19232177734375, + "logps/chosen": -172.6577911376953, + "logps/rejected": -316.9449157714844, + "loss": 0.013, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6390812397003174, + "rewards/margins": 20.73019027709961, + "rewards/rejected": -23.369266510009766, + "step": 1650 + }, + { + "epoch": 1.34, + "learning_rate": 1.831238779174147e-07, + "logits/chosen": -2.359056234359741, + "logits/rejected": -1.4545716047286987, + "logps/chosen": -181.18606567382812, + "logps/rejected": -513.1529541015625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3970236778259277, + "rewards/margins": 39.41077423095703, + "rewards/rejected": -42.807796478271484, + "step": 1660 + }, + { + "epoch": 1.35, + "learning_rate": 1.8087971274685815e-07, + "logits/chosen": -2.466386079788208, + "logits/rejected": -1.2338342666625977, + "logps/chosen": -186.37802124023438, + "logps/rejected": -360.255859375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.791513442993164, + "rewards/margins": 23.790407180786133, + "rewards/rejected": -27.581920623779297, + "step": 1670 + }, + { + "epoch": 1.36, + "learning_rate": 1.7863554757630161e-07, + "logits/chosen": -2.4585659503936768, + "logits/rejected": -1.4333058595657349, + "logps/chosen": -186.0174560546875, + "logps/rejected": -344.93304443359375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2950263023376465, + "rewards/margins": 22.118741989135742, + "rewards/rejected": -25.413768768310547, + "step": 1680 + }, + { + "epoch": 1.37, + "learning_rate": 1.7639138240574505e-07, + "logits/chosen": -2.3401618003845215, + "logits/rejected": -1.1980568170547485, + "logps/chosen": -195.11424255371094, + "logps/rejected": -333.1010437011719, + "loss": 0.0106, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.817376613616943, + "rewards/margins": 20.051599502563477, + "rewards/rejected": -24.868976593017578, + "step": 1690 + }, + { + "epoch": 1.37, + "learning_rate": 1.741472172351885e-07, + "logits/chosen": -2.2132911682128906, + "logits/rejected": -1.2666276693344116, + "logps/chosen": -208.95407104492188, + "logps/rejected": -350.6565856933594, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.166562557220459, + "rewards/margins": 19.45525550842285, + "rewards/rejected": -26.6218204498291, + "step": 1700 + }, + { + "epoch": 1.37, + "eval_logits/chosen": -2.1886727809906006, + "eval_logits/rejected": -0.9176675081253052, + "eval_logps/chosen": -214.611328125, + "eval_logps/rejected": -346.32073974609375, + "eval_loss": 0.015112359076738358, + "eval_rewards/accuracies": 0.9895833134651184, + "eval_rewards/chosen": -5.777885437011719, + "eval_rewards/margins": 20.014610290527344, + "eval_rewards/rejected": -25.792495727539062, + "eval_runtime": 1210.4017, + "eval_samples_per_second": 3.465, + "eval_steps_per_second": 0.109, + "step": 1700 + }, + { + "epoch": 1.38, + "learning_rate": 1.7190305206463195e-07, + "logits/chosen": -2.367215633392334, + "logits/rejected": -1.2320685386657715, + "logps/chosen": -220.96273803710938, + "logps/rejected": -338.31597900390625, + "loss": 0.0139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.964104175567627, + "rewards/margins": 19.60570526123047, + "rewards/rejected": -24.569808959960938, + "step": 1710 + }, + { + "epoch": 1.39, + "learning_rate": 1.696588868940754e-07, + "logits/chosen": -2.187542200088501, + "logits/rejected": -0.8695700764656067, + "logps/chosen": -228.68148803710938, + "logps/rejected": -361.9653625488281, + "loss": 0.0112, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.120731353759766, + "rewards/margins": 19.401437759399414, + "rewards/rejected": -26.522167205810547, + "step": 1720 + }, + { + "epoch": 1.4, + "learning_rate": 1.6741472172351883e-07, + "logits/chosen": -2.1448841094970703, + "logits/rejected": -0.90544193983078, + "logps/chosen": -231.8255615234375, + "logps/rejected": -344.2624816894531, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.548605442047119, + "rewards/margins": 18.394935607910156, + "rewards/rejected": -25.94354248046875, + "step": 1730 + }, + { + "epoch": 1.41, + "learning_rate": 1.651705565529623e-07, + "logits/chosen": -2.144925594329834, + "logits/rejected": -0.7779609560966492, + "logps/chosen": -213.4037322998047, + "logps/rejected": -429.9497985839844, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.452700614929199, + "rewards/margins": 27.562122344970703, + "rewards/rejected": -34.01482391357422, + "step": 1740 + }, + { + "epoch": 1.41, + "learning_rate": 1.6292639138240573e-07, + "logits/chosen": -2.4616587162017822, + "logits/rejected": -1.16438627243042, + "logps/chosen": -198.54013061523438, + "logps/rejected": -349.3918151855469, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.990466356277466, + "rewards/margins": 21.930543899536133, + "rewards/rejected": -25.921010971069336, + "step": 1750 + }, + { + "epoch": 1.42, + "learning_rate": 1.6068222621184917e-07, + "logits/chosen": -2.3011021614074707, + "logits/rejected": -1.1753021478652954, + "logps/chosen": -191.2627410888672, + "logps/rejected": -369.66119384765625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.050785779953003, + "rewards/margins": 24.80243492126465, + "rewards/rejected": -27.853219985961914, + "step": 1760 + }, + { + "epoch": 1.43, + "learning_rate": 1.5843806104129263e-07, + "logits/chosen": -2.3033337593078613, + "logits/rejected": -0.918114185333252, + "logps/chosen": -204.9922332763672, + "logps/rejected": -372.69207763671875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.822578430175781, + "rewards/margins": 22.94754409790039, + "rewards/rejected": -27.77012062072754, + "step": 1770 + }, + { + "epoch": 1.44, + "learning_rate": 1.561938958707361e-07, + "logits/chosen": -2.4193952083587646, + "logits/rejected": -1.1801656484603882, + "logps/chosen": -207.52401733398438, + "logps/rejected": -339.1822814941406, + "loss": 0.0096, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.9387006759643555, + "rewards/margins": 19.72007179260254, + "rewards/rejected": -24.658771514892578, + "step": 1780 + }, + { + "epoch": 1.45, + "learning_rate": 1.5394973070017954e-07, + "logits/chosen": -2.530266523361206, + "logits/rejected": -1.4491612911224365, + "logps/chosen": -194.62411499023438, + "logps/rejected": -333.04400634765625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1551432609558105, + "rewards/margins": 21.45151710510254, + "rewards/rejected": -23.606660842895508, + "step": 1790 + }, + { + "epoch": 1.45, + "learning_rate": 1.5170556552962297e-07, + "logits/chosen": -1.9080320596694946, + "logits/rejected": -0.5136507153511047, + "logps/chosen": -191.48056030273438, + "logps/rejected": -361.7966003417969, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7912349700927734, + "rewards/margins": 23.479774475097656, + "rewards/rejected": -27.271011352539062, + "step": 1800 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -1.991869330406189, + "eval_logits/rejected": -0.4171099066734314, + "eval_logps/chosen": -206.48728942871094, + "eval_logps/rejected": -355.65594482421875, + "eval_loss": 0.017375623807311058, + "eval_rewards/accuracies": 0.9886363744735718, + "eval_rewards/chosen": -4.965482711791992, + "eval_rewards/margins": 21.76053237915039, + "eval_rewards/rejected": -26.726016998291016, + "eval_runtime": 1182.528, + "eval_samples_per_second": 3.547, + "eval_steps_per_second": 0.112, + "step": 1800 + }, + { + "epoch": 1.46, + "learning_rate": 1.4946140035906644e-07, + "logits/chosen": -1.9850527048110962, + "logits/rejected": -0.7070460915565491, + "logps/chosen": -215.40475463867188, + "logps/rejected": -348.0308837890625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.531234264373779, + "rewards/margins": 20.979633331298828, + "rewards/rejected": -25.510868072509766, + "step": 1810 + }, + { + "epoch": 1.47, + "learning_rate": 1.4721723518850988e-07, + "logits/chosen": -2.2196340560913086, + "logits/rejected": -0.7297879457473755, + "logps/chosen": -204.4512481689453, + "logps/rejected": -348.6492614746094, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.928321838378906, + "rewards/margins": 21.583662033081055, + "rewards/rejected": -26.511981964111328, + "step": 1820 + }, + { + "epoch": 1.48, + "learning_rate": 1.449730700179533e-07, + "logits/chosen": -2.542065382003784, + "logits/rejected": -1.3664993047714233, + "logps/chosen": -231.4185028076172, + "logps/rejected": -358.8476257324219, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.564633846282959, + "rewards/margins": 20.477113723754883, + "rewards/rejected": -26.041748046875, + "step": 1830 + }, + { + "epoch": 1.49, + "learning_rate": 1.4272890484739678e-07, + "logits/chosen": -2.0507869720458984, + "logits/rejected": -0.6834365129470825, + "logps/chosen": -207.55062866210938, + "logps/rejected": -345.182861328125, + "loss": 0.0134, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.903302192687988, + "rewards/margins": 19.341928482055664, + "rewards/rejected": -26.2452335357666, + "step": 1840 + }, + { + "epoch": 1.49, + "learning_rate": 1.4048473967684022e-07, + "logits/chosen": -1.942055344581604, + "logits/rejected": -0.7990915179252625, + "logps/chosen": -249.50405883789062, + "logps/rejected": -370.90362548828125, + "loss": 0.0074, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.648567199707031, + "rewards/margins": 17.695268630981445, + "rewards/rejected": -27.343835830688477, + "step": 1850 + }, + { + "epoch": 1.5, + "learning_rate": 1.3824057450628365e-07, + "logits/chosen": -1.8192236423492432, + "logits/rejected": -0.495781272649765, + "logps/chosen": -243.5706024169922, + "logps/rejected": -350.90179443359375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.485081672668457, + "rewards/margins": 17.48373031616211, + "rewards/rejected": -26.96881103515625, + "step": 1860 + }, + { + "epoch": 1.51, + "learning_rate": 1.359964093357271e-07, + "logits/chosen": -1.6976194381713867, + "logits/rejected": -0.23302654922008514, + "logps/chosen": -231.98208618164062, + "logps/rejected": -374.8025207519531, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.111143112182617, + "rewards/margins": 21.15093994140625, + "rewards/rejected": -29.2620849609375, + "step": 1870 + }, + { + "epoch": 1.52, + "learning_rate": 1.3375224416517056e-07, + "logits/chosen": -2.1988472938537598, + "logits/rejected": -1.1178010702133179, + "logps/chosen": -256.80126953125, + "logps/rejected": -412.605224609375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.1202974319458, + "rewards/margins": 22.89120101928711, + "rewards/rejected": -31.01149559020996, + "step": 1880 + }, + { + "epoch": 1.53, + "learning_rate": 1.31508078994614e-07, + "logits/chosen": -2.1401596069335938, + "logits/rejected": -0.85423743724823, + "logps/chosen": -244.6685028076172, + "logps/rejected": -387.1705322265625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.540109157562256, + "rewards/margins": 21.48582649230957, + "rewards/rejected": -29.02593421936035, + "step": 1890 + }, + { + "epoch": 1.53, + "learning_rate": 1.2926391382405743e-07, + "logits/chosen": -2.326422929763794, + "logits/rejected": -1.1414649486541748, + "logps/chosen": -231.3490447998047, + "logps/rejected": -349.94415283203125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.250179290771484, + "rewards/margins": 18.434165954589844, + "rewards/rejected": -25.684345245361328, + "step": 1900 + }, + { + "epoch": 1.53, + "eval_logits/chosen": -2.1291425228118896, + "eval_logits/rejected": -0.7782645225524902, + "eval_logps/chosen": -220.05587768554688, + "eval_logps/rejected": -363.53216552734375, + "eval_loss": 0.015889223664999008, + "eval_rewards/accuracies": 0.9886363744735718, + "eval_rewards/chosen": -6.32234001159668, + "eval_rewards/margins": 21.191301345825195, + "eval_rewards/rejected": -27.513641357421875, + "eval_runtime": 1266.6258, + "eval_samples_per_second": 3.311, + "eval_steps_per_second": 0.104, + "step": 1900 + }, + { + "epoch": 1.54, + "learning_rate": 1.270197486535009e-07, + "logits/chosen": -2.249268054962158, + "logits/rejected": -0.5687516927719116, + "logps/chosen": -205.4871063232422, + "logps/rejected": -372.8687438964844, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.509349346160889, + "rewards/margins": 24.464649200439453, + "rewards/rejected": -28.974002838134766, + "step": 1910 + }, + { + "epoch": 1.55, + "learning_rate": 1.2477558348294433e-07, + "logits/chosen": -2.2692904472351074, + "logits/rejected": -0.4875977039337158, + "logps/chosen": -199.8176727294922, + "logps/rejected": -352.04974365234375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.211134910583496, + "rewards/margins": 22.314863204956055, + "rewards/rejected": -27.525997161865234, + "step": 1920 + }, + { + "epoch": 1.56, + "learning_rate": 1.2253141831238777e-07, + "logits/chosen": -2.4849350452423096, + "logits/rejected": -1.0505064725875854, + "logps/chosen": -222.5237274169922, + "logps/rejected": -383.90478515625, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.392744541168213, + "rewards/margins": 22.870044708251953, + "rewards/rejected": -28.26279067993164, + "step": 1930 + }, + { + "epoch": 1.57, + "learning_rate": 1.2028725314183123e-07, + "logits/chosen": -2.640073537826538, + "logits/rejected": -1.7355175018310547, + "logps/chosen": -195.56324768066406, + "logps/rejected": -355.7330322265625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6881299018859863, + "rewards/margins": 23.08262062072754, + "rewards/rejected": -26.770748138427734, + "step": 1940 + }, + { + "epoch": 1.58, + "learning_rate": 1.1804308797127469e-07, + "logits/chosen": -2.6067564487457275, + "logits/rejected": -1.6765193939208984, + "logps/chosen": -213.5617218017578, + "logps/rejected": -325.18896484375, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.099620342254639, + "rewards/margins": 19.22537612915039, + "rewards/rejected": -23.324995040893555, + "step": 1950 + }, + { + "epoch": 1.58, + "learning_rate": 1.1579892280071812e-07, + "logits/chosen": -2.3989176750183105, + "logits/rejected": -1.4806644916534424, + "logps/chosen": -206.7233428955078, + "logps/rejected": -369.6064453125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.540416717529297, + "rewards/margins": 22.4031982421875, + "rewards/rejected": -27.943614959716797, + "step": 1960 + }, + { + "epoch": 1.59, + "learning_rate": 1.1355475763016157e-07, + "logits/chosen": -2.491964817047119, + "logits/rejected": -1.6051578521728516, + "logps/chosen": -225.3629913330078, + "logps/rejected": -358.2679443359375, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.35553503036499, + "rewards/margins": 20.502544403076172, + "rewards/rejected": -26.858078002929688, + "step": 1970 + }, + { + "epoch": 1.6, + "learning_rate": 1.1131059245960501e-07, + "logits/chosen": -2.5904030799865723, + "logits/rejected": -1.9738000631332397, + "logps/chosen": -216.9815216064453, + "logps/rejected": -326.6809387207031, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.408628940582275, + "rewards/margins": 18.634952545166016, + "rewards/rejected": -23.043582916259766, + "step": 1980 + }, + { + "epoch": 1.61, + "learning_rate": 1.0906642728904846e-07, + "logits/chosen": -2.480057954788208, + "logits/rejected": -1.7685632705688477, + "logps/chosen": -187.53213500976562, + "logps/rejected": -313.3226318359375, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.598074436187744, + "rewards/margins": 18.888193130493164, + "rewards/rejected": -22.486263275146484, + "step": 1990 + }, + { + "epoch": 1.62, + "learning_rate": 1.0682226211849191e-07, + "logits/chosen": -2.4661288261413574, + "logits/rejected": -1.7218126058578491, + "logps/chosen": -199.00552368164062, + "logps/rejected": -316.6554260253906, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.300858497619629, + "rewards/margins": 18.68948745727539, + "rewards/rejected": -22.990346908569336, + "step": 2000 + }, + { + "epoch": 1.62, + "eval_logits/chosen": -2.390958547592163, + "eval_logits/rejected": -1.6402666568756104, + "eval_logps/chosen": -205.94131469726562, + "eval_logps/rejected": -328.49603271484375, + "eval_loss": 0.014796514995396137, + "eval_rewards/accuracies": 0.9905303120613098, + "eval_rewards/chosen": -4.910885334014893, + "eval_rewards/margins": 19.099140167236328, + "eval_rewards/rejected": -24.010025024414062, + "eval_runtime": 1255.4968, + "eval_samples_per_second": 3.341, + "eval_steps_per_second": 0.105, + "step": 2000 + }, + { + "epoch": 1.62, + "learning_rate": 1.0457809694793537e-07, + "logits/chosen": -2.472365617752075, + "logits/rejected": -1.8317763805389404, + "logps/chosen": -204.76864624023438, + "logps/rejected": -327.6580505371094, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.449965000152588, + "rewards/margins": 19.671606063842773, + "rewards/rejected": -24.121570587158203, + "step": 2010 + }, + { + "epoch": 1.63, + "learning_rate": 1.0233393177737882e-07, + "logits/chosen": -2.458085775375366, + "logits/rejected": -1.768319845199585, + "logps/chosen": -214.52517700195312, + "logps/rejected": -336.6673278808594, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.354293346405029, + "rewards/margins": 19.74245834350586, + "rewards/rejected": -24.09675407409668, + "step": 2020 + }, + { + "epoch": 1.64, + "learning_rate": 1.0008976660682225e-07, + "logits/chosen": -2.5455336570739746, + "logits/rejected": -1.869773268699646, + "logps/chosen": -208.6695556640625, + "logps/rejected": -335.33270263671875, + "loss": 0.0167, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.887508392333984, + "rewards/margins": 19.274761199951172, + "rewards/rejected": -24.162269592285156, + "step": 2030 + }, + { + "epoch": 1.65, + "learning_rate": 9.78456014362657e-08, + "logits/chosen": -2.4698987007141113, + "logits/rejected": -1.7299079895019531, + "logps/chosen": -188.715087890625, + "logps/rejected": -331.55303955078125, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.583217144012451, + "rewards/margins": 19.68210792541504, + "rewards/rejected": -24.265323638916016, + "step": 2040 + }, + { + "epoch": 1.66, + "learning_rate": 9.560143626570916e-08, + "logits/chosen": -2.5610883235931396, + "logits/rejected": -1.7795727252960205, + "logps/chosen": -224.88192749023438, + "logps/rejected": -361.1158752441406, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0886101722717285, + "rewards/margins": 21.445255279541016, + "rewards/rejected": -26.533864974975586, + "step": 2050 + }, + { + "epoch": 1.66, + "learning_rate": 9.33572710951526e-08, + "logits/chosen": -2.4338595867156982, + "logits/rejected": -1.80299973487854, + "logps/chosen": -209.2632598876953, + "logps/rejected": -339.7546691894531, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.963757038116455, + "rewards/margins": 18.653308868408203, + "rewards/rejected": -24.617063522338867, + "step": 2060 + }, + { + "epoch": 1.67, + "learning_rate": 9.111310592459605e-08, + "logits/chosen": -2.518695116043091, + "logits/rejected": -1.8007400035858154, + "logps/chosen": -203.3430633544922, + "logps/rejected": -340.86663818359375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8816304206848145, + "rewards/margins": 19.663654327392578, + "rewards/rejected": -24.545284271240234, + "step": 2070 + }, + { + "epoch": 1.68, + "learning_rate": 8.886894075403948e-08, + "logits/chosen": -2.5152430534362793, + "logits/rejected": -1.7658584117889404, + "logps/chosen": -184.61566162109375, + "logps/rejected": -323.6172790527344, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7919273376464844, + "rewards/margins": 20.03896713256836, + "rewards/rejected": -23.83089828491211, + "step": 2080 + }, + { + "epoch": 1.69, + "learning_rate": 8.662477558348293e-08, + "logits/chosen": -2.50066876411438, + "logits/rejected": -1.682350516319275, + "logps/chosen": -182.88999938964844, + "logps/rejected": -353.8326416015625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.97038197517395, + "rewards/margins": 23.050928115844727, + "rewards/rejected": -26.021310806274414, + "step": 2090 + }, + { + "epoch": 1.7, + "learning_rate": 8.43806104129264e-08, + "logits/chosen": -2.3935739994049072, + "logits/rejected": -1.5213640928268433, + "logps/chosen": -185.13040161132812, + "logps/rejected": -339.47857666015625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4466552734375, + "rewards/margins": 20.78413200378418, + "rewards/rejected": -25.23078727722168, + "step": 2100 + }, + { + "epoch": 1.7, + "eval_logits/chosen": -2.3995368480682373, + "eval_logits/rejected": -1.5789051055908203, + "eval_logps/chosen": -202.44464111328125, + "eval_logps/rejected": -341.2870788574219, + "eval_loss": 0.015234292484819889, + "eval_rewards/accuracies": 0.9886363744735718, + "eval_rewards/chosen": -4.561216354370117, + "eval_rewards/margins": 20.727914810180664, + "eval_rewards/rejected": -25.28912925720215, + "eval_runtime": 1267.1431, + "eval_samples_per_second": 3.31, + "eval_steps_per_second": 0.104, + "step": 2100 + }, + { + "epoch": 1.7, + "learning_rate": 8.213644524236984e-08, + "logits/chosen": -2.4096715450286865, + "logits/rejected": -1.6652095317840576, + "logps/chosen": -176.23236083984375, + "logps/rejected": -362.1352233886719, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6365292072296143, + "rewards/margins": 24.081058502197266, + "rewards/rejected": -27.717586517333984, + "step": 2110 + }, + { + "epoch": 1.71, + "learning_rate": 7.989228007181329e-08, + "logits/chosen": -2.5635247230529785, + "logits/rejected": -1.675135612487793, + "logps/chosen": -209.803955078125, + "logps/rejected": -331.2458801269531, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.344306468963623, + "rewards/margins": 20.84170150756836, + "rewards/rejected": -24.186006546020508, + "step": 2120 + }, + { + "epoch": 1.72, + "learning_rate": 7.764811490125673e-08, + "logits/chosen": -2.6421926021575928, + "logits/rejected": -1.7122713327407837, + "logps/chosen": -202.98875427246094, + "logps/rejected": -339.0066833496094, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.94111967086792, + "rewards/margins": 21.492115020751953, + "rewards/rejected": -25.433237075805664, + "step": 2130 + }, + { + "epoch": 1.73, + "learning_rate": 7.540394973070018e-08, + "logits/chosen": -2.581462860107422, + "logits/rejected": -2.0366358757019043, + "logps/chosen": -197.75538635253906, + "logps/rejected": -344.79547119140625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.451511859893799, + "rewards/margins": 21.68473243713379, + "rewards/rejected": -25.136241912841797, + "step": 2140 + }, + { + "epoch": 1.74, + "learning_rate": 7.315978456014363e-08, + "logits/chosen": -2.608591318130493, + "logits/rejected": -1.812015175819397, + "logps/chosen": -209.92233276367188, + "logps/rejected": -350.6752624511719, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.751372337341309, + "rewards/margins": 19.98618507385254, + "rewards/rejected": -25.7375545501709, + "step": 2150 + }, + { + "epoch": 1.74, + "learning_rate": 7.091561938958707e-08, + "logits/chosen": -2.5940194129943848, + "logits/rejected": -1.7068437337875366, + "logps/chosen": -180.03932189941406, + "logps/rejected": -299.02630615234375, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3423776626586914, + "rewards/margins": 19.381391525268555, + "rewards/rejected": -21.72376823425293, + "step": 2160 + }, + { + "epoch": 1.75, + "learning_rate": 6.867145421903052e-08, + "logits/chosen": -2.583341121673584, + "logits/rejected": -1.6264400482177734, + "logps/chosen": -178.058837890625, + "logps/rejected": -337.9113464355469, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6976494789123535, + "rewards/margins": 23.03209686279297, + "rewards/rejected": -25.729745864868164, + "step": 2170 + }, + { + "epoch": 1.76, + "learning_rate": 6.642728904847395e-08, + "logits/chosen": -2.5606331825256348, + "logits/rejected": -1.9560878276824951, + "logps/chosen": -221.374267578125, + "logps/rejected": -359.3021545410156, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.586182594299316, + "rewards/margins": 20.248546600341797, + "rewards/rejected": -24.834728240966797, + "step": 2180 + }, + { + "epoch": 1.77, + "learning_rate": 6.41831238779174e-08, + "logits/chosen": -2.543333053588867, + "logits/rejected": -1.456998586654663, + "logps/chosen": -186.70535278320312, + "logps/rejected": -339.4964294433594, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.472302436828613, + "rewards/margins": 21.189918518066406, + "rewards/rejected": -25.662221908569336, + "step": 2190 + }, + { + "epoch": 1.78, + "learning_rate": 6.193895870736086e-08, + "logits/chosen": -2.5679805278778076, + "logits/rejected": -1.4606512784957886, + "logps/chosen": -193.8790283203125, + "logps/rejected": -341.0709533691406, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9552359580993652, + "rewards/margins": 22.733787536621094, + "rewards/rejected": -25.689022064208984, + "step": 2200 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -2.3330769538879395, + "eval_logits/rejected": -1.3679535388946533, + "eval_logps/chosen": -201.9725341796875, + "eval_logps/rejected": -357.319580078125, + "eval_loss": 0.01744203269481659, + "eval_rewards/accuracies": 0.9876893758773804, + "eval_rewards/chosen": -4.514005661010742, + "eval_rewards/margins": 22.37837791442871, + "eval_rewards/rejected": -26.892383575439453, + "eval_runtime": 1232.2335, + "eval_samples_per_second": 3.404, + "eval_steps_per_second": 0.107, + "step": 2200 + }, + { + "epoch": 1.79, + "learning_rate": 5.969479353680431e-08, + "logits/chosen": -2.5256142616271973, + "logits/rejected": -1.5265940427780151, + "logps/chosen": -210.9215545654297, + "logps/rejected": -443.59136962890625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.736632347106934, + "rewards/margins": 30.950061798095703, + "rewards/rejected": -35.68669509887695, + "step": 2210 + }, + { + "epoch": 1.79, + "learning_rate": 5.745062836624775e-08, + "logits/chosen": -2.4370579719543457, + "logits/rejected": -1.5633165836334229, + "logps/chosen": -196.07791137695312, + "logps/rejected": -353.5588684082031, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.257540225982666, + "rewards/margins": 21.69769859313965, + "rewards/rejected": -26.95524024963379, + "step": 2220 + }, + { + "epoch": 1.8, + "learning_rate": 5.52064631956912e-08, + "logits/chosen": -2.447204828262329, + "logits/rejected": -1.5592924356460571, + "logps/chosen": -207.3715362548828, + "logps/rejected": -359.57183837890625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.280811309814453, + "rewards/margins": 21.324148178100586, + "rewards/rejected": -26.60495948791504, + "step": 2230 + }, + { + "epoch": 1.81, + "learning_rate": 5.296229802513465e-08, + "logits/chosen": -2.202885627746582, + "logits/rejected": -1.0684783458709717, + "logps/chosen": -198.41769409179688, + "logps/rejected": -345.6863098144531, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.47156286239624, + "rewards/margins": 21.288461685180664, + "rewards/rejected": -26.760019302368164, + "step": 2240 + }, + { + "epoch": 1.82, + "learning_rate": 5.071813285457809e-08, + "logits/chosen": -2.4340877532958984, + "logits/rejected": -1.3804821968078613, + "logps/chosen": -214.57382202148438, + "logps/rejected": -370.85333251953125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.449063301086426, + "rewards/margins": 23.452455520629883, + "rewards/rejected": -27.90151596069336, + "step": 2250 + }, + { + "epoch": 1.83, + "learning_rate": 4.8473967684021537e-08, + "logits/chosen": -2.4802422523498535, + "logits/rejected": -1.4854291677474976, + "logps/chosen": -208.07858276367188, + "logps/rejected": -364.12542724609375, + "loss": 0.0048, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.606834411621094, + "rewards/margins": 23.329484939575195, + "rewards/rejected": -27.936321258544922, + "step": 2260 + }, + { + "epoch": 1.83, + "learning_rate": 4.6229802513464994e-08, + "logits/chosen": -2.487740993499756, + "logits/rejected": -1.124255895614624, + "logps/chosen": -211.3971405029297, + "logps/rejected": -363.6300354003906, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1517462730407715, + "rewards/margins": 22.992733001708984, + "rewards/rejected": -28.14447593688965, + "step": 2270 + }, + { + "epoch": 1.84, + "learning_rate": 4.398563734290844e-08, + "logits/chosen": -2.4872403144836426, + "logits/rejected": -1.6803514957427979, + "logps/chosen": -235.8931884765625, + "logps/rejected": -385.62957763671875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.442092418670654, + "rewards/margins": 23.21182632446289, + "rewards/rejected": -28.653921127319336, + "step": 2280 + }, + { + "epoch": 1.85, + "learning_rate": 4.174147217235188e-08, + "logits/chosen": -2.3680336475372314, + "logits/rejected": -1.494185209274292, + "logps/chosen": -196.44735717773438, + "logps/rejected": -374.414306640625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.583914279937744, + "rewards/margins": 24.50788116455078, + "rewards/rejected": -29.091796875, + "step": 2290 + }, + { + "epoch": 1.86, + "learning_rate": 3.949730700179533e-08, + "logits/chosen": -2.498108386993408, + "logits/rejected": -1.5288788080215454, + "logps/chosen": -194.5970916748047, + "logps/rejected": -345.3826599121094, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.764695644378662, + "rewards/margins": 21.57973861694336, + "rewards/rejected": -26.344432830810547, + "step": 2300 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.3958795070648193, + "eval_logits/rejected": -1.5206599235534668, + "eval_logps/chosen": -201.88543701171875, + "eval_logps/rejected": -353.18450927734375, + "eval_loss": 0.016560323536396027, + "eval_rewards/accuracies": 0.9867424368858337, + "eval_rewards/chosen": -4.505298137664795, + "eval_rewards/margins": 21.973575592041016, + "eval_rewards/rejected": -26.478872299194336, + "eval_runtime": 1262.6154, + "eval_samples_per_second": 3.322, + "eval_steps_per_second": 0.105, + "step": 2300 + }, + { + "epoch": 1.87, + "learning_rate": 3.725314183123877e-08, + "logits/chosen": -2.5438284873962402, + "logits/rejected": -1.770115852355957, + "logps/chosen": -211.43588256835938, + "logps/rejected": -446.965087890625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3962349891662598, + "rewards/margins": 31.757091522216797, + "rewards/rejected": -35.15332794189453, + "step": 2310 + }, + { + "epoch": 1.87, + "learning_rate": 3.500897666068223e-08, + "logits/chosen": -2.5129778385162354, + "logits/rejected": -1.845759630203247, + "logps/chosen": -199.38876342773438, + "logps/rejected": -330.90301513671875, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.473719358444214, + "rewards/margins": 20.16348648071289, + "rewards/rejected": -23.63720703125, + "step": 2320 + }, + { + "epoch": 1.88, + "learning_rate": 3.2764811490125674e-08, + "logits/chosen": -2.3579037189483643, + "logits/rejected": -1.2947641611099243, + "logps/chosen": -165.55572509765625, + "logps/rejected": -328.0405578613281, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0216546058654785, + "rewards/margins": 21.991546630859375, + "rewards/rejected": -25.013202667236328, + "step": 2330 + }, + { + "epoch": 1.89, + "learning_rate": 3.052064631956912e-08, + "logits/chosen": -2.6092991828918457, + "logits/rejected": -1.9178035259246826, + "logps/chosen": -190.85372924804688, + "logps/rejected": -343.14385986328125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.187027931213379, + "rewards/margins": 21.32404136657715, + "rewards/rejected": -25.51106834411621, + "step": 2340 + }, + { + "epoch": 1.9, + "learning_rate": 2.8276481149012566e-08, + "logits/chosen": -2.414665699005127, + "logits/rejected": -1.4469376802444458, + "logps/chosen": -199.14303588867188, + "logps/rejected": -362.63970947265625, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.676077842712402, + "rewards/margins": 22.0654239654541, + "rewards/rejected": -27.741504669189453, + "step": 2350 + }, + { + "epoch": 1.91, + "learning_rate": 2.6032315978456014e-08, + "logits/chosen": -2.4596142768859863, + "logits/rejected": -1.5556590557098389, + "logps/chosen": -199.23048400878906, + "logps/rejected": -359.006103515625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.547817707061768, + "rewards/margins": 22.574724197387695, + "rewards/rejected": -27.122543334960938, + "step": 2360 + }, + { + "epoch": 1.91, + "learning_rate": 2.378815080789946e-08, + "logits/chosen": -2.45746111869812, + "logits/rejected": -1.676483154296875, + "logps/chosen": -196.89332580566406, + "logps/rejected": -392.8509826660156, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.861809492111206, + "rewards/margins": 26.721317291259766, + "rewards/rejected": -30.5831241607666, + "step": 2370 + }, + { + "epoch": 1.92, + "learning_rate": 2.154398563734291e-08, + "logits/chosen": -2.2812705039978027, + "logits/rejected": -1.7073118686676025, + "logps/chosen": -200.2554931640625, + "logps/rejected": -432.61297607421875, + "loss": 0.0094, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.923653602600098, + "rewards/margins": 28.150310516357422, + "rewards/rejected": -33.07396697998047, + "step": 2380 + }, + { + "epoch": 1.93, + "learning_rate": 1.9299820466786354e-08, + "logits/chosen": -2.414008378982544, + "logits/rejected": -1.4974148273468018, + "logps/chosen": -190.5416717529297, + "logps/rejected": -364.8639221191406, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.092578887939453, + "rewards/margins": 24.329792022705078, + "rewards/rejected": -27.4223690032959, + "step": 2390 + }, + { + "epoch": 1.94, + "learning_rate": 1.7055655296229802e-08, + "logits/chosen": -2.5344808101654053, + "logits/rejected": -1.4826396703720093, + "logps/chosen": -199.9079132080078, + "logps/rejected": -337.10845947265625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.035533905029297, + "rewards/margins": 21.509811401367188, + "rewards/rejected": -25.545347213745117, + "step": 2400 + }, + { + "epoch": 1.94, + "eval_logits/chosen": -2.3903465270996094, + "eval_logits/rejected": -1.488939642906189, + "eval_logps/chosen": -202.059326171875, + "eval_logps/rejected": -358.9593505859375, + "eval_loss": 0.017110256478190422, + "eval_rewards/accuracies": 0.9867424368858337, + "eval_rewards/chosen": -4.5226850509643555, + "eval_rewards/margins": 22.533672332763672, + "eval_rewards/rejected": -27.056360244750977, + "eval_runtime": 1271.9984, + "eval_samples_per_second": 3.297, + "eval_steps_per_second": 0.104, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 1.481149012567325e-08, + "logits/chosen": -2.278808116912842, + "logits/rejected": -1.3401623964309692, + "logps/chosen": -195.31007385253906, + "logps/rejected": -429.0693359375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.187358379364014, + "rewards/margins": 29.9195556640625, + "rewards/rejected": -35.106910705566406, + "step": 2410 + }, + { + "epoch": 1.95, + "learning_rate": 1.2567324955116697e-08, + "logits/chosen": -2.548039674758911, + "logits/rejected": -1.6519190073013306, + "logps/chosen": -190.36599731445312, + "logps/rejected": -332.66351318359375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.691801071166992, + "rewards/margins": 21.10658836364746, + "rewards/rejected": -24.798391342163086, + "step": 2420 + }, + { + "epoch": 1.96, + "learning_rate": 1.0323159784560143e-08, + "logits/chosen": -2.4509990215301514, + "logits/rejected": -1.8016693592071533, + "logps/chosen": -209.08181762695312, + "logps/rejected": -391.7178955078125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.073967456817627, + "rewards/margins": 24.873350143432617, + "rewards/rejected": -29.947315216064453, + "step": 2430 + }, + { + "epoch": 1.97, + "learning_rate": 8.07899461400359e-09, + "logits/chosen": -2.4894814491271973, + "logits/rejected": -1.5515916347503662, + "logps/chosen": -193.84072875976562, + "logps/rejected": -340.84002685546875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.216320037841797, + "rewards/margins": 20.58757972717285, + "rewards/rejected": -25.803897857666016, + "step": 2440 + }, + { + "epoch": 1.98, + "learning_rate": 5.834829443447037e-09, + "logits/chosen": -2.4994473457336426, + "logits/rejected": -1.7215261459350586, + "logps/chosen": -214.2314453125, + "logps/rejected": -343.91790771484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.981940507888794, + "rewards/margins": 21.160888671875, + "rewards/rejected": -25.1428279876709, + "step": 2450 + }, + { + "epoch": 1.99, + "learning_rate": 3.5906642728904845e-09, + "logits/chosen": -2.5020933151245117, + "logits/rejected": -1.6862494945526123, + "logps/chosen": -205.0951385498047, + "logps/rejected": -353.6935119628906, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.266055107116699, + "rewards/margins": 22.203887939453125, + "rewards/rejected": -26.46994400024414, + "step": 2460 + }, + { + "epoch": 2.0, + "learning_rate": 1.3464991023339318e-09, + "logits/chosen": -2.592010021209717, + "logits/rejected": -1.6329389810562134, + "logps/chosen": -205.69189453125, + "logps/rejected": -380.0437927246094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2647247314453125, + "rewards/margins": 24.284509658813477, + "rewards/rejected": -28.54923439025879, + "step": 2470 + }, + { + "epoch": 2.0, + "step": 2476, + "total_flos": 0.0, + "train_loss": 0.034544974909203885, + "train_runtime": 47563.1126, + "train_samples_per_second": 0.832, + "train_steps_per_second": 0.052 + } + ], + "logging_steps": 10, + "max_steps": 2476, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}