diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4582 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998451213216314, + "eval_steps": 100, + "global_step": 2904, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": 22.749126434326172, + "logits/rejected": 22.455398559570312, + "logps/chosen": -415.7331848144531, + "logps/rejected": -294.51483154296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "eval_logits/chosen": 23.82334327697754, + "eval_logits/rejected": 23.573287963867188, + "eval_logps/chosen": -354.5701599121094, + "eval_logps/rejected": -274.08343505859375, + "eval_loss": 0.6931473612785339, + "eval_rewards/accuracies": 0.0, + "eval_rewards/chosen": 0.0, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": 0.0, + "eval_runtime": 208.2485, + "eval_samples_per_second": 9.604, + "eval_steps_per_second": 0.303, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": 23.493385314941406, + "logits/rejected": 23.479415893554688, + "logps/chosen": -359.0509948730469, + "logps/rejected": -263.7375793457031, + "loss": 0.692, + "rewards/accuracies": 0.5833333134651184, + "rewards/chosen": 0.016306404024362564, + "rewards/margins": 0.025918345898389816, + "rewards/rejected": -0.009611942805349827, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": 23.505186080932617, + "logits/rejected": 23.52346420288086, + "logps/chosen": -327.48468017578125, + "logps/rejected": -279.432861328125, + "loss": 0.6965, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.013154825195670128, + "rewards/margins": -0.014362807385623455, + "rewards/rejected": 0.0012079827720299363, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": 23.50873374938965, + "logits/rejected": 23.2880859375, + "logps/chosen": -340.9912109375, + "logps/rejected": -269.15045166015625, + "loss": 0.6955, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0022484897635877132, + "rewards/margins": -0.017411604523658752, + "rewards/rejected": 0.0196601003408432, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": 23.961822509765625, + "logits/rejected": 23.730144500732422, + "logps/chosen": -414.52447509765625, + "logps/rejected": -300.4974670410156, + "loss": 0.6961, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.003453383222222328, + "rewards/margins": 0.017737122252583504, + "rewards/rejected": -0.014283737167716026, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": 23.999908447265625, + "logits/rejected": 23.47333335876465, + "logps/chosen": -313.49395751953125, + "logps/rejected": -216.2849578857422, + "loss": 0.691, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.021781612187623978, + "rewards/margins": 0.03288044035434723, + "rewards/rejected": -0.011098823510110378, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": 23.825542449951172, + "logits/rejected": 23.716323852539062, + "logps/chosen": -306.31744384765625, + "logps/rejected": -260.7249755859375, + "loss": 0.6916, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.009675316512584686, + "rewards/margins": -0.021775808185338974, + "rewards/rejected": 0.03145112842321396, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": 23.89028549194336, + "logits/rejected": 23.66950798034668, + "logps/chosen": -364.57757568359375, + "logps/rejected": -250.9732208251953, + "loss": 0.6871, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0638527050614357, + "rewards/margins": 0.016006827354431152, + "rewards/rejected": 0.047845881432294846, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": 23.972980499267578, + "logits/rejected": 23.702159881591797, + "logps/chosen": -360.4600524902344, + "logps/rejected": -277.17767333984375, + "loss": 0.6826, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10171504318714142, + "rewards/margins": 0.051059722900390625, + "rewards/rejected": 0.05065532401204109, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": 23.601802825927734, + "logits/rejected": 23.44902229309082, + "logps/chosen": -256.45306396484375, + "logps/rejected": -228.2622528076172, + "loss": 0.6742, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.13566820323467255, + "rewards/margins": 0.05287040024995804, + "rewards/rejected": 0.0827978178858757, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": 23.945114135742188, + "logits/rejected": 23.670852661132812, + "logps/chosen": -317.6385192871094, + "logps/rejected": -238.4324188232422, + "loss": 0.6639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14283855259418488, + "rewards/margins": 0.07535254955291748, + "rewards/rejected": 0.0674859955906868, + "step": 100 + }, + { + "epoch": 0.1, + "eval_logits/chosen": 23.83555793762207, + "eval_logits/rejected": 23.585235595703125, + "eval_logps/chosen": -352.80859375, + "eval_logps/rejected": -273.12677001953125, + "eval_loss": 0.6592543125152588, + "eval_rewards/accuracies": 0.6150793433189392, + "eval_rewards/chosen": 0.17615097761154175, + "eval_rewards/margins": 0.0804828330874443, + "eval_rewards/rejected": 0.09566814452409744, + "eval_runtime": 210.7096, + "eval_samples_per_second": 9.492, + "eval_steps_per_second": 0.299, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": 23.709579467773438, + "logits/rejected": 23.512853622436523, + "logps/chosen": -349.40234375, + "logps/rejected": -243.11532592773438, + "loss": 0.6541, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16010603308677673, + "rewards/margins": 0.09831614792346954, + "rewards/rejected": 0.06178988143801689, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": 23.544376373291016, + "logits/rejected": 23.377239227294922, + "logps/chosen": -341.64080810546875, + "logps/rejected": -247.55844116210938, + "loss": 0.6539, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.1492854803800583, + "rewards/margins": 0.0526873879134655, + "rewards/rejected": 0.09659810364246368, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": 24.006563186645508, + "logits/rejected": 23.8785457611084, + "logps/chosen": -321.85467529296875, + "logps/rejected": -281.0990905761719, + "loss": 0.6401, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.19155286252498627, + "rewards/margins": 0.1119670420885086, + "rewards/rejected": 0.07958582043647766, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": 23.71746826171875, + "logits/rejected": 23.616607666015625, + "logps/chosen": -346.86761474609375, + "logps/rejected": -257.8626708984375, + "loss": 0.6319, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.179647758603096, + "rewards/margins": 0.20804457366466522, + "rewards/rejected": -0.02839680388569832, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": 23.601333618164062, + "logits/rejected": 23.368152618408203, + "logps/chosen": -342.10003662109375, + "logps/rejected": -261.25201416015625, + "loss": 0.6243, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.13529065251350403, + "rewards/margins": 0.20980004966259003, + "rewards/rejected": -0.0745093896985054, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": 24.020530700683594, + "logits/rejected": 23.818883895874023, + "logps/chosen": -362.73968505859375, + "logps/rejected": -253.7847137451172, + "loss": 0.5915, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13591055572032928, + "rewards/margins": 0.31857621669769287, + "rewards/rejected": -0.1826656460762024, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": 23.72347640991211, + "logits/rejected": 23.625173568725586, + "logps/chosen": -337.2410583496094, + "logps/rejected": -265.833740234375, + "loss": 0.5966, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.06029454618692398, + "rewards/margins": 0.21368882060050964, + "rewards/rejected": -0.15339429676532745, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": 24.024005889892578, + "logits/rejected": 23.694889068603516, + "logps/chosen": -303.23358154296875, + "logps/rejected": -259.80047607421875, + "loss": 0.5912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.10565178096294403, + "rewards/margins": 0.33361369371414185, + "rewards/rejected": -0.2279619425535202, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": 23.458202362060547, + "logits/rejected": 23.41326904296875, + "logps/chosen": -278.2962341308594, + "logps/rejected": -242.08627319335938, + "loss": 0.5826, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.05947988107800484, + "rewards/margins": 0.3678347170352936, + "rewards/rejected": -0.30835479497909546, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": 23.741714477539062, + "logits/rejected": 23.483057022094727, + "logps/chosen": -314.7781066894531, + "logps/rejected": -248.27880859375, + "loss": 0.5804, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1307556927204132, + "rewards/margins": 0.3709767758846283, + "rewards/rejected": -0.2402210682630539, + "step": 200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": 23.830230712890625, + "eval_logits/rejected": 23.587175369262695, + "eval_logps/chosen": -353.7904052734375, + "eval_logps/rejected": -277.4797668457031, + "eval_loss": 0.5836150646209717, + "eval_rewards/accuracies": 0.6507936716079712, + "eval_rewards/chosen": 0.07797454297542572, + "eval_rewards/margins": 0.41760751605033875, + "eval_rewards/rejected": -0.33963292837142944, + "eval_runtime": 208.5861, + "eval_samples_per_second": 9.588, + "eval_steps_per_second": 0.302, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": 23.76480484008789, + "logits/rejected": 23.56380271911621, + "logps/chosen": -377.50799560546875, + "logps/rejected": -279.08978271484375, + "loss": 0.5611, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0905425176024437, + "rewards/margins": 0.527503252029419, + "rewards/rejected": -0.43696069717407227, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": 23.482959747314453, + "logits/rejected": 23.370895385742188, + "logps/chosen": -316.96038818359375, + "logps/rejected": -253.94686889648438, + "loss": 0.5691, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16429784893989563, + "rewards/margins": 0.4349435865879059, + "rewards/rejected": -0.5992413759231567, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": 23.473817825317383, + "logits/rejected": 23.369760513305664, + "logps/chosen": -334.98663330078125, + "logps/rejected": -293.44854736328125, + "loss": 0.5962, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1187012642621994, + "rewards/margins": 0.38999611139297485, + "rewards/rejected": -0.5086973905563354, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": 23.480493545532227, + "logits/rejected": 23.42662239074707, + "logps/chosen": -329.04595947265625, + "logps/rejected": -243.5697784423828, + "loss": 0.564, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.16112008690834045, + "rewards/margins": 0.41859644651412964, + "rewards/rejected": -0.5797165036201477, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": 23.753459930419922, + "logits/rejected": 23.624629974365234, + "logps/chosen": -347.7720642089844, + "logps/rejected": -273.23162841796875, + "loss": 0.5833, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.06806284189224243, + "rewards/margins": 0.4797073304653168, + "rewards/rejected": -0.5477702021598816, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": 23.70407485961914, + "logits/rejected": 23.5228328704834, + "logps/chosen": -310.2815856933594, + "logps/rejected": -250.3536376953125, + "loss": 0.5718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07945041358470917, + "rewards/margins": 0.49080556631088257, + "rewards/rejected": -0.5702559351921082, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": 23.76226234436035, + "logits/rejected": 23.472620010375977, + "logps/chosen": -301.5387268066406, + "logps/rejected": -240.7628631591797, + "loss": 0.601, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.07512088119983673, + "rewards/margins": 0.4631730914115906, + "rewards/rejected": -0.5382939577102661, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": 23.984760284423828, + "logits/rejected": 23.863937377929688, + "logps/chosen": -373.2278137207031, + "logps/rejected": -285.9132995605469, + "loss": 0.5712, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.10135757923126221, + "rewards/margins": 0.6022639274597168, + "rewards/rejected": -0.5009063482284546, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": 23.733274459838867, + "logits/rejected": 23.508481979370117, + "logps/chosen": -356.46099853515625, + "logps/rejected": -259.97003173828125, + "loss": 0.5751, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.18595895171165466, + "rewards/margins": 0.5399104952812195, + "rewards/rejected": -0.7258695363998413, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.982778415614236e-07, + "logits/chosen": 23.513275146484375, + "logits/rejected": 23.471511840820312, + "logps/chosen": -293.0979919433594, + "logps/rejected": -249.67446899414062, + "loss": 0.5815, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.22943711280822754, + "rewards/margins": 0.6720036268234253, + "rewards/rejected": -0.9014407396316528, + "step": 300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 23.749773025512695, + "eval_logits/rejected": 23.52240753173828, + "eval_logps/chosen": -356.49285888671875, + "eval_logps/rejected": -281.9402770996094, + "eval_loss": 0.5510157942771912, + "eval_rewards/accuracies": 0.7420634627342224, + "eval_rewards/chosen": -0.19227494299411774, + "eval_rewards/margins": 0.5934095978736877, + "eval_rewards/rejected": -0.7856844663619995, + "eval_runtime": 210.4467, + "eval_samples_per_second": 9.504, + "eval_steps_per_second": 0.299, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.963643321852277e-07, + "logits/chosen": 23.758647918701172, + "logits/rejected": 23.599285125732422, + "logps/chosen": -387.0029296875, + "logps/rejected": -297.8297119140625, + "loss": 0.5858, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22534582018852234, + "rewards/margins": 0.4947783946990967, + "rewards/rejected": -0.7201241254806519, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.944508228090318e-07, + "logits/chosen": 23.673627853393555, + "logits/rejected": 23.470468521118164, + "logps/chosen": -269.2679748535156, + "logps/rejected": -209.1413116455078, + "loss": 0.5428, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2326889932155609, + "rewards/margins": 0.49293145537376404, + "rewards/rejected": -0.7256205677986145, + "step": 320 + }, + { + "epoch": 0.34, + "learning_rate": 4.925373134328357e-07, + "logits/chosen": 23.728256225585938, + "logits/rejected": 23.57656478881836, + "logps/chosen": -341.84552001953125, + "logps/rejected": -279.85650634765625, + "loss": 0.5848, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2053179293870926, + "rewards/margins": 0.5053264498710632, + "rewards/rejected": -0.7106443643569946, + "step": 330 + }, + { + "epoch": 0.35, + "learning_rate": 4.906238040566398e-07, + "logits/chosen": 23.395517349243164, + "logits/rejected": 23.30283546447754, + "logps/chosen": -276.00958251953125, + "logps/rejected": -245.6515655517578, + "loss": 0.5731, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4472281038761139, + "rewards/margins": 0.5112749338150024, + "rewards/rejected": -0.9585030674934387, + "step": 340 + }, + { + "epoch": 0.36, + "learning_rate": 4.887102946804438e-07, + "logits/chosen": 23.459369659423828, + "logits/rejected": 23.258296966552734, + "logps/chosen": -351.51153564453125, + "logps/rejected": -265.9107666015625, + "loss": 0.5436, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.248783141374588, + "rewards/margins": 0.7760157585144043, + "rewards/rejected": -1.02479887008667, + "step": 350 + }, + { + "epoch": 0.37, + "learning_rate": 4.867967853042479e-07, + "logits/chosen": 23.64513397216797, + "logits/rejected": 23.49908447265625, + "logps/chosen": -327.1449890136719, + "logps/rejected": -301.5306396484375, + "loss": 0.5287, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29137879610061646, + "rewards/margins": 0.8306191563606262, + "rewards/rejected": -1.1219979524612427, + "step": 360 + }, + { + "epoch": 0.38, + "learning_rate": 4.84883275928052e-07, + "logits/chosen": 23.72499656677246, + "logits/rejected": 23.477428436279297, + "logps/chosen": -337.2041320800781, + "logps/rejected": -292.93463134765625, + "loss": 0.5549, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20421965420246124, + "rewards/margins": 0.776501476764679, + "rewards/rejected": -0.980721116065979, + "step": 370 + }, + { + "epoch": 0.39, + "learning_rate": 4.82969766551856e-07, + "logits/chosen": 23.811683654785156, + "logits/rejected": 23.42571258544922, + "logps/chosen": -364.2945251464844, + "logps/rejected": -283.0462646484375, + "loss": 0.5698, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.06022878363728523, + "rewards/margins": 0.7983857989311218, + "rewards/rejected": -0.8586145639419556, + "step": 380 + }, + { + "epoch": 0.4, + "learning_rate": 4.810562571756601e-07, + "logits/chosen": 23.39688491821289, + "logits/rejected": 23.162023544311523, + "logps/chosen": -323.4096984863281, + "logps/rejected": -250.5354461669922, + "loss": 0.5771, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1642606556415558, + "rewards/margins": 0.8033970594406128, + "rewards/rejected": -0.967657744884491, + "step": 390 + }, + { + "epoch": 0.41, + "learning_rate": 4.791427477994642e-07, + "logits/chosen": 23.463436126708984, + "logits/rejected": 23.304988861083984, + "logps/chosen": -290.4604797363281, + "logps/rejected": -257.20977783203125, + "loss": 0.5526, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2738291919231415, + "rewards/margins": 0.6287984848022461, + "rewards/rejected": -0.9026277661323547, + "step": 400 + }, + { + "epoch": 0.41, + "eval_logits/chosen": 23.72638702392578, + "eval_logits/rejected": 23.50330352783203, + "eval_logps/chosen": -356.5235290527344, + "eval_logps/rejected": -283.01190185546875, + "eval_loss": 0.5360822081565857, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.19533830881118774, + "eval_rewards/margins": 0.6975098848342896, + "eval_rewards/rejected": -0.8928481936454773, + "eval_runtime": 211.6561, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 0.298, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 4.772292384232682e-07, + "logits/chosen": 23.651836395263672, + "logits/rejected": 23.562541961669922, + "logps/chosen": -295.5309143066406, + "logps/rejected": -256.42864990234375, + "loss": 0.5646, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3951832056045532, + "rewards/margins": 0.48603707551956177, + "rewards/rejected": -0.8812202215194702, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 4.753157290470723e-07, + "logits/chosen": 23.555591583251953, + "logits/rejected": 23.477405548095703, + "logps/chosen": -291.4106140136719, + "logps/rejected": -254.1300048828125, + "loss": 0.5647, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.44952210783958435, + "rewards/margins": 0.4881154000759125, + "rewards/rejected": -0.937637448310852, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 4.7340221967087635e-07, + "logits/chosen": 23.740278244018555, + "logits/rejected": 23.43073844909668, + "logps/chosen": -283.1187438964844, + "logps/rejected": -268.111083984375, + "loss": 0.5611, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4848947525024414, + "rewards/margins": 0.5371454954147339, + "rewards/rejected": -1.0220401287078857, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 4.714887102946804e-07, + "logits/chosen": 23.772052764892578, + "logits/rejected": 23.574148178100586, + "logps/chosen": -316.62310791015625, + "logps/rejected": -249.21389770507812, + "loss": 0.5237, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24736304581165314, + "rewards/margins": 0.6857331395149231, + "rewards/rejected": -0.9330962300300598, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 4.6957520091848447e-07, + "logits/chosen": 23.818843841552734, + "logits/rejected": 23.663349151611328, + "logps/chosen": -301.2689208984375, + "logps/rejected": -274.0567932128906, + "loss": 0.5833, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.04559071734547615, + "rewards/margins": 0.5639020800590515, + "rewards/rejected": -0.6094927191734314, + "step": 450 + }, + { + "epoch": 0.47, + "learning_rate": 4.6766169154228853e-07, + "logits/chosen": 23.291194915771484, + "logits/rejected": 23.422870635986328, + "logps/chosen": -323.94488525390625, + "logps/rejected": -233.00833129882812, + "loss": 0.5194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03107648529112339, + "rewards/margins": 0.7044192552566528, + "rewards/rejected": -0.7354957461357117, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.657481821660926e-07, + "logits/chosen": 23.393449783325195, + "logits/rejected": 23.201961517333984, + "logps/chosen": -318.3015441894531, + "logps/rejected": -219.90170288085938, + "loss": 0.5072, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05152938514947891, + "rewards/margins": 0.8564150929450989, + "rewards/rejected": -0.90794438123703, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.6383467278989666e-07, + "logits/chosen": 23.315677642822266, + "logits/rejected": 23.30160903930664, + "logps/chosen": -354.28302001953125, + "logps/rejected": -268.2124938964844, + "loss": 0.5397, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.004499013535678387, + "rewards/margins": 0.9241636395454407, + "rewards/rejected": -0.9196645617485046, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.6192116341370067e-07, + "logits/chosen": 23.65988540649414, + "logits/rejected": 23.262027740478516, + "logps/chosen": -363.83416748046875, + "logps/rejected": -269.9544677734375, + "loss": 0.5463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03792769834399223, + "rewards/margins": 0.8194522857666016, + "rewards/rejected": -0.8573800325393677, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.6000765403750473e-07, + "logits/chosen": 23.21782684326172, + "logits/rejected": 22.95124053955078, + "logps/chosen": -272.1101379394531, + "logps/rejected": -233.3650665283203, + "loss": 0.5225, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18960613012313843, + "rewards/margins": 0.5189381837844849, + "rewards/rejected": -0.7085443139076233, + "step": 500 + }, + { + "epoch": 0.52, + "eval_logits/chosen": 23.67182731628418, + "eval_logits/rejected": 23.457815170288086, + "eval_logps/chosen": -355.6113586425781, + "eval_logps/rejected": -282.89288330078125, + "eval_loss": 0.5261635184288025, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -0.10412228107452393, + "eval_rewards/margins": 0.7768236994743347, + "eval_rewards/rejected": -0.8809459805488586, + "eval_runtime": 208.2947, + "eval_samples_per_second": 9.602, + "eval_steps_per_second": 0.302, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.580941446613088e-07, + "logits/chosen": 23.604022979736328, + "logits/rejected": 23.44409942626953, + "logps/chosen": -326.31378173828125, + "logps/rejected": -279.2933349609375, + "loss": 0.5379, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2937595248222351, + "rewards/margins": 0.6278744339942932, + "rewards/rejected": -0.9216337203979492, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.5618063528511285e-07, + "logits/chosen": 23.713848114013672, + "logits/rejected": 23.53582000732422, + "logps/chosen": -304.9338684082031, + "logps/rejected": -268.5104064941406, + "loss": 0.5433, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.10584266483783722, + "rewards/margins": 0.8640462160110474, + "rewards/rejected": -0.9698888063430786, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.542671259089169e-07, + "logits/chosen": 23.50804328918457, + "logits/rejected": 23.286922454833984, + "logps/chosen": -291.4549560546875, + "logps/rejected": -222.6033935546875, + "loss": 0.553, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3200764060020447, + "rewards/margins": 0.6575796008110046, + "rewards/rejected": -0.9776560068130493, + "step": 530 + }, + { + "epoch": 0.56, + "learning_rate": 4.52353616532721e-07, + "logits/chosen": 23.716320037841797, + "logits/rejected": 23.562469482421875, + "logps/chosen": -322.17047119140625, + "logps/rejected": -258.58917236328125, + "loss": 0.5491, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1657361090183258, + "rewards/margins": 0.7579048871994019, + "rewards/rejected": -0.9236409068107605, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 4.5044010715652504e-07, + "logits/chosen": 23.58610725402832, + "logits/rejected": 23.374378204345703, + "logps/chosen": -303.7857971191406, + "logps/rejected": -266.2262878417969, + "loss": 0.5447, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1886710226535797, + "rewards/margins": 0.5424461364746094, + "rewards/rejected": -0.7311171293258667, + "step": 550 + }, + { + "epoch": 0.58, + "learning_rate": 4.485265977803291e-07, + "logits/chosen": 23.34024429321289, + "logits/rejected": 23.08355140686035, + "logps/chosen": -346.318603515625, + "logps/rejected": -288.74432373046875, + "loss": 0.5315, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1655091792345047, + "rewards/margins": 0.8151466250419617, + "rewards/rejected": -0.98065584897995, + "step": 560 + }, + { + "epoch": 0.59, + "learning_rate": 4.4661308840413316e-07, + "logits/chosen": 23.876493453979492, + "logits/rejected": 23.629627227783203, + "logps/chosen": -300.15509033203125, + "logps/rejected": -273.7672424316406, + "loss": 0.5197, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30857834219932556, + "rewards/margins": 0.6672872304916382, + "rewards/rejected": -0.9758656620979309, + "step": 570 + }, + { + "epoch": 0.6, + "learning_rate": 4.446995790279372e-07, + "logits/chosen": 23.596471786499023, + "logits/rejected": 23.427684783935547, + "logps/chosen": -334.6555480957031, + "logps/rejected": -264.3316345214844, + "loss": 0.5231, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2455914467573166, + "rewards/margins": 0.883080005645752, + "rewards/rejected": -1.128671407699585, + "step": 580 + }, + { + "epoch": 0.61, + "learning_rate": 4.4278606965174123e-07, + "logits/chosen": 23.314987182617188, + "logits/rejected": 23.11943817138672, + "logps/chosen": -298.7581787109375, + "logps/rejected": -246.3135223388672, + "loss": 0.5253, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3651345372200012, + "rewards/margins": 0.677712619304657, + "rewards/rejected": -1.0428470373153687, + "step": 590 + }, + { + "epoch": 0.62, + "learning_rate": 4.408725602755453e-07, + "logits/chosen": 23.37234115600586, + "logits/rejected": 23.340373992919922, + "logps/chosen": -286.86737060546875, + "logps/rejected": -220.9873504638672, + "loss": 0.5577, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4736716151237488, + "rewards/margins": 0.5933648347854614, + "rewards/rejected": -1.0670363903045654, + "step": 600 + }, + { + "epoch": 0.62, + "eval_logits/chosen": 23.661834716796875, + "eval_logits/rejected": 23.4466495513916, + "eval_logps/chosen": -356.5157775878906, + "eval_logps/rejected": -284.3682861328125, + "eval_loss": 0.5155950784683228, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -0.19456443190574646, + "eval_rewards/margins": 0.8339203000068665, + "eval_rewards/rejected": -1.02848482131958, + "eval_runtime": 211.7272, + "eval_samples_per_second": 9.446, + "eval_steps_per_second": 0.298, + "step": 600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3895905089934936e-07, + "logits/chosen": 23.545116424560547, + "logits/rejected": 23.458499908447266, + "logps/chosen": -338.8705139160156, + "logps/rejected": -272.00714111328125, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2067803144454956, + "rewards/margins": 0.8310182690620422, + "rewards/rejected": -1.037798523902893, + "step": 610 + }, + { + "epoch": 0.64, + "learning_rate": 4.370455415231534e-07, + "logits/chosen": 23.625316619873047, + "logits/rejected": 23.448591232299805, + "logps/chosen": -345.89208984375, + "logps/rejected": -306.5247497558594, + "loss": 0.5182, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.30394795536994934, + "rewards/margins": 0.5751500725746155, + "rewards/rejected": -0.8790979385375977, + "step": 620 + }, + { + "epoch": 0.65, + "learning_rate": 4.351320321469575e-07, + "logits/chosen": 22.87206268310547, + "logits/rejected": 22.757465362548828, + "logps/chosen": -309.4687805175781, + "logps/rejected": -291.12847900390625, + "loss": 0.5417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20515112578868866, + "rewards/margins": 0.6493626236915588, + "rewards/rejected": -0.8545138239860535, + "step": 630 + }, + { + "epoch": 0.66, + "learning_rate": 4.3321852277076154e-07, + "logits/chosen": 23.304141998291016, + "logits/rejected": 23.251794815063477, + "logps/chosen": -333.1667785644531, + "logps/rejected": -272.1311950683594, + "loss": 0.5271, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.41801586747169495, + "rewards/margins": 0.5841894149780273, + "rewards/rejected": -1.0022052526474, + "step": 640 + }, + { + "epoch": 0.67, + "learning_rate": 4.313050133945656e-07, + "logits/chosen": 23.594745635986328, + "logits/rejected": 23.500207901000977, + "logps/chosen": -357.5419616699219, + "logps/rejected": -274.1604309082031, + "loss": 0.5192, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.20376773178577423, + "rewards/margins": 0.8759373426437378, + "rewards/rejected": -1.0797051191329956, + "step": 650 + }, + { + "epoch": 0.68, + "learning_rate": 4.2939150401836967e-07, + "logits/chosen": 23.767133712768555, + "logits/rejected": 23.464405059814453, + "logps/chosen": -308.720458984375, + "logps/rejected": -290.14306640625, + "loss": 0.5137, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28276339173316956, + "rewards/margins": 0.6749929189682007, + "rewards/rejected": -0.9577562212944031, + "step": 660 + }, + { + "epoch": 0.69, + "learning_rate": 4.2747799464217373e-07, + "logits/chosen": 23.170560836791992, + "logits/rejected": 23.10344886779785, + "logps/chosen": -350.4610595703125, + "logps/rejected": -267.5567932128906, + "loss": 0.5273, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.439180463552475, + "rewards/margins": 0.6723843216896057, + "rewards/rejected": -1.1115647554397583, + "step": 670 + }, + { + "epoch": 0.7, + "learning_rate": 4.255644852659778e-07, + "logits/chosen": 23.414445877075195, + "logits/rejected": 23.61502456665039, + "logps/chosen": -374.15692138671875, + "logps/rejected": -288.3631286621094, + "loss": 0.5761, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08570393174886703, + "rewards/margins": 0.6574904918670654, + "rewards/rejected": -0.7431942820549011, + "step": 680 + }, + { + "epoch": 0.71, + "learning_rate": 4.236509758897818e-07, + "logits/chosen": 23.552722930908203, + "logits/rejected": 23.40909194946289, + "logps/chosen": -342.1145324707031, + "logps/rejected": -264.7322082519531, + "loss": 0.5549, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16522836685180664, + "rewards/margins": 0.6978949904441833, + "rewards/rejected": -0.8631232976913452, + "step": 690 + }, + { + "epoch": 0.72, + "learning_rate": 4.2173746651358586e-07, + "logits/chosen": 23.72678565979004, + "logits/rejected": 23.425289154052734, + "logps/chosen": -331.14410400390625, + "logps/rejected": -285.4593200683594, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10031759738922119, + "rewards/margins": 0.6387730836868286, + "rewards/rejected": -0.739090621471405, + "step": 700 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 23.634296417236328, + "eval_logits/rejected": 23.424331665039062, + "eval_logps/chosen": -353.9219665527344, + "eval_logps/rejected": -281.7333679199219, + "eval_loss": 0.5162664651870728, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": 0.0648159608244896, + "eval_rewards/margins": 0.8298115730285645, + "eval_rewards/rejected": -0.7649956345558167, + "eval_runtime": 211.7482, + "eval_samples_per_second": 9.445, + "eval_steps_per_second": 0.298, + "step": 700 + }, + { + "epoch": 0.73, + "learning_rate": 4.198239571373899e-07, + "logits/chosen": 23.546445846557617, + "logits/rejected": 23.238723754882812, + "logps/chosen": -307.0013122558594, + "logps/rejected": -247.3063201904297, + "loss": 0.5341, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08633746951818466, + "rewards/margins": 0.7773478031158447, + "rewards/rejected": -0.8636852502822876, + "step": 710 + }, + { + "epoch": 0.74, + "learning_rate": 4.17910447761194e-07, + "logits/chosen": 23.390674591064453, + "logits/rejected": 23.3981876373291, + "logps/chosen": -337.4337158203125, + "logps/rejected": -311.81414794921875, + "loss": 0.5774, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21913309395313263, + "rewards/margins": 0.4323801100254059, + "rewards/rejected": -0.6515131592750549, + "step": 720 + }, + { + "epoch": 0.75, + "learning_rate": 4.1599693838499805e-07, + "logits/chosen": 23.517433166503906, + "logits/rejected": 23.37581443786621, + "logps/chosen": -291.6369323730469, + "logps/rejected": -265.741943359375, + "loss": 0.5377, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.25872206687927246, + "rewards/margins": 0.6349440813064575, + "rewards/rejected": -0.8936660885810852, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.140834290088021e-07, + "logits/chosen": 23.460777282714844, + "logits/rejected": 23.235326766967773, + "logps/chosen": -339.9285888671875, + "logps/rejected": -266.0412292480469, + "loss": 0.5197, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11111575365066528, + "rewards/margins": 0.9949262738227844, + "rewards/rejected": -1.1060421466827393, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.121699196326062e-07, + "logits/chosen": 23.193099975585938, + "logits/rejected": 23.17205238342285, + "logps/chosen": -333.4886474609375, + "logps/rejected": -274.37274169921875, + "loss": 0.5358, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.505832850933075, + "rewards/margins": 0.6223492622375488, + "rewards/rejected": -1.1281821727752686, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.1025641025641024e-07, + "logits/chosen": 23.626953125, + "logits/rejected": 23.612979888916016, + "logps/chosen": -327.89111328125, + "logps/rejected": -297.7337341308594, + "loss": 0.5246, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35025396943092346, + "rewards/margins": 0.8313090205192566, + "rewards/rejected": -1.1815630197525024, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 4.083429008802143e-07, + "logits/chosen": 23.58197593688965, + "logits/rejected": 23.45255470275879, + "logps/chosen": -272.42156982421875, + "logps/rejected": -270.8283386230469, + "loss": 0.5165, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3132372796535492, + "rewards/margins": 0.5522381663322449, + "rewards/rejected": -0.8654754757881165, + "step": 770 + }, + { + "epoch": 0.81, + "learning_rate": 4.0642939150401836e-07, + "logits/chosen": 23.23889923095703, + "logits/rejected": 23.24991798400879, + "logps/chosen": -314.5959167480469, + "logps/rejected": -257.6392517089844, + "loss": 0.534, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4284973740577698, + "rewards/margins": 0.6584367156028748, + "rewards/rejected": -1.086934208869934, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.0451588212782237e-07, + "logits/chosen": 23.414813995361328, + "logits/rejected": 23.380718231201172, + "logps/chosen": -291.32501220703125, + "logps/rejected": -256.01263427734375, + "loss": 0.4937, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.18692317605018616, + "rewards/margins": 0.8645000457763672, + "rewards/rejected": -1.051423192024231, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0260237275162643e-07, + "logits/chosen": 23.40145492553711, + "logits/rejected": 23.43955421447754, + "logps/chosen": -313.07513427734375, + "logps/rejected": -277.38201904296875, + "loss": 0.5159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11286661773920059, + "rewards/margins": 0.8762611150741577, + "rewards/rejected": -0.9891278147697449, + "step": 800 + }, + { + "epoch": 0.83, + "eval_logits/chosen": 23.617877960205078, + "eval_logits/rejected": 23.40951156616211, + "eval_logps/chosen": -355.9697570800781, + "eval_logps/rejected": -284.6782531738281, + "eval_loss": 0.5112624764442444, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.1399604231119156, + "eval_rewards/margins": 0.9195234179496765, + "eval_rewards/rejected": -1.0594837665557861, + "eval_runtime": 211.1679, + "eval_samples_per_second": 9.471, + "eval_steps_per_second": 0.298, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.006888633754305e-07, + "logits/chosen": 23.594928741455078, + "logits/rejected": 23.54049301147461, + "logps/chosen": -319.88677978515625, + "logps/rejected": -260.43389892578125, + "loss": 0.4905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20980267226696014, + "rewards/margins": 0.9086447954177856, + "rewards/rejected": -1.1184475421905518, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9877535399923456e-07, + "logits/chosen": 23.482894897460938, + "logits/rejected": 23.19647216796875, + "logps/chosen": -338.23223876953125, + "logps/rejected": -269.0614929199219, + "loss": 0.5256, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4344770014286041, + "rewards/margins": 0.710912823677063, + "rewards/rejected": -1.1453897953033447, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.968618446230386e-07, + "logits/chosen": 23.347646713256836, + "logits/rejected": 23.12314224243164, + "logps/chosen": -311.5711364746094, + "logps/rejected": -240.50125122070312, + "loss": 0.4916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3820451498031616, + "rewards/margins": 1.0347092151641846, + "rewards/rejected": -1.4167543649673462, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.949483352468427e-07, + "logits/chosen": 23.311033248901367, + "logits/rejected": 23.248620986938477, + "logps/chosen": -281.490966796875, + "logps/rejected": -240.92086791992188, + "loss": 0.556, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.47001034021377563, + "rewards/margins": 0.762096643447876, + "rewards/rejected": -1.2321069240570068, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9303482587064674e-07, + "logits/chosen": 23.50173568725586, + "logits/rejected": 23.377094268798828, + "logps/chosen": -290.4707336425781, + "logps/rejected": -248.4992218017578, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25379228591918945, + "rewards/margins": 0.8668729662895203, + "rewards/rejected": -1.1206653118133545, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.911213164944508e-07, + "logits/chosen": 23.733707427978516, + "logits/rejected": 23.433372497558594, + "logps/chosen": -346.18353271484375, + "logps/rejected": -291.7870788574219, + "loss": 0.5163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3051421046257019, + "rewards/margins": 0.9217368364334106, + "rewards/rejected": -1.2268788814544678, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8920780711825487e-07, + "logits/chosen": 23.664628982543945, + "logits/rejected": 23.414520263671875, + "logps/chosen": -396.57269287109375, + "logps/rejected": -270.23681640625, + "loss": 0.5127, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.19412247836589813, + "rewards/margins": 0.9694843292236328, + "rewards/rejected": -1.163606882095337, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.8729429774205893e-07, + "logits/chosen": 23.33928680419922, + "logits/rejected": 23.3987979888916, + "logps/chosen": -381.9424133300781, + "logps/rejected": -267.4436340332031, + "loss": 0.5481, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1822233498096466, + "rewards/margins": 1.0088589191436768, + "rewards/rejected": -1.191082239151001, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8538078836586294e-07, + "logits/chosen": 23.547710418701172, + "logits/rejected": 23.508426666259766, + "logps/chosen": -332.8504638671875, + "logps/rejected": -287.86712646484375, + "loss": 0.5454, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.26958388090133667, + "rewards/margins": 0.755517840385437, + "rewards/rejected": -1.025101661682129, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.83467278989667e-07, + "logits/chosen": 23.69491195678711, + "logits/rejected": 23.59137725830078, + "logps/chosen": -287.8290100097656, + "logps/rejected": -236.21438598632812, + "loss": 0.5242, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07682293653488159, + "rewards/margins": 0.8297051191329956, + "rewards/rejected": -0.906528115272522, + "step": 900 + }, + { + "epoch": 0.93, + "eval_logits/chosen": 23.614517211914062, + "eval_logits/rejected": 23.403518676757812, + "eval_logps/chosen": -354.952880859375, + "eval_logps/rejected": -283.23175048828125, + "eval_loss": 0.5089067220687866, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -0.03827480971813202, + "eval_rewards/margins": 0.8765569925308228, + "eval_rewards/rejected": -0.914831817150116, + "eval_runtime": 210.9611, + "eval_samples_per_second": 9.48, + "eval_steps_per_second": 0.299, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.8155376961347106e-07, + "logits/chosen": 23.2309627532959, + "logits/rejected": 23.20724105834961, + "logps/chosen": -246.0962677001953, + "logps/rejected": -230.4759979248047, + "loss": 0.5286, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.24791307747364044, + "rewards/margins": 0.7959302663803101, + "rewards/rejected": -1.0438432693481445, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.796402602372751e-07, + "logits/chosen": 23.323627471923828, + "logits/rejected": 23.179901123046875, + "logps/chosen": -294.0438537597656, + "logps/rejected": -236.05673217773438, + "loss": 0.5097, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.21511948108673096, + "rewards/margins": 0.6154388189315796, + "rewards/rejected": -0.8305583000183105, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.777267508610792e-07, + "logits/chosen": 23.346338272094727, + "logits/rejected": 23.224199295043945, + "logps/chosen": -318.80096435546875, + "logps/rejected": -251.92593383789062, + "loss": 0.5228, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1629999428987503, + "rewards/margins": 0.8454955816268921, + "rewards/rejected": -1.008495569229126, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7581324148488325e-07, + "logits/chosen": 23.233306884765625, + "logits/rejected": 23.179094314575195, + "logps/chosen": -330.75469970703125, + "logps/rejected": -246.1700897216797, + "loss": 0.5374, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.317868173122406, + "rewards/margins": 0.8588002920150757, + "rewards/rejected": -1.176668405532837, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.738997321086873e-07, + "logits/chosen": 23.4466495513916, + "logits/rejected": 23.421428680419922, + "logps/chosen": -325.29388427734375, + "logps/rejected": -277.3059387207031, + "loss": 0.5251, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.012738706544041634, + "rewards/margins": 0.8846112489700317, + "rewards/rejected": -0.8973498344421387, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7198622273249137e-07, + "logits/chosen": 23.535139083862305, + "logits/rejected": 23.37562370300293, + "logps/chosen": -335.9436340332031, + "logps/rejected": -274.2239990234375, + "loss": 0.5143, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1785895973443985, + "rewards/margins": 0.5961320400238037, + "rewards/rejected": -0.7747215628623962, + "step": 960 + }, + { + "epoch": 1.0, + "learning_rate": 3.7007271335629544e-07, + "logits/chosen": 23.633747100830078, + "logits/rejected": 23.46231460571289, + "logps/chosen": -295.21759033203125, + "logps/rejected": -258.9006042480469, + "loss": 0.4828, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.19572855532169342, + "rewards/margins": 0.8321182131767273, + "rewards/rejected": -1.0278469324111938, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.681592039800995e-07, + "logits/chosen": 23.68822479248047, + "logits/rejected": 23.501148223876953, + "logps/chosen": -299.13140869140625, + "logps/rejected": -279.1519470214844, + "loss": 0.4402, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.023116961121559143, + "rewards/margins": 1.1388248205184937, + "rewards/rejected": -1.1157079935073853, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.662456946039035e-07, + "logits/chosen": 23.261262893676758, + "logits/rejected": 22.952524185180664, + "logps/chosen": -308.18536376953125, + "logps/rejected": -273.8042907714844, + "loss": 0.4855, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0076486109755933285, + "rewards/margins": 1.2322968244552612, + "rewards/rejected": -1.2399452924728394, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6433218522770757e-07, + "logits/chosen": 23.609331130981445, + "logits/rejected": 23.460206985473633, + "logps/chosen": -298.27410888671875, + "logps/rejected": -291.9101867675781, + "loss": 0.4618, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.07892550528049469, + "rewards/margins": 0.973551869392395, + "rewards/rejected": -1.0524773597717285, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": 23.585590362548828, + "eval_logits/rejected": 23.38045883178711, + "eval_logps/chosen": -355.79290771484375, + "eval_logps/rejected": -284.2840881347656, + "eval_loss": 0.5076952576637268, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.12227805703878403, + "eval_rewards/margins": 0.8977885842323303, + "eval_rewards/rejected": -1.020066499710083, + "eval_runtime": 209.3271, + "eval_samples_per_second": 9.554, + "eval_steps_per_second": 0.301, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6241867585151163e-07, + "logits/chosen": 23.0650577545166, + "logits/rejected": 22.977046966552734, + "logps/chosen": -336.06597900390625, + "logps/rejected": -278.80828857421875, + "loss": 0.4487, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.024078911170363426, + "rewards/margins": 1.0799994468688965, + "rewards/rejected": -1.1040784120559692, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.605051664753157e-07, + "logits/chosen": 23.251041412353516, + "logits/rejected": 23.117984771728516, + "logps/chosen": -303.11199951171875, + "logps/rejected": -243.7981719970703, + "loss": 0.415, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18380531668663025, + "rewards/margins": 1.1280765533447266, + "rewards/rejected": -1.3118817806243896, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5859165709911975e-07, + "logits/chosen": 23.297225952148438, + "logits/rejected": 23.318119049072266, + "logps/chosen": -334.6638488769531, + "logps/rejected": -316.2551574707031, + "loss": 0.4302, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2101324051618576, + "rewards/margins": 0.9589886665344238, + "rewards/rejected": -1.1691210269927979, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.566781477229238e-07, + "logits/chosen": 23.525909423828125, + "logits/rejected": 23.159460067749023, + "logps/chosen": -318.52093505859375, + "logps/rejected": -268.981201171875, + "loss": 0.4484, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.31944024562835693, + "rewards/margins": 0.7762435674667358, + "rewards/rejected": -1.0956838130950928, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.547646383467279e-07, + "logits/chosen": 23.090410232543945, + "logits/rejected": 23.27143669128418, + "logps/chosen": -314.944580078125, + "logps/rejected": -246.20974731445312, + "loss": 0.4362, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.34100785851478577, + "rewards/margins": 0.8710842132568359, + "rewards/rejected": -1.2120921611785889, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5285112897053194e-07, + "logits/chosen": 23.36246681213379, + "logits/rejected": 23.223459243774414, + "logps/chosen": -286.5093078613281, + "logps/rejected": -283.33514404296875, + "loss": 0.4188, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0016206980217248201, + "rewards/margins": 1.3344464302062988, + "rewards/rejected": -1.3328258991241455, + "step": 1060 + }, + { + "epoch": 1.1, + "learning_rate": 3.50937619594336e-07, + "logits/chosen": 23.439857482910156, + "logits/rejected": 23.41635513305664, + "logps/chosen": -319.5511169433594, + "logps/rejected": -310.8269348144531, + "loss": 0.4497, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.29633527994155884, + "rewards/margins": 0.8939793705940247, + "rewards/rejected": -1.190314531326294, + "step": 1070 + }, + { + "epoch": 1.12, + "learning_rate": 3.4902411021814007e-07, + "logits/chosen": 23.304676055908203, + "logits/rejected": 23.236148834228516, + "logps/chosen": -329.2352600097656, + "logps/rejected": -261.53082275390625, + "loss": 0.4335, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15099604427814484, + "rewards/margins": 1.2369451522827148, + "rewards/rejected": -1.3879411220550537, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 3.4711060084194413e-07, + "logits/chosen": 23.37632179260254, + "logits/rejected": 23.350784301757812, + "logps/chosen": -360.5380859375, + "logps/rejected": -272.3520812988281, + "loss": 0.442, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.015167620964348316, + "rewards/margins": 1.156204342842102, + "rewards/rejected": -1.1410366296768188, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4519709146574814e-07, + "logits/chosen": 23.032573699951172, + "logits/rejected": 22.9952449798584, + "logps/chosen": -225.4923858642578, + "logps/rejected": -196.6672821044922, + "loss": 0.4484, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4950261116027832, + "rewards/margins": 0.9079095721244812, + "rewards/rejected": -1.4029356241226196, + "step": 1100 + }, + { + "epoch": 1.14, + "eval_logits/chosen": 23.538101196289062, + "eval_logits/rejected": 23.34269142150879, + "eval_logps/chosen": -357.8807373046875, + "eval_logps/rejected": -287.3826599121094, + "eval_loss": 0.5019155144691467, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.3310595154762268, + "eval_rewards/margins": 0.9988633990287781, + "eval_rewards/rejected": -1.3299229145050049, + "eval_runtime": 210.9987, + "eval_samples_per_second": 9.479, + "eval_steps_per_second": 0.299, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.432835820895522e-07, + "logits/chosen": 23.460542678833008, + "logits/rejected": 23.26938247680664, + "logps/chosen": -358.67877197265625, + "logps/rejected": -289.0791931152344, + "loss": 0.4235, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.26543277502059937, + "rewards/margins": 1.2576963901519775, + "rewards/rejected": -1.5231291055679321, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.4137007271335626e-07, + "logits/chosen": 23.63456916809082, + "logits/rejected": 23.502344131469727, + "logps/chosen": -284.9480895996094, + "logps/rejected": -279.4847412109375, + "loss": 0.4245, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3471234440803528, + "rewards/margins": 1.1683541536331177, + "rewards/rejected": -1.5154775381088257, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.394565633371603e-07, + "logits/chosen": 23.349411010742188, + "logits/rejected": 23.35630226135254, + "logps/chosen": -341.84881591796875, + "logps/rejected": -319.86358642578125, + "loss": 0.4209, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.29219168424606323, + "rewards/margins": 1.1724560260772705, + "rewards/rejected": -1.464647889137268, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.375430539609644e-07, + "logits/chosen": 23.028972625732422, + "logits/rejected": 23.105947494506836, + "logps/chosen": -298.8593444824219, + "logps/rejected": -308.6123046875, + "loss": 0.4111, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.41705456376075745, + "rewards/margins": 0.954562783241272, + "rewards/rejected": -1.3716174364089966, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3562954458476845e-07, + "logits/chosen": 23.36569595336914, + "logits/rejected": 23.174901962280273, + "logps/chosen": -414.62567138671875, + "logps/rejected": -282.6720275878906, + "loss": 0.4634, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.30735766887664795, + "rewards/margins": 1.2201875448226929, + "rewards/rejected": -1.5275452136993408, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.337160352085725e-07, + "logits/chosen": 23.13878059387207, + "logits/rejected": 23.121612548828125, + "logps/chosen": -331.0238342285156, + "logps/rejected": -275.01129150390625, + "loss": 0.4363, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.260793000459671, + "rewards/margins": 1.1585649251937866, + "rewards/rejected": -1.4193580150604248, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.3180252583237657e-07, + "logits/chosen": 23.040502548217773, + "logits/rejected": 22.87631607055664, + "logps/chosen": -301.6041564941406, + "logps/rejected": -246.01254272460938, + "loss": 0.4526, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19479836523532867, + "rewards/margins": 1.1458656787872314, + "rewards/rejected": -1.340664029121399, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.2988901645618063e-07, + "logits/chosen": 23.21322250366211, + "logits/rejected": 22.910724639892578, + "logps/chosen": -270.7756652832031, + "logps/rejected": -233.3585968017578, + "loss": 0.4396, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.4070712924003601, + "rewards/margins": 0.8780719041824341, + "rewards/rejected": -1.2851431369781494, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.279755070799847e-07, + "logits/chosen": 23.265064239501953, + "logits/rejected": 23.274433135986328, + "logps/chosen": -315.3988037109375, + "logps/rejected": -291.09375, + "loss": 0.4049, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.24129147827625275, + "rewards/margins": 1.1495540142059326, + "rewards/rejected": -1.390845537185669, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.260619977037887e-07, + "logits/chosen": 23.321971893310547, + "logits/rejected": 23.265857696533203, + "logps/chosen": -304.5440368652344, + "logps/rejected": -283.64764404296875, + "loss": 0.4228, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15949824452400208, + "rewards/margins": 1.1103687286376953, + "rewards/rejected": -1.269866704940796, + "step": 1200 + }, + { + "epoch": 1.24, + "eval_logits/chosen": 23.51008415222168, + "eval_logits/rejected": 23.319059371948242, + "eval_logps/chosen": -355.1871337890625, + "eval_logps/rejected": -285.07257080078125, + "eval_loss": 0.5033829212188721, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -0.06169680133461952, + "eval_rewards/margins": 1.0372183322906494, + "eval_rewards/rejected": -1.0989152193069458, + "eval_runtime": 207.9261, + "eval_samples_per_second": 9.619, + "eval_steps_per_second": 0.303, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2414848832759277e-07, + "logits/chosen": 23.359235763549805, + "logits/rejected": 23.241931915283203, + "logps/chosen": -253.0906219482422, + "logps/rejected": -248.0492706298828, + "loss": 0.4028, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.09821876138448715, + "rewards/margins": 1.3042573928833008, + "rewards/rejected": -1.4024760723114014, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2223497895139683e-07, + "logits/chosen": 23.251922607421875, + "logits/rejected": 23.224475860595703, + "logps/chosen": -323.7980041503906, + "logps/rejected": -312.51934814453125, + "loss": 0.4376, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.02340073511004448, + "rewards/margins": 1.2130701541900635, + "rewards/rejected": -1.2364708185195923, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.203214695752009e-07, + "logits/chosen": 23.745332717895508, + "logits/rejected": 23.615753173828125, + "logps/chosen": -313.46453857421875, + "logps/rejected": -308.4986877441406, + "loss": 0.4391, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15489891171455383, + "rewards/margins": 0.8978082537651062, + "rewards/rejected": -1.0527071952819824, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.1840796019900495e-07, + "logits/chosen": 23.76753044128418, + "logits/rejected": 23.41860580444336, + "logps/chosen": -348.83258056640625, + "logps/rejected": -300.46893310546875, + "loss": 0.4562, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.1853850781917572, + "rewards/margins": 1.198961853981018, + "rewards/rejected": -1.3843467235565186, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.16494450822809e-07, + "logits/chosen": 23.356491088867188, + "logits/rejected": 23.26506996154785, + "logps/chosen": -251.2193145751953, + "logps/rejected": -268.14215087890625, + "loss": 0.4412, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5262545347213745, + "rewards/margins": 0.6762484312057495, + "rewards/rejected": -1.202502965927124, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.145809414466131e-07, + "logits/chosen": 23.517498016357422, + "logits/rejected": 23.316844940185547, + "logps/chosen": -342.25604248046875, + "logps/rejected": -239.2180633544922, + "loss": 0.4239, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.42340001463890076, + "rewards/margins": 1.1046512126922607, + "rewards/rejected": -1.5280513763427734, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.1266743207041714e-07, + "logits/chosen": 23.371051788330078, + "logits/rejected": 23.382333755493164, + "logps/chosen": -388.79425048828125, + "logps/rejected": -311.9518127441406, + "loss": 0.4435, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25299787521362305, + "rewards/margins": 1.0895098447799683, + "rewards/rejected": -1.3425077199935913, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.107539226942212e-07, + "logits/chosen": 23.398571014404297, + "logits/rejected": 23.297183990478516, + "logps/chosen": -284.0433349609375, + "logps/rejected": -269.79901123046875, + "loss": 0.4612, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.34444212913513184, + "rewards/margins": 1.0336390733718872, + "rewards/rejected": -1.3780810832977295, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0884041331802526e-07, + "logits/chosen": 23.48178482055664, + "logits/rejected": 23.24820899963379, + "logps/chosen": -316.18109130859375, + "logps/rejected": -268.31182861328125, + "loss": 0.4396, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.26420363783836365, + "rewards/margins": 1.2460224628448486, + "rewards/rejected": -1.5102260112762451, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.0692690394182927e-07, + "logits/chosen": 23.08510971069336, + "logits/rejected": 22.929636001586914, + "logps/chosen": -303.6866760253906, + "logps/rejected": -280.0844421386719, + "loss": 0.4306, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.46213406324386597, + "rewards/margins": 1.072772741317749, + "rewards/rejected": -1.5349067449569702, + "step": 1300 + }, + { + "epoch": 1.34, + "eval_logits/chosen": 23.478702545166016, + "eval_logits/rejected": 23.2889404296875, + "eval_logps/chosen": -356.1548767089844, + "eval_logps/rejected": -285.9320373535156, + "eval_loss": 0.5032446384429932, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.15847428143024445, + "eval_rewards/margins": 1.0263888835906982, + "eval_rewards/rejected": -1.1848632097244263, + "eval_runtime": 214.9168, + "eval_samples_per_second": 9.306, + "eval_steps_per_second": 0.293, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0501339456563334e-07, + "logits/chosen": 23.21750831604004, + "logits/rejected": 23.04998016357422, + "logps/chosen": -319.6648254394531, + "logps/rejected": -272.9951477050781, + "loss": 0.42, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12266921997070312, + "rewards/margins": 1.1443729400634766, + "rewards/rejected": -1.2670420408248901, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.030998851894374e-07, + "logits/chosen": 22.977802276611328, + "logits/rejected": 22.969890594482422, + "logps/chosen": -301.5118408203125, + "logps/rejected": -249.35372924804688, + "loss": 0.4142, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41769084334373474, + "rewards/margins": 0.9436267614364624, + "rewards/rejected": -1.3613176345825195, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.0118637581324146e-07, + "logits/chosen": 23.275325775146484, + "logits/rejected": 23.174297332763672, + "logps/chosen": -339.08258056640625, + "logps/rejected": -282.3678283691406, + "loss": 0.422, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15730026364326477, + "rewards/margins": 1.1286394596099854, + "rewards/rejected": -1.2859396934509277, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.992728664370455e-07, + "logits/chosen": 23.317642211914062, + "logits/rejected": 23.297130584716797, + "logps/chosen": -263.40313720703125, + "logps/rejected": -259.40655517578125, + "loss": 0.4378, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5026736259460449, + "rewards/margins": 0.7866870760917664, + "rewards/rejected": -1.289360761642456, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.973593570608496e-07, + "logits/chosen": 23.30654525756836, + "logits/rejected": 23.131305694580078, + "logps/chosen": -299.3922424316406, + "logps/rejected": -269.0783386230469, + "loss": 0.4366, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.43080934882164, + "rewards/margins": 1.0835515260696411, + "rewards/rejected": -1.514360785484314, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9544584768465365e-07, + "logits/chosen": 23.53885269165039, + "logits/rejected": 23.299760818481445, + "logps/chosen": -324.5937194824219, + "logps/rejected": -272.784912109375, + "loss": 0.4427, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.40283799171447754, + "rewards/margins": 1.1869771480560303, + "rewards/rejected": -1.5898151397705078, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.935323383084577e-07, + "logits/chosen": 23.592838287353516, + "logits/rejected": 23.14777183532715, + "logps/chosen": -392.72662353515625, + "logps/rejected": -286.9781494140625, + "loss": 0.4427, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.39330539107322693, + "rewards/margins": 1.0419480800628662, + "rewards/rejected": -1.4352535009384155, + "step": 1370 + }, + { + "epoch": 1.42, + "learning_rate": 2.9161882893226177e-07, + "logits/chosen": 23.269372940063477, + "logits/rejected": 22.95911979675293, + "logps/chosen": -316.82574462890625, + "logps/rejected": -261.1885070800781, + "loss": 0.433, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.317082941532135, + "rewards/margins": 1.2134672403335571, + "rewards/rejected": -1.530550241470337, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 2.8970531955606583e-07, + "logits/chosen": 23.39419937133789, + "logits/rejected": 23.142974853515625, + "logps/chosen": -336.1145935058594, + "logps/rejected": -242.3621826171875, + "loss": 0.4514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19971036911010742, + "rewards/margins": 1.381958246231079, + "rewards/rejected": -1.5816686153411865, + "step": 1390 + }, + { + "epoch": 1.45, + "learning_rate": 2.8779181017986984e-07, + "logits/chosen": 23.005327224731445, + "logits/rejected": 22.998939514160156, + "logps/chosen": -389.1512756347656, + "logps/rejected": -290.7593688964844, + "loss": 0.4678, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.16274584829807281, + "rewards/margins": 1.0353623628616333, + "rewards/rejected": -1.198108196258545, + "step": 1400 + }, + { + "epoch": 1.45, + "eval_logits/chosen": 23.455062866210938, + "eval_logits/rejected": 23.266075134277344, + "eval_logps/chosen": -356.9206848144531, + "eval_logps/rejected": -285.68414306640625, + "eval_loss": 0.5029928684234619, + "eval_rewards/accuracies": 0.7817460298538208, + "eval_rewards/chosen": -0.23505355417728424, + "eval_rewards/margins": 0.9250208735466003, + "eval_rewards/rejected": -1.1600743532180786, + "eval_runtime": 212.5498, + "eval_samples_per_second": 9.41, + "eval_steps_per_second": 0.296, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.858783008036739e-07, + "logits/chosen": 23.323863983154297, + "logits/rejected": 23.302270889282227, + "logps/chosen": -362.5963134765625, + "logps/rejected": -284.63519287109375, + "loss": 0.4375, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.15233711898326874, + "rewards/margins": 1.4009491205215454, + "rewards/rejected": -1.5532863140106201, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.8396479142747797e-07, + "logits/chosen": 23.571504592895508, + "logits/rejected": 23.417720794677734, + "logps/chosen": -313.31170654296875, + "logps/rejected": -293.6142272949219, + "loss": 0.4351, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16742083430290222, + "rewards/margins": 1.0415483713150024, + "rewards/rejected": -1.208969235420227, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8205128205128203e-07, + "logits/chosen": 23.33370590209961, + "logits/rejected": 23.32365608215332, + "logps/chosen": -299.41351318359375, + "logps/rejected": -302.18939208984375, + "loss": 0.4146, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1101953536272049, + "rewards/margins": 1.1390321254730225, + "rewards/rejected": -1.2492274045944214, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.801377726750861e-07, + "logits/chosen": 23.091251373291016, + "logits/rejected": 23.06249237060547, + "logps/chosen": -288.70050048828125, + "logps/rejected": -261.63494873046875, + "loss": 0.4288, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.41927942633628845, + "rewards/margins": 0.8092526197433472, + "rewards/rejected": -1.228532075881958, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.7822426329889015e-07, + "logits/chosen": 22.919397354125977, + "logits/rejected": 22.959392547607422, + "logps/chosen": -316.126953125, + "logps/rejected": -247.66845703125, + "loss": 0.4636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22512026131153107, + "rewards/margins": 1.1928333044052124, + "rewards/rejected": -1.4179537296295166, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.763107539226942e-07, + "logits/chosen": 23.120534896850586, + "logits/rejected": 22.996898651123047, + "logps/chosen": -366.4676513671875, + "logps/rejected": -257.8288879394531, + "loss": 0.4564, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3532385230064392, + "rewards/margins": 1.007889986038208, + "rewards/rejected": -1.361128568649292, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.743972445464983e-07, + "logits/chosen": 23.175884246826172, + "logits/rejected": 23.074565887451172, + "logps/chosen": -285.8373718261719, + "logps/rejected": -230.23263549804688, + "loss": 0.4406, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.30376359820365906, + "rewards/margins": 0.9630700945854187, + "rewards/rejected": -1.2668339014053345, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.7248373517030234e-07, + "logits/chosen": 23.030946731567383, + "logits/rejected": 23.114126205444336, + "logps/chosen": -342.0677185058594, + "logps/rejected": -273.91558837890625, + "loss": 0.4265, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3027496933937073, + "rewards/margins": 0.7853070497512817, + "rewards/rejected": -1.0880568027496338, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.705702257941064e-07, + "logits/chosen": 23.109298706054688, + "logits/rejected": 22.95934295654297, + "logps/chosen": -353.9482727050781, + "logps/rejected": -293.26165771484375, + "loss": 0.3973, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19786301255226135, + "rewards/margins": 1.2528201341629028, + "rewards/rejected": -1.4506832361221313, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.686567164179104e-07, + "logits/chosen": 23.07802391052246, + "logits/rejected": 22.99662208557129, + "logps/chosen": -328.4552001953125, + "logps/rejected": -250.57901000976562, + "loss": 0.4317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1944916695356369, + "rewards/margins": 1.2932884693145752, + "rewards/rejected": -1.4877803325653076, + "step": 1500 + }, + { + "epoch": 1.55, + "eval_logits/chosen": 23.452411651611328, + "eval_logits/rejected": 23.262121200561523, + "eval_logps/chosen": -355.9715576171875, + "eval_logps/rejected": -285.541748046875, + "eval_loss": 0.49968841671943665, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -0.1401444375514984, + "eval_rewards/margins": 1.0056895017623901, + "eval_rewards/rejected": -1.1458338499069214, + "eval_runtime": 210.203, + "eval_samples_per_second": 9.515, + "eval_steps_per_second": 0.3, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6674320704171447e-07, + "logits/chosen": 23.38498306274414, + "logits/rejected": 23.101451873779297, + "logps/chosen": -313.84991455078125, + "logps/rejected": -229.9058837890625, + "loss": 0.4147, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2590484023094177, + "rewards/margins": 0.8956319689750671, + "rewards/rejected": -1.1546803712844849, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6482969766551853e-07, + "logits/chosen": 23.376522064208984, + "logits/rejected": 23.071407318115234, + "logps/chosen": -294.4135437011719, + "logps/rejected": -286.2037658691406, + "loss": 0.4243, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.2608863413333893, + "rewards/margins": 1.1486496925354004, + "rewards/rejected": -1.4095360040664673, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.629161882893226e-07, + "logits/chosen": 23.766990661621094, + "logits/rejected": 23.534847259521484, + "logps/chosen": -363.1257629394531, + "logps/rejected": -257.43377685546875, + "loss": 0.4044, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.28155818581581116, + "rewards/margins": 1.1225359439849854, + "rewards/rejected": -1.4040942192077637, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.6100267891312666e-07, + "logits/chosen": 23.727947235107422, + "logits/rejected": 23.551546096801758, + "logps/chosen": -354.59808349609375, + "logps/rejected": -309.74041748046875, + "loss": 0.4358, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2375718653202057, + "rewards/margins": 0.9404077529907227, + "rewards/rejected": -1.177979588508606, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.590891695369307e-07, + "logits/chosen": 23.435352325439453, + "logits/rejected": 23.340909957885742, + "logps/chosen": -323.2891845703125, + "logps/rejected": -256.3253479003906, + "loss": 0.4108, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.21504418551921844, + "rewards/margins": 1.1017714738845825, + "rewards/rejected": -1.316815733909607, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.571756601607348e-07, + "logits/chosen": 23.279882431030273, + "logits/rejected": 22.866252899169922, + "logps/chosen": -376.61431884765625, + "logps/rejected": -252.8503875732422, + "loss": 0.4265, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.24882745742797852, + "rewards/margins": 1.2063392400741577, + "rewards/rejected": -1.4551665782928467, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5526215078453884e-07, + "logits/chosen": 23.572551727294922, + "logits/rejected": 23.359222412109375, + "logps/chosen": -348.06396484375, + "logps/rejected": -301.94830322265625, + "loss": 0.4353, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1706874668598175, + "rewards/margins": 1.228468656539917, + "rewards/rejected": -1.399156093597412, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.533486414083429e-07, + "logits/chosen": 23.559356689453125, + "logits/rejected": 23.3609676361084, + "logps/chosen": -354.1952209472656, + "logps/rejected": -299.01385498046875, + "loss": 0.392, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025229115039110184, + "rewards/margins": 1.3790032863616943, + "rewards/rejected": -1.3537743091583252, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5143513203214697e-07, + "logits/chosen": 23.176847457885742, + "logits/rejected": 23.128990173339844, + "logps/chosen": -395.842529296875, + "logps/rejected": -295.98162841796875, + "loss": 0.4379, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10837908089160919, + "rewards/margins": 1.3724091053009033, + "rewards/rejected": -1.480788230895996, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.49521622655951e-07, + "logits/chosen": 23.313915252685547, + "logits/rejected": 23.103626251220703, + "logps/chosen": -350.606689453125, + "logps/rejected": -294.15594482421875, + "loss": 0.4363, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.2652584910392761, + "rewards/margins": 1.1176103353500366, + "rewards/rejected": -1.382868766784668, + "step": 1600 + }, + { + "epoch": 1.65, + "eval_logits/chosen": 23.417835235595703, + "eval_logits/rejected": 23.231985092163086, + "eval_logps/chosen": -357.8829650878906, + "eval_logps/rejected": -287.6752014160156, + "eval_loss": 0.5009579062461853, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -0.3312842845916748, + "eval_rewards/margins": 1.0278921127319336, + "eval_rewards/rejected": -1.3591763973236084, + "eval_runtime": 211.3907, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 0.298, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.4760811327975504e-07, + "logits/chosen": 23.531373977661133, + "logits/rejected": 23.429351806640625, + "logps/chosen": -347.46490478515625, + "logps/rejected": -289.24176025390625, + "loss": 0.4249, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.28774064779281616, + "rewards/margins": 1.0696442127227783, + "rewards/rejected": -1.3573849201202393, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.456946039035591e-07, + "logits/chosen": 23.160110473632812, + "logits/rejected": 23.07761001586914, + "logps/chosen": -372.6455993652344, + "logps/rejected": -254.6509246826172, + "loss": 0.4312, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.29867392778396606, + "rewards/margins": 1.093621850013733, + "rewards/rejected": -1.3922955989837646, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.4378109452736316e-07, + "logits/chosen": 23.148435592651367, + "logits/rejected": 23.090463638305664, + "logps/chosen": -316.5802917480469, + "logps/rejected": -288.7501220703125, + "loss": 0.4262, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3632759749889374, + "rewards/margins": 1.0216796398162842, + "rewards/rejected": -1.3849557638168335, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.418675851511672e-07, + "logits/chosen": 23.17940330505371, + "logits/rejected": 23.190204620361328, + "logps/chosen": -346.47625732421875, + "logps/rejected": -270.1147155761719, + "loss": 0.4333, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2974759638309479, + "rewards/margins": 1.0174903869628906, + "rewards/rejected": -1.3149662017822266, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.399540757749713e-07, + "logits/chosen": 23.138202667236328, + "logits/rejected": 22.985610961914062, + "logps/chosen": -340.69940185546875, + "logps/rejected": -311.3159484863281, + "loss": 0.4341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1574067324399948, + "rewards/margins": 1.2777574062347412, + "rewards/rejected": -1.4351643323898315, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3804056639877535e-07, + "logits/chosen": 23.348569869995117, + "logits/rejected": 23.167980194091797, + "logps/chosen": -273.6280822753906, + "logps/rejected": -238.4679412841797, + "loss": 0.4392, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.03210877254605293, + "rewards/margins": 1.156890630722046, + "rewards/rejected": -1.1889994144439697, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.361270570225794e-07, + "logits/chosen": 23.42279052734375, + "logits/rejected": 23.08903694152832, + "logps/chosen": -358.487548828125, + "logps/rejected": -266.75341796875, + "loss": 0.3848, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0037168667186051607, + "rewards/margins": 1.2687807083129883, + "rewards/rejected": -1.272497534751892, + "step": 1670 + }, + { + "epoch": 1.73, + "learning_rate": 2.3421354764638345e-07, + "logits/chosen": 23.328954696655273, + "logits/rejected": 23.21335792541504, + "logps/chosen": -294.31402587890625, + "logps/rejected": -263.2884826660156, + "loss": 0.44, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1213906854391098, + "rewards/margins": 0.9413496255874634, + "rewards/rejected": -1.0627403259277344, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 2.323000382701875e-07, + "logits/chosen": 23.385282516479492, + "logits/rejected": 23.229637145996094, + "logps/chosen": -392.8078308105469, + "logps/rejected": -314.957275390625, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1200430616736412, + "rewards/margins": 1.2001924514770508, + "rewards/rejected": -1.3202354907989502, + "step": 1690 + }, + { + "epoch": 1.76, + "learning_rate": 2.3038652889399157e-07, + "logits/chosen": 23.32499122619629, + "logits/rejected": 23.208293914794922, + "logps/chosen": -338.33892822265625, + "logps/rejected": -305.3815612792969, + "loss": 0.408, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3193683624267578, + "rewards/margins": 1.028808832168579, + "rewards/rejected": -1.348177433013916, + "step": 1700 + }, + { + "epoch": 1.76, + "eval_logits/chosen": 23.395021438598633, + "eval_logits/rejected": 23.213520050048828, + "eval_logps/chosen": -357.0264892578125, + "eval_logps/rejected": -287.1567687988281, + "eval_loss": 0.4989284873008728, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.24563594162464142, + "eval_rewards/margins": 1.0617001056671143, + "eval_rewards/rejected": -1.3073359727859497, + "eval_runtime": 212.6457, + "eval_samples_per_second": 9.405, + "eval_steps_per_second": 0.296, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 2.2847301951779563e-07, + "logits/chosen": 23.246747970581055, + "logits/rejected": 23.268218994140625, + "logps/chosen": -298.40838623046875, + "logps/rejected": -294.05877685546875, + "loss": 0.4063, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.37956395745277405, + "rewards/margins": 1.2896459102630615, + "rewards/rejected": -1.6692098379135132, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.265595101415997e-07, + "logits/chosen": 23.170940399169922, + "logits/rejected": 23.139057159423828, + "logps/chosen": -333.7245178222656, + "logps/rejected": -287.9451599121094, + "loss": 0.4352, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.23647813498973846, + "rewards/margins": 1.159517526626587, + "rewards/rejected": -1.3959954977035522, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.2464600076540373e-07, + "logits/chosen": 23.052528381347656, + "logits/rejected": 22.96520233154297, + "logps/chosen": -327.0295104980469, + "logps/rejected": -272.34539794921875, + "loss": 0.4108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3730023503303528, + "rewards/margins": 1.0741784572601318, + "rewards/rejected": -1.4471808671951294, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.227324913892078e-07, + "logits/chosen": 23.350069046020508, + "logits/rejected": 23.162134170532227, + "logps/chosen": -338.16583251953125, + "logps/rejected": -292.6080627441406, + "loss": 0.4213, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24311116337776184, + "rewards/margins": 1.0276445150375366, + "rewards/rejected": -1.270755648612976, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2081898201301186e-07, + "logits/chosen": 23.173582077026367, + "logits/rejected": 23.145183563232422, + "logps/chosen": -329.4697265625, + "logps/rejected": -266.3700256347656, + "loss": 0.407, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.023505190387368202, + "rewards/margins": 1.2212746143341064, + "rewards/rejected": -1.2447797060012817, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1890547263681592e-07, + "logits/chosen": 23.282878875732422, + "logits/rejected": 23.070077896118164, + "logps/chosen": -321.710205078125, + "logps/rejected": -277.3352966308594, + "loss": 0.4071, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.25008895993232727, + "rewards/margins": 1.3580764532089233, + "rewards/rejected": -1.6081653833389282, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1699196326061998e-07, + "logits/chosen": 23.08016586303711, + "logits/rejected": 23.203523635864258, + "logps/chosen": -297.59130859375, + "logps/rejected": -280.3871765136719, + "loss": 0.4554, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.3134928345680237, + "rewards/margins": 1.07438063621521, + "rewards/rejected": -1.387873649597168, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.1507845388442402e-07, + "logits/chosen": 23.223857879638672, + "logits/rejected": 23.118579864501953, + "logps/chosen": -307.32745361328125, + "logps/rejected": -264.244384765625, + "loss": 0.4215, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4751613140106201, + "rewards/margins": 1.2573087215423584, + "rewards/rejected": -1.732469916343689, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.1316494450822808e-07, + "logits/chosen": 23.08175277709961, + "logits/rejected": 23.210302352905273, + "logps/chosen": -348.2358093261719, + "logps/rejected": -269.00909423828125, + "loss": 0.4148, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.5350313186645508, + "rewards/margins": 1.2447912693023682, + "rewards/rejected": -1.779822587966919, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125143513203214e-07, + "logits/chosen": 23.117109298706055, + "logits/rejected": 23.03956413269043, + "logps/chosen": -350.7597961425781, + "logps/rejected": -257.7859191894531, + "loss": 0.4076, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.42596331238746643, + "rewards/margins": 1.1088837385177612, + "rewards/rejected": -1.5348470211029053, + "step": 1800 + }, + { + "epoch": 1.86, + "eval_logits/chosen": 23.361677169799805, + "eval_logits/rejected": 23.18657112121582, + "eval_logps/chosen": -358.4737854003906, + "eval_logps/rejected": -288.44818115234375, + "eval_loss": 0.4995974004268646, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -0.39036476612091064, + "eval_rewards/margins": 1.0461114645004272, + "eval_rewards/rejected": -1.4364763498306274, + "eval_runtime": 207.254, + "eval_samples_per_second": 9.65, + "eval_steps_per_second": 0.304, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.093379257558362e-07, + "logits/chosen": 23.419483184814453, + "logits/rejected": 23.20507049560547, + "logps/chosen": -321.2181396484375, + "logps/rejected": -264.3491516113281, + "loss": 0.4189, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.5122248530387878, + "rewards/margins": 1.2178277969360352, + "rewards/rejected": -1.7300525903701782, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0742441637964026e-07, + "logits/chosen": 22.683643341064453, + "logits/rejected": 22.83184242248535, + "logps/chosen": -339.54498291015625, + "logps/rejected": -250.641845703125, + "loss": 0.4102, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3275443911552429, + "rewards/margins": 1.4303803443908691, + "rewards/rejected": -1.7579247951507568, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.055109070034443e-07, + "logits/chosen": 23.576740264892578, + "logits/rejected": 23.395267486572266, + "logps/chosen": -349.0409240722656, + "logps/rejected": -268.94268798828125, + "loss": 0.4055, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.29274967312812805, + "rewards/margins": 1.1743746995925903, + "rewards/rejected": -1.4671242237091064, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0359739762724836e-07, + "logits/chosen": 23.352815628051758, + "logits/rejected": 23.3568058013916, + "logps/chosen": -352.31304931640625, + "logps/rejected": -284.7132873535156, + "loss": 0.4133, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.33104512095451355, + "rewards/margins": 1.168330192565918, + "rewards/rejected": -1.499375343322754, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0168388825105242e-07, + "logits/chosen": 23.261768341064453, + "logits/rejected": 23.228496551513672, + "logps/chosen": -345.1585998535156, + "logps/rejected": -339.8830871582031, + "loss": 0.3868, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.12148020416498184, + "rewards/margins": 1.3025624752044678, + "rewards/rejected": -1.424042820930481, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": 23.22171401977539, + "logits/rejected": 22.969928741455078, + "logps/chosen": -328.1639099121094, + "logps/rejected": -222.90762329101562, + "loss": 0.406, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.09366317093372345, + "rewards/margins": 1.3427413702011108, + "rewards/rejected": -1.4364044666290283, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9785686949866055e-07, + "logits/chosen": 22.87033462524414, + "logits/rejected": 23.068653106689453, + "logps/chosen": -331.31146240234375, + "logps/rejected": -285.5386962890625, + "loss": 0.4169, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.057704973965883255, + "rewards/margins": 1.1090171337127686, + "rewards/rejected": -1.1667221784591675, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9594336012246458e-07, + "logits/chosen": 23.055828094482422, + "logits/rejected": 23.081539154052734, + "logps/chosen": -312.4957275390625, + "logps/rejected": -268.7204895019531, + "loss": 0.3909, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.07027649134397507, + "rewards/margins": 1.1952614784240723, + "rewards/rejected": -1.2655378580093384, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": 22.958328247070312, + "logits/rejected": 23.175926208496094, + "logps/chosen": -312.7813720703125, + "logps/rejected": -272.52215576171875, + "loss": 0.4153, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2530692219734192, + "rewards/margins": 1.2337100505828857, + "rewards/rejected": -1.4867792129516602, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.921163413700727e-07, + "logits/chosen": 23.173688888549805, + "logits/rejected": 23.038455963134766, + "logps/chosen": -327.50531005859375, + "logps/rejected": -257.269287109375, + "loss": 0.4547, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.3794935941696167, + "rewards/margins": 1.0488073825836182, + "rewards/rejected": -1.4283010959625244, + "step": 1900 + }, + { + "epoch": 1.96, + "eval_logits/chosen": 23.329803466796875, + "eval_logits/rejected": 23.160478591918945, + "eval_logps/chosen": -357.08575439453125, + "eval_logps/rejected": -286.7316589355469, + "eval_loss": 0.5008072853088379, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": -0.25156161189079285, + "eval_rewards/margins": 1.0132601261138916, + "eval_rewards/rejected": -1.2648216485977173, + "eval_runtime": 212.8249, + "eval_samples_per_second": 9.397, + "eval_steps_per_second": 0.296, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9020283199387677e-07, + "logits/chosen": 23.098756790161133, + "logits/rejected": 23.10630226135254, + "logps/chosen": -345.1263122558594, + "logps/rejected": -260.34613037109375, + "loss": 0.4359, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.14109382033348083, + "rewards/margins": 1.0066546201705933, + "rewards/rejected": -1.147748589515686, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": 23.233016967773438, + "logits/rejected": 23.24778175354004, + "logps/chosen": -295.8356628417969, + "logps/rejected": -244.9901885986328, + "loss": 0.4224, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.49083733558654785, + "rewards/margins": 1.3773233890533447, + "rewards/rejected": -1.868160605430603, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.8637581324148487e-07, + "logits/chosen": 23.030765533447266, + "logits/rejected": 22.91606330871582, + "logps/chosen": -311.2125549316406, + "logps/rejected": -287.0599670410156, + "loss": 0.4249, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.3956405520439148, + "rewards/margins": 1.194427728652954, + "rewards/rejected": -1.5900681018829346, + "step": 1930 + }, + { + "epoch": 2.0, + "learning_rate": 1.8446230386528893e-07, + "logits/chosen": 22.784257888793945, + "logits/rejected": 22.93459701538086, + "logps/chosen": -270.6285095214844, + "logps/rejected": -242.1689453125, + "loss": 0.3953, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.2096777856349945, + "rewards/margins": 1.2774814367294312, + "rewards/rejected": -1.487159252166748, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": 22.975915908813477, + "logits/rejected": 23.076732635498047, + "logps/chosen": -285.87164306640625, + "logps/rejected": -285.27105712890625, + "loss": 0.3506, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.4368739724159241, + "rewards/margins": 1.2533791065216064, + "rewards/rejected": -1.6902532577514648, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8063528511289706e-07, + "logits/chosen": 23.094058990478516, + "logits/rejected": 23.000532150268555, + "logps/chosen": -311.85418701171875, + "logps/rejected": -342.6611633300781, + "loss": 0.3454, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3331337571144104, + "rewards/margins": 1.497775912284851, + "rewards/rejected": -1.8309099674224854, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7872177573670112e-07, + "logits/chosen": 23.15587043762207, + "logits/rejected": 22.996431350708008, + "logps/chosen": -284.33099365234375, + "logps/rejected": -247.60586547851562, + "loss": 0.3782, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.3313903212547302, + "rewards/margins": 1.394980549812317, + "rewards/rejected": -1.7263710498809814, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": 23.24311637878418, + "logits/rejected": 23.19965171813965, + "logps/chosen": -307.9765930175781, + "logps/rejected": -258.64697265625, + "loss": 0.3464, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.12858574092388153, + "rewards/margins": 1.5073888301849365, + "rewards/rejected": -1.635974645614624, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 1.7489475698430921e-07, + "logits/chosen": 23.421756744384766, + "logits/rejected": 23.180278778076172, + "logps/chosen": -351.59796142578125, + "logps/rejected": -271.3390808105469, + "loss": 0.3522, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.3069804012775421, + "rewards/margins": 1.5511845350265503, + "rewards/rejected": -1.8581645488739014, + "step": 1990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7298124760811328e-07, + "logits/chosen": 23.29781723022461, + "logits/rejected": 23.215654373168945, + "logps/chosen": -335.22943115234375, + "logps/rejected": -245.5157470703125, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39654579758644104, + "rewards/margins": 1.623525857925415, + "rewards/rejected": -2.020071506500244, + "step": 2000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": 23.29904556274414, + "eval_logits/rejected": 23.136056900024414, + "eval_logps/chosen": -357.43829345703125, + "eval_logps/rejected": -287.9998779296875, + "eval_loss": 0.49774664640426636, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.28681743144989014, + "eval_rewards/margins": 1.10482656955719, + "eval_rewards/rejected": -1.39164400100708, + "eval_runtime": 207.5885, + "eval_samples_per_second": 9.634, + "eval_steps_per_second": 0.303, + "step": 2000 + }, + { + "epoch": 2.08, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": 23.3192138671875, + "logits/rejected": 23.1693115234375, + "logps/chosen": -360.62701416015625, + "logps/rejected": -267.00836181640625, + "loss": 0.3505, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2135746031999588, + "rewards/margins": 1.6586072444915771, + "rewards/rejected": -1.8721816539764404, + "step": 2010 + }, + { + "epoch": 2.09, + "learning_rate": 1.691542288557214e-07, + "logits/chosen": 22.96431541442871, + "logits/rejected": 22.824430465698242, + "logps/chosen": -327.3959655761719, + "logps/rejected": -293.3819580078125, + "loss": 0.3412, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4780098795890808, + "rewards/margins": 1.3323651552200317, + "rewards/rejected": -1.8103749752044678, + "step": 2020 + }, + { + "epoch": 2.1, + "learning_rate": 1.6724071947952544e-07, + "logits/chosen": 22.99736213684082, + "logits/rejected": 22.809282302856445, + "logps/chosen": -286.00433349609375, + "logps/rejected": -253.54635620117188, + "loss": 0.3509, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.5133158564567566, + "rewards/margins": 1.4120731353759766, + "rewards/rejected": -1.9253889322280884, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": 22.923995971679688, + "logits/rejected": 22.87368392944336, + "logps/chosen": -293.9073791503906, + "logps/rejected": -263.51397705078125, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28854408860206604, + "rewards/margins": 1.3332937955856323, + "rewards/rejected": -1.6218379735946655, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6341370072713356e-07, + "logits/chosen": 23.294551849365234, + "logits/rejected": 23.248403549194336, + "logps/chosen": -354.1997985839844, + "logps/rejected": -338.943603515625, + "loss": 0.3538, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2328769713640213, + "rewards/margins": 1.2254259586334229, + "rewards/rejected": -1.4583029747009277, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6150019135093762e-07, + "logits/chosen": 23.40046501159668, + "logits/rejected": 23.170814514160156, + "logps/chosen": -376.11871337890625, + "logps/rejected": -280.2356872558594, + "loss": 0.3409, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.1297599822282791, + "rewards/margins": 1.5910810232162476, + "rewards/rejected": -1.7208411693572998, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": 23.402286529541016, + "logits/rejected": 23.344829559326172, + "logps/chosen": -316.00634765625, + "logps/rejected": -308.0472717285156, + "loss": 0.3556, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.00810793973505497, + "rewards/margins": 1.592138409614563, + "rewards/rejected": -1.5840303897857666, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.5767317259854572e-07, + "logits/chosen": 23.049579620361328, + "logits/rejected": 23.11331558227539, + "logps/chosen": -320.4759826660156, + "logps/rejected": -237.2650604248047, + "loss": 0.3655, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.1983695775270462, + "rewards/margins": 1.2847638130187988, + "rewards/rejected": -1.4831334352493286, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5575966322234978e-07, + "logits/chosen": 23.234739303588867, + "logits/rejected": 22.95262336730957, + "logps/chosen": -360.20367431640625, + "logps/rejected": -292.31317138671875, + "loss": 0.3307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17867961525917053, + "rewards/margins": 1.6552801132202148, + "rewards/rejected": -1.8339598178863525, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": 23.25923728942871, + "logits/rejected": 23.007633209228516, + "logps/chosen": -305.6047058105469, + "logps/rejected": -246.4465789794922, + "loss": 0.3547, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2684822380542755, + "rewards/margins": 1.560947060585022, + "rewards/rejected": -1.829429268836975, + "step": 2100 + }, + { + "epoch": 2.17, + "eval_logits/chosen": 23.273019790649414, + "eval_logits/rejected": 23.114229202270508, + "eval_logps/chosen": -358.821044921875, + "eval_logps/rejected": -289.5934753417969, + "eval_loss": 0.49868160486221313, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -0.42508772015571594, + "eval_rewards/margins": 1.125916838645935, + "eval_rewards/rejected": -1.5510046482086182, + "eval_runtime": 211.1219, + "eval_samples_per_second": 9.473, + "eval_steps_per_second": 0.298, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.519326444699579e-07, + "logits/chosen": 23.230274200439453, + "logits/rejected": 23.15807342529297, + "logps/chosen": -334.8540954589844, + "logps/rejected": -263.5167236328125, + "loss": 0.3289, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.251259982585907, + "rewards/margins": 1.5572983026504517, + "rewards/rejected": -1.8085582256317139, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.5001913509376197e-07, + "logits/chosen": 23.217041015625, + "logits/rejected": 23.1701717376709, + "logps/chosen": -330.7687683105469, + "logps/rejected": -285.98907470703125, + "loss": 0.3463, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.2551344037055969, + "rewards/margins": 1.4968717098236084, + "rewards/rejected": -1.7520062923431396, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": 23.419551849365234, + "logits/rejected": 23.216039657592773, + "logps/chosen": -312.8890380859375, + "logps/rejected": -267.16729736328125, + "loss": 0.3688, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.34911391139030457, + "rewards/margins": 1.548995018005371, + "rewards/rejected": -1.8981088399887085, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.4619211634137007e-07, + "logits/chosen": 22.947824478149414, + "logits/rejected": 22.82015037536621, + "logps/chosen": -313.2303771972656, + "logps/rejected": -252.9609375, + "loss": 0.3447, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2606348693370819, + "rewards/margins": 1.5146172046661377, + "rewards/rejected": -1.7752519845962524, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4427860696517413e-07, + "logits/chosen": 23.21828269958496, + "logits/rejected": 23.22684097290039, + "logps/chosen": -369.90924072265625, + "logps/rejected": -314.48016357421875, + "loss": 0.3561, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.12711670994758606, + "rewards/margins": 1.666666030883789, + "rewards/rejected": -1.7937828302383423, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": 23.231754302978516, + "logits/rejected": 23.19542694091797, + "logps/chosen": -351.79913330078125, + "logps/rejected": -271.44427490234375, + "loss": 0.3303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14719603955745697, + "rewards/margins": 1.6395785808563232, + "rewards/rejected": -1.7867748737335205, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4045158821278225e-07, + "logits/chosen": 22.95505142211914, + "logits/rejected": 22.855873107910156, + "logps/chosen": -340.75238037109375, + "logps/rejected": -316.02386474609375, + "loss": 0.3293, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.26123401522636414, + "rewards/margins": 1.4234856367111206, + "rewards/rejected": -1.6847198009490967, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.3853807883658632e-07, + "logits/chosen": 23.029155731201172, + "logits/rejected": 23.120052337646484, + "logps/chosen": -371.22100830078125, + "logps/rejected": -319.261474609375, + "loss": 0.3694, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.3324764668941498, + "rewards/margins": 1.6106780767440796, + "rewards/rejected": -1.9431545734405518, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": 23.185413360595703, + "logits/rejected": 22.925167083740234, + "logps/chosen": -305.14410400390625, + "logps/rejected": -296.2259521484375, + "loss": 0.3664, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.40003857016563416, + "rewards/margins": 1.3513580560684204, + "rewards/rejected": -1.7513965368270874, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3471106008419441e-07, + "logits/chosen": 23.18272590637207, + "logits/rejected": 23.115558624267578, + "logps/chosen": -306.1884460449219, + "logps/rejected": -288.0213928222656, + "loss": 0.3468, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.359417200088501, + "rewards/margins": 1.435274362564087, + "rewards/rejected": -1.7946914434432983, + "step": 2200 + }, + { + "epoch": 2.27, + "eval_logits/chosen": 23.256072998046875, + "eval_logits/rejected": 23.099788665771484, + "eval_logps/chosen": -357.2442932128906, + "eval_logps/rejected": -288.0285339355469, + "eval_loss": 0.49792206287384033, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.26741600036621094, + "eval_rewards/margins": 1.1270908117294312, + "eval_rewards/rejected": -1.394506812095642, + "eval_runtime": 210.9966, + "eval_samples_per_second": 9.479, + "eval_steps_per_second": 0.299, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3279755070799848e-07, + "logits/chosen": 23.182937622070312, + "logits/rejected": 23.00518035888672, + "logps/chosen": -325.1390075683594, + "logps/rejected": -254.9105224609375, + "loss": 0.3562, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2701284885406494, + "rewards/margins": 1.4486573934555054, + "rewards/rejected": -1.7187858819961548, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": 23.192270278930664, + "logits/rejected": 22.86314582824707, + "logps/chosen": -354.3294677734375, + "logps/rejected": -277.08319091796875, + "loss": 0.3275, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3135210871696472, + "rewards/margins": 1.6535711288452148, + "rewards/rejected": -1.9670922756195068, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.289705319556066e-07, + "logits/chosen": 23.285938262939453, + "logits/rejected": 23.20859146118164, + "logps/chosen": -341.8558044433594, + "logps/rejected": -260.49853515625, + "loss": 0.3339, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.49324899911880493, + "rewards/margins": 1.3661364316940308, + "rewards/rejected": -1.8593854904174805, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2705702257941064e-07, + "logits/chosen": 23.054574966430664, + "logits/rejected": 22.94180679321289, + "logps/chosen": -314.9513244628906, + "logps/rejected": -253.89779663085938, + "loss": 0.3683, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.5942099690437317, + "rewards/margins": 1.3034783601760864, + "rewards/rejected": -1.8976882696151733, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": 23.040285110473633, + "logits/rejected": 23.092458724975586, + "logps/chosen": -347.84820556640625, + "logps/rejected": -300.02069091796875, + "loss": 0.3424, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3747033476829529, + "rewards/margins": 1.4116895198822021, + "rewards/rejected": -1.7863928079605103, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2323000382701873e-07, + "logits/chosen": 23.061431884765625, + "logits/rejected": 22.941814422607422, + "logps/chosen": -363.65850830078125, + "logps/rejected": -299.712890625, + "loss": 0.3701, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.34447842836380005, + "rewards/margins": 1.6744133234024048, + "rewards/rejected": -2.0188918113708496, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.213164944508228e-07, + "logits/chosen": 23.18351936340332, + "logits/rejected": 23.060955047607422, + "logps/chosen": -386.75701904296875, + "logps/rejected": -311.6101379394531, + "loss": 0.3375, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.2950562834739685, + "rewards/margins": 1.5026452541351318, + "rewards/rejected": -1.7977014780044556, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": 23.240421295166016, + "logits/rejected": 23.235857009887695, + "logps/chosen": -306.81561279296875, + "logps/rejected": -249.85324096679688, + "loss": 0.3425, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.47128137946128845, + "rewards/margins": 1.526531457901001, + "rewards/rejected": -1.9978128671646118, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 1.1748947569843092e-07, + "logits/chosen": 23.31965446472168, + "logits/rejected": 23.00503158569336, + "logps/chosen": -366.4286193847656, + "logps/rejected": -287.65399169921875, + "loss": 0.3404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20483896136283875, + "rewards/margins": 1.4863460063934326, + "rewards/rejected": -1.6911849975585938, + "step": 2290 + }, + { + "epoch": 2.37, + "learning_rate": 1.1557596632223497e-07, + "logits/chosen": 23.109445571899414, + "logits/rejected": 23.121734619140625, + "logps/chosen": -339.75689697265625, + "logps/rejected": -266.4504699707031, + "loss": 0.3432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.32633692026138306, + "rewards/margins": 1.3159123659133911, + "rewards/rejected": -1.642249345779419, + "step": 2300 + }, + { + "epoch": 2.37, + "eval_logits/chosen": 23.223342895507812, + "eval_logits/rejected": 23.0726318359375, + "eval_logps/chosen": -358.362060546875, + "eval_logps/rejected": -288.7130126953125, + "eval_loss": 0.5026321411132812, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -0.3791937828063965, + "eval_rewards/margins": 1.0837651491165161, + "eval_rewards/rejected": -1.4629590511322021, + "eval_runtime": 212.4288, + "eval_samples_per_second": 9.415, + "eval_steps_per_second": 0.297, + "step": 2300 + }, + { + "epoch": 2.39, + "learning_rate": 1.1366245694603903e-07, + "logits/chosen": 22.923072814941406, + "logits/rejected": 22.96480369567871, + "logps/chosen": -329.4259338378906, + "logps/rejected": -294.24127197265625, + "loss": 0.3706, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3439275026321411, + "rewards/margins": 1.2762069702148438, + "rewards/rejected": -1.6201345920562744, + "step": 2310 + }, + { + "epoch": 2.4, + "learning_rate": 1.1174894756984308e-07, + "logits/chosen": 23.266117095947266, + "logits/rejected": 23.19167137145996, + "logps/chosen": -274.62237548828125, + "logps/rejected": -247.6970672607422, + "loss": 0.3588, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.4183998107910156, + "rewards/margins": 1.5969486236572266, + "rewards/rejected": -2.015348434448242, + "step": 2320 + }, + { + "epoch": 2.41, + "learning_rate": 1.0983543819364714e-07, + "logits/chosen": 23.18532943725586, + "logits/rejected": 23.074344635009766, + "logps/chosen": -333.7981262207031, + "logps/rejected": -262.7727966308594, + "loss": 0.3035, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3411490321159363, + "rewards/margins": 1.6993077993392944, + "rewards/rejected": -2.040456771850586, + "step": 2330 + }, + { + "epoch": 2.42, + "learning_rate": 1.079219288174512e-07, + "logits/chosen": 22.650278091430664, + "logits/rejected": 22.700809478759766, + "logps/chosen": -256.5923156738281, + "logps/rejected": -267.3676452636719, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5605247616767883, + "rewards/margins": 1.3785268068313599, + "rewards/rejected": -1.939051628112793, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0600841944125525e-07, + "logits/chosen": 23.22505760192871, + "logits/rejected": 23.044265747070312, + "logps/chosen": -377.84765625, + "logps/rejected": -308.9931945800781, + "loss": 0.3613, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2691905200481415, + "rewards/margins": 1.7195707559585571, + "rewards/rejected": -1.988761305809021, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0409491006505931e-07, + "logits/chosen": 23.209278106689453, + "logits/rejected": 23.045013427734375, + "logps/chosen": -341.085205078125, + "logps/rejected": -284.80987548828125, + "loss": 0.3373, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.32976096868515015, + "rewards/margins": 1.4313738346099854, + "rewards/rejected": -1.7611347436904907, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": 22.989057540893555, + "logits/rejected": 22.930797576904297, + "logps/chosen": -349.1778564453125, + "logps/rejected": -276.46905517578125, + "loss": 0.351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3507656157016754, + "rewards/margins": 1.4063034057617188, + "rewards/rejected": -1.7570692300796509, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.0026789131266743e-07, + "logits/chosen": 22.69498062133789, + "logits/rejected": 22.762619018554688, + "logps/chosen": -319.37701416015625, + "logps/rejected": -285.0171813964844, + "loss": 0.3298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41146141290664673, + "rewards/margins": 1.4631173610687256, + "rewards/rejected": -1.874578833580017, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.835438193647149e-08, + "logits/chosen": 22.913543701171875, + "logits/rejected": 22.7869873046875, + "logps/chosen": -328.5509948730469, + "logps/rejected": -253.9607696533203, + "loss": 0.3565, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.4099615216255188, + "rewards/margins": 1.3734912872314453, + "rewards/rejected": -1.7834527492523193, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.644087256027554e-08, + "logits/chosen": 23.1809139251709, + "logits/rejected": 23.06944465637207, + "logps/chosen": -286.20904541015625, + "logps/rejected": -235.44140625, + "loss": 0.324, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5443722009658813, + "rewards/margins": 1.3517777919769287, + "rewards/rejected": -1.8961498737335205, + "step": 2400 + }, + { + "epoch": 2.48, + "eval_logits/chosen": 23.200559616088867, + "eval_logits/rejected": 23.054319381713867, + "eval_logps/chosen": -359.46197509765625, + "eval_logps/rejected": -290.1737060546875, + "eval_loss": 0.5021990537643433, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.48918139934539795, + "eval_rewards/margins": 1.1198451519012451, + "eval_rewards/rejected": -1.609026551246643, + "eval_runtime": 211.6095, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 0.298, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.45273631840796e-08, + "logits/chosen": 22.888259887695312, + "logits/rejected": 22.884002685546875, + "logps/chosen": -351.7564392089844, + "logps/rejected": -299.20611572265625, + "loss": 0.3645, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.16312038898468018, + "rewards/margins": 1.5724434852600098, + "rewards/rejected": -1.73556387424469, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.261385380788366e-08, + "logits/chosen": 23.231523513793945, + "logits/rejected": 23.333255767822266, + "logps/chosen": -338.1724548339844, + "logps/rejected": -284.2950744628906, + "loss": 0.3772, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.48969048261642456, + "rewards/margins": 1.1433426141738892, + "rewards/rejected": -1.633033037185669, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": 23.16311264038086, + "logits/rejected": 22.952455520629883, + "logps/chosen": -304.960205078125, + "logps/rejected": -273.458251953125, + "loss": 0.3659, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.46648114919662476, + "rewards/margins": 1.2427117824554443, + "rewards/rejected": -1.7091929912567139, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.878683505549177e-08, + "logits/chosen": 23.104694366455078, + "logits/rejected": 23.080604553222656, + "logps/chosen": -287.4930419921875, + "logps/rejected": -265.7315368652344, + "loss": 0.3395, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0566287636756897, + "rewards/margins": 1.7359685897827148, + "rewards/rejected": -1.7925974130630493, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.687332567929582e-08, + "logits/chosen": 22.983219146728516, + "logits/rejected": 23.094844818115234, + "logps/chosen": -316.54547119140625, + "logps/rejected": -292.8838806152344, + "loss": 0.3579, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4759410321712494, + "rewards/margins": 1.2076809406280518, + "rewards/rejected": -1.6836220026016235, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": 22.804758071899414, + "logits/rejected": 22.753246307373047, + "logps/chosen": -366.3809509277344, + "logps/rejected": -290.44805908203125, + "loss": 0.3613, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.37290042638778687, + "rewards/margins": 1.3757580518722534, + "rewards/rejected": -1.7486584186553955, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.304630692690395e-08, + "logits/chosen": 22.425283432006836, + "logits/rejected": 22.801471710205078, + "logps/chosen": -316.58416748046875, + "logps/rejected": -268.90423583984375, + "loss": 0.3686, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.44941702485084534, + "rewards/margins": 1.3106696605682373, + "rewards/rejected": -1.7600864171981812, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.1132797550708e-08, + "logits/chosen": 23.177642822265625, + "logits/rejected": 23.025049209594727, + "logps/chosen": -345.25958251953125, + "logps/rejected": -269.18951416015625, + "loss": 0.3337, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.33066803216934204, + "rewards/margins": 1.614101767539978, + "rewards/rejected": -1.9447696208953857, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": 23.251216888427734, + "logits/rejected": 23.127471923828125, + "logps/chosen": -321.37078857421875, + "logps/rejected": -251.3941192626953, + "loss": 0.3158, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.3629501461982727, + "rewards/margins": 1.3023030757904053, + "rewards/rejected": -1.6652530431747437, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.73057787983161e-08, + "logits/chosen": 23.11884880065918, + "logits/rejected": 23.025787353515625, + "logps/chosen": -372.3116760253906, + "logps/rejected": -297.9188232421875, + "loss": 0.3556, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08432115614414215, + "rewards/margins": 1.5375818014144897, + "rewards/rejected": -1.6219028234481812, + "step": 2500 + }, + { + "epoch": 2.58, + "eval_logits/chosen": 23.198068618774414, + "eval_logits/rejected": 23.05204963684082, + "eval_logps/chosen": -359.8403625488281, + "eval_logps/rejected": -290.6595458984375, + "eval_loss": 0.5010030269622803, + "eval_rewards/accuracies": 0.7817460298538208, + "eval_rewards/chosen": -0.5270243287086487, + "eval_rewards/margins": 1.130587100982666, + "eval_rewards/rejected": -1.6576114892959595, + "eval_runtime": 208.0836, + "eval_samples_per_second": 9.612, + "eval_steps_per_second": 0.303, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.539226942212017e-08, + "logits/chosen": 22.56097412109375, + "logits/rejected": 22.520360946655273, + "logps/chosen": -299.781982421875, + "logps/rejected": -319.82171630859375, + "loss": 0.3419, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4395579397678375, + "rewards/margins": 1.4742848873138428, + "rewards/rejected": -1.9138429164886475, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": 23.056285858154297, + "logits/rejected": 22.88377571105957, + "logps/chosen": -274.74249267578125, + "logps/rejected": -217.62075805664062, + "loss": 0.3617, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.48249778151512146, + "rewards/margins": 1.2477834224700928, + "rewards/rejected": -1.730281114578247, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.156525066972828e-08, + "logits/chosen": 23.401355743408203, + "logits/rejected": 23.3753604888916, + "logps/chosen": -305.857666015625, + "logps/rejected": -269.84344482421875, + "loss": 0.3566, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5294305086135864, + "rewards/margins": 1.2906124591827393, + "rewards/rejected": -1.8200428485870361, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 6.965174129353234e-08, + "logits/chosen": 23.210468292236328, + "logits/rejected": 23.068767547607422, + "logps/chosen": -418.8457946777344, + "logps/rejected": -322.9905700683594, + "loss": 0.3506, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.12323112785816193, + "rewards/margins": 1.8070284128189087, + "rewards/rejected": -1.9302597045898438, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.773823191733639e-08, + "logits/chosen": 23.159671783447266, + "logits/rejected": 23.031639099121094, + "logps/chosen": -341.6581726074219, + "logps/rejected": -329.7276611328125, + "loss": 0.3489, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.43813997507095337, + "rewards/margins": 1.6730334758758545, + "rewards/rejected": -2.111173152923584, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.582472254114045e-08, + "logits/chosen": 22.655208587646484, + "logits/rejected": 22.458200454711914, + "logps/chosen": -298.61737060546875, + "logps/rejected": -273.9313659667969, + "loss": 0.3689, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.464535653591156, + "rewards/margins": 1.3690606355667114, + "rewards/rejected": -1.8335964679718018, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.391121316494451e-08, + "logits/chosen": 23.089435577392578, + "logits/rejected": 23.145009994506836, + "logps/chosen": -347.81793212890625, + "logps/rejected": -289.2439880371094, + "loss": 0.3245, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.26666170358657837, + "rewards/margins": 1.7196756601333618, + "rewards/rejected": -1.9863373041152954, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": 23.03936767578125, + "logits/rejected": 22.83783531188965, + "logps/chosen": -326.26123046875, + "logps/rejected": -285.95294189453125, + "loss": 0.3369, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3959008455276489, + "rewards/margins": 1.5528860092163086, + "rewards/rejected": -1.948786735534668, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.008419441255262e-08, + "logits/chosen": 22.73525047302246, + "logits/rejected": 22.791269302368164, + "logps/chosen": -291.1043395996094, + "logps/rejected": -250.67880249023438, + "loss": 0.3344, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.7139278650283813, + "rewards/margins": 1.2345958948135376, + "rewards/rejected": -1.948523759841919, + "step": 2590 + }, + { + "epoch": 2.68, + "learning_rate": 5.817068503635668e-08, + "logits/chosen": 23.139039993286133, + "logits/rejected": 23.094404220581055, + "logps/chosen": -375.664794921875, + "logps/rejected": -297.1061706542969, + "loss": 0.3277, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.29812633991241455, + "rewards/margins": 1.5680840015411377, + "rewards/rejected": -1.8662105798721313, + "step": 2600 + }, + { + "epoch": 2.68, + "eval_logits/chosen": 23.19009780883789, + "eval_logits/rejected": 23.044872283935547, + "eval_logps/chosen": -359.9708251953125, + "eval_logps/rejected": -290.89959716796875, + "eval_loss": 0.49901142716407776, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.5400659441947937, + "eval_rewards/margins": 1.141547679901123, + "eval_rewards/rejected": -1.6816134452819824, + "eval_runtime": 212.6416, + "eval_samples_per_second": 9.405, + "eval_steps_per_second": 0.296, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 5.6257175660160735e-08, + "logits/chosen": 23.452491760253906, + "logits/rejected": 23.291522979736328, + "logps/chosen": -321.8898010253906, + "logps/rejected": -302.38250732421875, + "loss": 0.3198, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.5151566863059998, + "rewards/margins": 1.5868747234344482, + "rewards/rejected": -2.1020312309265137, + "step": 2610 + }, + { + "epoch": 2.71, + "learning_rate": 5.4343666283964784e-08, + "logits/chosen": 22.93613052368164, + "logits/rejected": 23.02700424194336, + "logps/chosen": -337.6763000488281, + "logps/rejected": -263.98406982421875, + "loss": 0.3544, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.38289493322372437, + "rewards/margins": 1.6999647617340088, + "rewards/rejected": -2.0828592777252197, + "step": 2620 + }, + { + "epoch": 2.72, + "learning_rate": 5.243015690776884e-08, + "logits/chosen": 23.027408599853516, + "logits/rejected": 23.09657096862793, + "logps/chosen": -300.5941162109375, + "logps/rejected": -263.323486328125, + "loss": 0.3481, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6709809899330139, + "rewards/margins": 1.2178490161895752, + "rewards/rejected": -1.8888299465179443, + "step": 2630 + }, + { + "epoch": 2.73, + "learning_rate": 5.05166475315729e-08, + "logits/chosen": 23.28525161743164, + "logits/rejected": 23.195045471191406, + "logps/chosen": -272.472900390625, + "logps/rejected": -262.8435974121094, + "loss": 0.3379, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8382778167724609, + "rewards/margins": 1.3932462930679321, + "rewards/rejected": -2.2315242290496826, + "step": 2640 + }, + { + "epoch": 2.74, + "learning_rate": 4.860313815537696e-08, + "logits/chosen": 22.973094940185547, + "logits/rejected": 22.961816787719727, + "logps/chosen": -367.30596923828125, + "logps/rejected": -294.1488952636719, + "loss": 0.3489, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19501671195030212, + "rewards/margins": 1.6359647512435913, + "rewards/rejected": -1.8309814929962158, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.668962877918101e-08, + "logits/chosen": 23.02678680419922, + "logits/rejected": 22.784521102905273, + "logps/chosen": -329.62030029296875, + "logps/rejected": -373.6632080078125, + "loss": 0.3306, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.4452723562717438, + "rewards/margins": 1.5809751749038696, + "rewards/rejected": -2.026247501373291, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.477611940298507e-08, + "logits/chosen": 22.902379989624023, + "logits/rejected": 22.912425994873047, + "logps/chosen": -332.11102294921875, + "logps/rejected": -280.44976806640625, + "loss": 0.3247, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.35678499937057495, + "rewards/margins": 1.8047094345092773, + "rewards/rejected": -2.161494493484497, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.2862610026789124e-08, + "logits/chosen": 23.08858871459961, + "logits/rejected": 22.95041275024414, + "logps/chosen": -337.7413330078125, + "logps/rejected": -293.63623046875, + "loss": 0.3618, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3582797646522522, + "rewards/margins": 1.579685091972351, + "rewards/rejected": -1.9379650354385376, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.0949100650593186e-08, + "logits/chosen": 22.903911590576172, + "logits/rejected": 22.945873260498047, + "logps/chosen": -272.99493408203125, + "logps/rejected": -277.4879455566406, + "loss": 0.3657, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6198464035987854, + "rewards/margins": 1.3183465003967285, + "rewards/rejected": -1.9381929636001587, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.903559127439724e-08, + "logits/chosen": 23.07727813720703, + "logits/rejected": 22.927719116210938, + "logps/chosen": -282.8468933105469, + "logps/rejected": -237.0935821533203, + "loss": 0.3262, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40676426887512207, + "rewards/margins": 1.3733875751495361, + "rewards/rejected": -1.7801517248153687, + "step": 2700 + }, + { + "epoch": 2.79, + "eval_logits/chosen": 23.187774658203125, + "eval_logits/rejected": 23.043867111206055, + "eval_logps/chosen": -359.5220031738281, + "eval_logps/rejected": -290.49322509765625, + "eval_loss": 0.4993184804916382, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.4951845407485962, + "eval_rewards/margins": 1.1457940340042114, + "eval_rewards/rejected": -1.6409783363342285, + "eval_runtime": 210.8376, + "eval_samples_per_second": 9.486, + "eval_steps_per_second": 0.299, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.71220818982013e-08, + "logits/chosen": 23.060585021972656, + "logits/rejected": 22.836994171142578, + "logps/chosen": -348.59539794921875, + "logps/rejected": -282.60064697265625, + "loss": 0.3585, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4623526632785797, + "rewards/margins": 1.4493197202682495, + "rewards/rejected": -1.911672592163086, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.520857252200535e-08, + "logits/chosen": 23.332260131835938, + "logits/rejected": 23.22934341430664, + "logps/chosen": -373.9750061035156, + "logps/rejected": -321.8055725097656, + "loss": 0.334, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.24776187539100647, + "rewards/margins": 1.6318897008895874, + "rewards/rejected": -1.879651427268982, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": 23.10513687133789, + "logits/rejected": 23.070053100585938, + "logps/chosen": -295.7916259765625, + "logps/rejected": -298.29132080078125, + "loss": 0.3567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4222725033760071, + "rewards/margins": 1.4338531494140625, + "rewards/rejected": -1.8561254739761353, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.138155376961347e-08, + "logits/chosen": 22.993267059326172, + "logits/rejected": 22.975433349609375, + "logps/chosen": -340.28515625, + "logps/rejected": -270.3987731933594, + "loss": 0.3505, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4524230360984802, + "rewards/margins": 1.500270962715149, + "rewards/rejected": -1.9526941776275635, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 2.9468044393417525e-08, + "logits/chosen": 22.807130813598633, + "logits/rejected": 22.657257080078125, + "logps/chosen": -302.7679748535156, + "logps/rejected": -253.1012420654297, + "loss": 0.3457, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3587990403175354, + "rewards/margins": 1.4861528873443604, + "rewards/rejected": -1.8449519872665405, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": 22.716732025146484, + "logits/rejected": 22.806201934814453, + "logps/chosen": -340.51287841796875, + "logps/rejected": -296.96673583984375, + "loss": 0.3386, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4456048011779785, + "rewards/margins": 1.4643114805221558, + "rewards/rejected": -1.9099165201187134, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.564102564102564e-08, + "logits/chosen": 23.153486251831055, + "logits/rejected": 23.201038360595703, + "logps/chosen": -308.52288818359375, + "logps/rejected": -289.1993408203125, + "loss": 0.3403, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3693477213382721, + "rewards/margins": 1.4097144603729248, + "rewards/rejected": -1.779062032699585, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.3727516264829695e-08, + "logits/chosen": 22.77389907836914, + "logits/rejected": 22.64432144165039, + "logps/chosen": -388.85552978515625, + "logps/rejected": -363.8034362792969, + "loss": 0.3601, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.32134681940078735, + "rewards/margins": 1.4628154039382935, + "rewards/rejected": -1.784161925315857, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": 22.792617797851562, + "logits/rejected": 22.739648818969727, + "logps/chosen": -340.0162353515625, + "logps/rejected": -269.2567443847656, + "loss": 0.3476, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.3537456691265106, + "rewards/margins": 1.403322696685791, + "rewards/rejected": -1.7570682764053345, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 1.990049751243781e-08, + "logits/chosen": 23.177873611450195, + "logits/rejected": 23.13758087158203, + "logps/chosen": -343.5755615234375, + "logps/rejected": -288.75555419921875, + "loss": 0.3566, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.39105096459388733, + "rewards/margins": 1.0802559852600098, + "rewards/rejected": -1.4713070392608643, + "step": 2800 + }, + { + "epoch": 2.89, + "eval_logits/chosen": 23.187063217163086, + "eval_logits/rejected": 23.043275833129883, + "eval_logps/chosen": -359.0445251464844, + "eval_logps/rejected": -290.0010070800781, + "eval_loss": 0.4985302686691284, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.44743794202804565, + "eval_rewards/margins": 1.144317388534546, + "eval_rewards/rejected": -1.5917555093765259, + "eval_runtime": 208.7121, + "eval_samples_per_second": 9.583, + "eval_steps_per_second": 0.302, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.7986988136241865e-08, + "logits/chosen": 22.997955322265625, + "logits/rejected": 23.055164337158203, + "logps/chosen": -362.84918212890625, + "logps/rejected": -298.51922607421875, + "loss": 0.3433, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2676233649253845, + "rewards/margins": 1.3830516338348389, + "rewards/rejected": -1.650674819946289, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": 23.13959312438965, + "logits/rejected": 22.91689682006836, + "logps/chosen": -358.01666259765625, + "logps/rejected": -246.228515625, + "loss": 0.3319, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.24576838314533234, + "rewards/margins": 1.788368582725525, + "rewards/rejected": -2.03413724899292, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4159969383849981e-08, + "logits/chosen": 22.98459243774414, + "logits/rejected": 23.025390625, + "logps/chosen": -346.28973388671875, + "logps/rejected": -279.1742858886719, + "loss": 0.3331, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.40038982033729553, + "rewards/margins": 1.4257802963256836, + "rewards/rejected": -1.8261702060699463, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2246460007654037e-08, + "logits/chosen": 23.19771957397461, + "logits/rejected": 23.1368408203125, + "logps/chosen": -349.0854187011719, + "logps/rejected": -281.1717529296875, + "loss": 0.3685, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.481764018535614, + "rewards/margins": 1.4477870464324951, + "rewards/rejected": -1.929551124572754, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": 23.161651611328125, + "logits/rejected": 23.000064849853516, + "logps/chosen": -338.02288818359375, + "logps/rejected": -283.4983215332031, + "loss": 0.3501, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.45347729325294495, + "rewards/margins": 1.2738498449325562, + "rewards/rejected": -1.7273271083831787, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.419441255262151e-09, + "logits/chosen": 22.9562931060791, + "logits/rejected": 22.93158531188965, + "logps/chosen": -301.29132080078125, + "logps/rejected": -239.3927001953125, + "loss": 0.3382, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.4900715947151184, + "rewards/margins": 1.371382713317871, + "rewards/rejected": -1.8614543676376343, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 6.505931879066207e-09, + "logits/chosen": 22.94473648071289, + "logits/rejected": 23.008617401123047, + "logps/chosen": -302.16436767578125, + "logps/rejected": -269.48828125, + "loss": 0.3549, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.308788925409317, + "rewards/margins": 1.3573650121688843, + "rewards/rejected": -1.666154146194458, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": 23.0222225189209, + "logits/rejected": 22.979480743408203, + "logps/chosen": -329.9389343261719, + "logps/rejected": -272.41351318359375, + "loss": 0.3559, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.35177531838417053, + "rewards/margins": 1.4292513132095337, + "rewards/rejected": -1.781026840209961, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 2.6789131266743202e-09, + "logits/chosen": 23.136159896850586, + "logits/rejected": 23.02133560180664, + "logps/chosen": -328.77093505859375, + "logps/rejected": -275.63995361328125, + "loss": 0.3498, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.4196252226829529, + "rewards/margins": 1.4171664714813232, + "rewards/rejected": -1.8367916345596313, + "step": 2890 + }, + { + "epoch": 2.99, + "learning_rate": 7.654037504783773e-10, + "logits/chosen": 23.257701873779297, + "logits/rejected": 23.05466079711914, + "logps/chosen": -311.91217041015625, + "logps/rejected": -304.32501220703125, + "loss": 0.3386, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.36090317368507385, + "rewards/margins": 1.4512748718261719, + "rewards/rejected": -1.8121780157089233, + "step": 2900 + }, + { + "epoch": 2.99, + "eval_logits/chosen": 23.18655014038086, + "eval_logits/rejected": 23.042728424072266, + "eval_logps/chosen": -359.16790771484375, + "eval_logps/rejected": -290.12347412109375, + "eval_loss": 0.4982847273349762, + "eval_rewards/accuracies": 0.7817460298538208, + "eval_rewards/chosen": -0.4597766697406769, + "eval_rewards/margins": 1.144227385520935, + "eval_rewards/rejected": -1.6040042638778687, + "eval_runtime": 212.9399, + "eval_samples_per_second": 9.392, + "eval_steps_per_second": 0.296, + "step": 2900 + }, + { + "epoch": 3.0, + "step": 2904, + "total_flos": 0.0, + "train_loss": 0.446941960284861, + "train_runtime": 57869.3533, + "train_samples_per_second": 3.212, + "train_steps_per_second": 0.05 + } + ], + "logging_steps": 10, + "max_steps": 2904, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}