{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 8826, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.662514156285391e-10, "logits/chosen": 2.7222177982330322, "logits/rejected": 2.6171863079071045, "logps/chosen": -391.45166015625, "logps/rejected": -819.539306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 5.66251415628539e-09, "logits/chosen": 1.2315014600753784, "logits/rejected": 2.9117307662963867, "logps/chosen": -618.143798828125, "logps/rejected": -580.205810546875, "loss": 0.6935, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.007726929150521755, "rewards/margins": 0.004576317500323057, "rewards/rejected": 0.0031506128143519163, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.132502831257078e-08, "logits/chosen": 1.0855623483657837, "logits/rejected": 3.2654030323028564, "logps/chosen": -365.8334655761719, "logps/rejected": -507.7064514160156, "loss": 0.6945, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00016126893751788884, "rewards/margins": -0.011320212855935097, "rewards/rejected": 0.011481483466923237, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.698754246885617e-08, "logits/chosen": 1.2892698049545288, "logits/rejected": 2.8500564098358154, "logps/chosen": -507.72869873046875, "logps/rejected": -473.02337646484375, "loss": 0.6923, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0013575742486864328, "rewards/margins": 0.011692820116877556, "rewards/rejected": -0.010335246101021767, "step": 30 }, { "epoch": 0.01, "learning_rate": 2.265005662514156e-08, "logits/chosen": 1.204493761062622, "logits/rejected": 3.2822766304016113, "logps/chosen": -384.868408203125, "logps/rejected": -533.8895874023438, "loss": 0.692, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.005484940949827433, "rewards/margins": -0.0033146303612738848, "rewards/rejected": -0.002170309191569686, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.8312570781426952e-08, "logits/chosen": 0.5130025148391724, "logits/rejected": 2.7183468341827393, "logps/chosen": -339.9174499511719, "logps/rejected": -595.5003051757812, "loss": 0.6896, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0010748256463557482, "rewards/margins": 0.004564857110381126, "rewards/rejected": -0.0034900312311947346, "step": 50 }, { "epoch": 0.02, "learning_rate": 3.397508493771234e-08, "logits/chosen": 1.5610108375549316, "logits/rejected": 3.428831100463867, "logps/chosen": -428.70269775390625, "logps/rejected": -401.84478759765625, "loss": 0.6889, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.012302838265895844, "rewards/margins": 0.004726693034172058, "rewards/rejected": 0.007576142903417349, "step": 60 }, { "epoch": 0.02, "learning_rate": 3.9637599093997736e-08, "logits/chosen": 0.7089935541152954, "logits/rejected": 2.8489346504211426, "logps/chosen": -394.15484619140625, "logps/rejected": -542.0667114257812, "loss": 0.682, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.013238462619483471, "rewards/margins": 0.023042922839522362, "rewards/rejected": -0.00980446022003889, "step": 70 }, { "epoch": 0.03, "learning_rate": 4.530011325028312e-08, "logits/chosen": 1.2423776388168335, "logits/rejected": 3.1232666969299316, "logps/chosen": -538.8358764648438, "logps/rejected": -434.6361389160156, "loss": 0.6757, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.028549503535032272, "rewards/margins": 0.033378563821315765, "rewards/rejected": -0.004829060286283493, "step": 80 }, { "epoch": 0.03, "learning_rate": 5.096262740656852e-08, "logits/chosen": 1.0223588943481445, "logits/rejected": 3.520498752593994, "logps/chosen": -351.19488525390625, "logps/rejected": -328.054931640625, "loss": 0.6678, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04164254665374756, "rewards/margins": 0.035227615386247635, "rewards/rejected": 0.0064149340614676476, "step": 90 }, { "epoch": 0.03, "learning_rate": 5.6625141562853904e-08, "logits/chosen": 0.7637745141983032, "logits/rejected": 3.3096764087677, "logps/chosen": -328.73541259765625, "logps/rejected": -459.0279235839844, "loss": 0.6612, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06262902915477753, "rewards/margins": 0.08298339694738388, "rewards/rejected": -0.02035437896847725, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": 0.9032161235809326, "eval_logits/rejected": 3.2378697395324707, "eval_logps/chosen": -388.2252502441406, "eval_logps/rejected": -502.7813415527344, "eval_loss": 0.6499212980270386, "eval_rewards/accuracies": 0.8299663066864014, "eval_rewards/chosen": 0.07646423578262329, "eval_rewards/margins": 0.0915694385766983, "eval_rewards/rejected": -0.015105200931429863, "eval_runtime": 263.1916, "eval_samples_per_second": 36.095, "eval_steps_per_second": 1.128, "step": 100 }, { "epoch": 0.04, "learning_rate": 6.22876557191393e-08, "logits/chosen": 0.8191145062446594, "logits/rejected": 2.4177682399749756, "logps/chosen": -339.58966064453125, "logps/rejected": -722.6461181640625, "loss": 0.6388, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.09637552499771118, "rewards/margins": 0.11133376508951187, "rewards/rejected": -0.014958225190639496, "step": 110 }, { "epoch": 0.04, "learning_rate": 6.795016987542468e-08, "logits/chosen": 1.52385675907135, "logits/rejected": 2.9242682456970215, "logps/chosen": -369.44244384765625, "logps/rejected": -489.9684143066406, "loss": 0.6228, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.15489232540130615, "rewards/margins": 0.17702451348304749, "rewards/rejected": -0.02213219925761223, "step": 120 }, { "epoch": 0.04, "learning_rate": 7.361268403171007e-08, "logits/chosen": 1.4604432582855225, "logits/rejected": 3.0254647731781006, "logps/chosen": -409.232421875, "logps/rejected": -533.1013793945312, "loss": 0.6093, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.16234104335308075, "rewards/margins": 0.22091738879680634, "rewards/rejected": -0.05857633799314499, "step": 130 }, { "epoch": 0.05, "learning_rate": 7.927519818799547e-08, "logits/chosen": 0.8527463674545288, "logits/rejected": 2.8990535736083984, "logps/chosen": -306.9147644042969, "logps/rejected": -554.9722900390625, "loss": 0.6016, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14748191833496094, "rewards/margins": 0.1829536259174347, "rewards/rejected": -0.03547172620892525, "step": 140 }, { "epoch": 0.05, "learning_rate": 8.493771234428086e-08, "logits/chosen": 1.6399835348129272, "logits/rejected": 3.005208730697632, "logps/chosen": -345.0897521972656, "logps/rejected": -477.846923828125, "loss": 0.5901, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.19912423193454742, "rewards/margins": 0.22349488735198975, "rewards/rejected": -0.024370649829506874, "step": 150 }, { "epoch": 0.05, "learning_rate": 9.060022650056625e-08, "logits/chosen": 1.7255092859268188, "logits/rejected": 3.2128493785858154, "logps/chosen": -329.3196716308594, "logps/rejected": -453.23236083984375, "loss": 0.5629, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.23143677413463593, "rewards/margins": 0.29594728350639343, "rewards/rejected": -0.06451050937175751, "step": 160 }, { "epoch": 0.06, "learning_rate": 9.626274065685163e-08, "logits/chosen": 0.5160809755325317, "logits/rejected": 3.3329761028289795, "logps/chosen": -425.4725036621094, "logps/rejected": -473.4015197753906, "loss": 0.5183, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3114267885684967, "rewards/margins": 0.40139466524124146, "rewards/rejected": -0.08996789157390594, "step": 170 }, { "epoch": 0.06, "learning_rate": 1.0192525481313703e-07, "logits/chosen": 1.6642049551010132, "logits/rejected": 2.928480386734009, "logps/chosen": -462.1224060058594, "logps/rejected": -587.8157958984375, "loss": 0.4853, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.41756492853164673, "rewards/margins": 0.5190845727920532, "rewards/rejected": -0.10151971876621246, "step": 180 }, { "epoch": 0.06, "learning_rate": 1.0758776896942241e-07, "logits/chosen": 1.4960415363311768, "logits/rejected": 3.2985711097717285, "logps/chosen": -346.6057434082031, "logps/rejected": -505.11907958984375, "loss": 0.4605, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5564008951187134, "rewards/margins": 0.6558611392974854, "rewards/rejected": -0.09946014732122421, "step": 190 }, { "epoch": 0.07, "learning_rate": 1.1325028312570781e-07, "logits/chosen": 1.667295217514038, "logits/rejected": 3.3900818824768066, "logps/chosen": -399.5884094238281, "logps/rejected": -394.2358703613281, "loss": 0.4585, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4879857897758484, "rewards/margins": 0.5740679502487183, "rewards/rejected": -0.08608220517635345, "step": 200 }, { "epoch": 0.07, "eval_logits/chosen": 0.9080647230148315, "eval_logits/rejected": 3.249399423599243, "eval_logps/chosen": -383.7663269042969, "eval_logps/rejected": -503.872314453125, "eval_loss": 0.44577568769454956, "eval_rewards/accuracies": 0.930134654045105, "eval_rewards/chosen": 0.5223554372787476, "eval_rewards/margins": 0.6465521454811096, "eval_rewards/rejected": -0.12419669330120087, "eval_runtime": 267.4864, "eval_samples_per_second": 35.516, "eval_steps_per_second": 1.11, "step": 200 }, { "epoch": 0.07, "learning_rate": 1.189127972819932e-07, "logits/chosen": 1.3309799432754517, "logits/rejected": 2.590378522872925, "logps/chosen": -338.12176513671875, "logps/rejected": -641.5697631835938, "loss": 0.4334, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6168349385261536, "rewards/margins": 0.7532841563224792, "rewards/rejected": -0.13644923269748688, "step": 210 }, { "epoch": 0.07, "learning_rate": 1.245753114382786e-07, "logits/chosen": 0.8702503442764282, "logits/rejected": 3.3507354259490967, "logps/chosen": -455.60137939453125, "logps/rejected": -447.7149963378906, "loss": 0.4091, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6552426218986511, "rewards/margins": 0.7988005876541138, "rewards/rejected": -0.14355802536010742, "step": 220 }, { "epoch": 0.08, "learning_rate": 1.3023782559456398e-07, "logits/chosen": 1.2613376379013062, "logits/rejected": 3.0275485515594482, "logps/chosen": -405.063232421875, "logps/rejected": -535.0067138671875, "loss": 0.3977, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.672575831413269, "rewards/margins": 0.811522364616394, "rewards/rejected": -0.13894659280776978, "step": 230 }, { "epoch": 0.08, "learning_rate": 1.3590033975084937e-07, "logits/chosen": 1.183171272277832, "logits/rejected": 2.8756091594696045, "logps/chosen": -385.80487060546875, "logps/rejected": -625.9861450195312, "loss": 0.3953, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6583465337753296, "rewards/margins": 0.9188927412033081, "rewards/rejected": -0.2605462074279785, "step": 240 }, { "epoch": 0.08, "learning_rate": 1.4156285390713476e-07, "logits/chosen": 1.2423839569091797, "logits/rejected": 3.269233226776123, "logps/chosen": -333.55419921875, "logps/rejected": -476.35003662109375, "loss": 0.3743, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7891533970832825, "rewards/margins": 1.0434578657150269, "rewards/rejected": -0.254304438829422, "step": 250 }, { "epoch": 0.09, "learning_rate": 1.4722536806342014e-07, "logits/chosen": 1.506807804107666, "logits/rejected": 3.2091269493103027, "logps/chosen": -398.2955017089844, "logps/rejected": -529.6971435546875, "loss": 0.3339, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8295198678970337, "rewards/margins": 1.06871497631073, "rewards/rejected": -0.23919522762298584, "step": 260 }, { "epoch": 0.09, "learning_rate": 1.5288788221970556e-07, "logits/chosen": 0.5976768136024475, "logits/rejected": 3.549748182296753, "logps/chosen": -358.4790954589844, "logps/rejected": -366.1632080078125, "loss": 0.3109, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9163538813591003, "rewards/margins": 1.2519645690917969, "rewards/rejected": -0.335610568523407, "step": 270 }, { "epoch": 0.1, "learning_rate": 1.5855039637599094e-07, "logits/chosen": 1.9247735738754272, "logits/rejected": 3.9325199127197266, "logps/chosen": -334.7523498535156, "logps/rejected": -343.63299560546875, "loss": 0.2906, "rewards/accuracies": 0.9375, "rewards/chosen": 1.034354567527771, "rewards/margins": 1.4914100170135498, "rewards/rejected": -0.45705538988113403, "step": 280 }, { "epoch": 0.1, "learning_rate": 1.642129105322763e-07, "logits/chosen": 0.8872585296630859, "logits/rejected": 3.4810404777526855, "logps/chosen": -305.9379577636719, "logps/rejected": -500.093017578125, "loss": 0.2794, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9907447695732117, "rewards/margins": 1.4527558088302612, "rewards/rejected": -0.46201109886169434, "step": 290 }, { "epoch": 0.1, "learning_rate": 1.6987542468856172e-07, "logits/chosen": 1.4215636253356934, "logits/rejected": 3.333827257156372, "logps/chosen": -331.09088134765625, "logps/rejected": -579.3836669921875, "loss": 0.2519, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1994380950927734, "rewards/margins": 1.746361494064331, "rewards/rejected": -0.5469235777854919, "step": 300 }, { "epoch": 0.1, "eval_logits/chosen": 0.9126914739608765, "eval_logits/rejected": 3.279005765914917, "eval_logps/chosen": -376.9534606933594, "eval_logps/rejected": -507.4445495605469, "eval_loss": 0.253989577293396, "eval_rewards/accuracies": 0.9469696879386902, "eval_rewards/chosen": 1.2036420106887817, "eval_rewards/margins": 1.6850591897964478, "eval_rewards/rejected": -0.48141714930534363, "eval_runtime": 268.0468, "eval_samples_per_second": 35.442, "eval_steps_per_second": 1.108, "step": 300 }, { "epoch": 0.11, "learning_rate": 1.755379388448471e-07, "logits/chosen": 1.0812231302261353, "logits/rejected": 3.189873218536377, "logps/chosen": -342.5466003417969, "logps/rejected": -491.248046875, "loss": 0.2477, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3413209915161133, "rewards/margins": 1.7608686685562134, "rewards/rejected": -0.4195477068424225, "step": 310 }, { "epoch": 0.11, "learning_rate": 1.812004530011325e-07, "logits/chosen": 1.6338831186294556, "logits/rejected": 3.1776280403137207, "logps/chosen": -517.4567260742188, "logps/rejected": -400.4752502441406, "loss": 0.2259, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.4155336618423462, "rewards/margins": 1.8963558673858643, "rewards/rejected": -0.4808220863342285, "step": 320 }, { "epoch": 0.11, "learning_rate": 1.868629671574179e-07, "logits/chosen": 1.18483567237854, "logits/rejected": 2.889453411102295, "logps/chosen": -396.57440185546875, "logps/rejected": -572.3213500976562, "loss": 0.211, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4644243717193604, "rewards/margins": 2.263093948364258, "rewards/rejected": -0.7986693382263184, "step": 330 }, { "epoch": 0.12, "learning_rate": 1.9252548131370327e-07, "logits/chosen": 1.4275858402252197, "logits/rejected": 3.412844181060791, "logps/chosen": -310.13433837890625, "logps/rejected": -513.6046752929688, "loss": 0.223, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4905742406845093, "rewards/margins": 2.169682025909424, "rewards/rejected": -0.679107666015625, "step": 340 }, { "epoch": 0.12, "learning_rate": 1.9818799546998865e-07, "logits/chosen": 1.1891658306121826, "logits/rejected": 2.760612964630127, "logps/chosen": -370.5301818847656, "logps/rejected": -568.3065185546875, "loss": 0.2099, "rewards/accuracies": 1.0, "rewards/chosen": 1.5602627992630005, "rewards/margins": 2.338247776031494, "rewards/rejected": -0.777985155582428, "step": 350 }, { "epoch": 0.12, "learning_rate": 2.0385050962627407e-07, "logits/chosen": 1.4980577230453491, "logits/rejected": 3.5479140281677246, "logps/chosen": -308.7692565917969, "logps/rejected": -398.22100830078125, "loss": 0.1847, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.408210039138794, "rewards/margins": 2.200162887573242, "rewards/rejected": -0.7919529676437378, "step": 360 }, { "epoch": 0.13, "learning_rate": 2.0951302378255946e-07, "logits/chosen": 1.1414896249771118, "logits/rejected": 2.569624423980713, "logps/chosen": -315.5340270996094, "logps/rejected": -732.6063232421875, "loss": 0.1939, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3921127319335938, "rewards/margins": 2.1865804195404053, "rewards/rejected": -0.7944676280021667, "step": 370 }, { "epoch": 0.13, "learning_rate": 2.1517553793884482e-07, "logits/chosen": 1.4118402004241943, "logits/rejected": 2.800691604614258, "logps/chosen": -356.115478515625, "logps/rejected": -640.7478637695312, "loss": 0.1899, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7040725946426392, "rewards/margins": 2.3913490772247314, "rewards/rejected": -0.6872765421867371, "step": 380 }, { "epoch": 0.13, "learning_rate": 2.2083805209513023e-07, "logits/chosen": 1.4530861377716064, "logits/rejected": 3.166997194290161, "logps/chosen": -379.05584716796875, "logps/rejected": -408.69830322265625, "loss": 0.1978, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.4759045839309692, "rewards/margins": 2.3196094036102295, "rewards/rejected": -0.8437048196792603, "step": 390 }, { "epoch": 0.14, "learning_rate": 2.2650056625141562e-07, "logits/chosen": 1.2422279119491577, "logits/rejected": 3.232043504714966, "logps/chosen": -335.52813720703125, "logps/rejected": -601.8195190429688, "loss": 0.17, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9281127452850342, "rewards/margins": 3.019509792327881, "rewards/rejected": -1.0913970470428467, "step": 400 }, { "epoch": 0.14, "eval_logits/chosen": 0.9173203706741333, "eval_logits/rejected": 3.3007280826568604, "eval_logps/chosen": -373.1958923339844, "eval_logps/rejected": -512.6629028320312, "eval_loss": 0.17509011924266815, "eval_rewards/accuracies": 0.9562289714813232, "eval_rewards/chosen": 1.5794016122817993, "eval_rewards/margins": 2.582660436630249, "eval_rewards/rejected": -1.0032589435577393, "eval_runtime": 268.2511, "eval_samples_per_second": 35.415, "eval_steps_per_second": 1.107, "step": 400 }, { "epoch": 0.14, "learning_rate": 2.32163080407701e-07, "logits/chosen": 0.40517282485961914, "logits/rejected": 2.861290454864502, "logps/chosen": -316.54620361328125, "logps/rejected": -506.2539978027344, "loss": 0.1664, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.8524707555770874, "rewards/margins": 2.5778260231018066, "rewards/rejected": -0.7253550291061401, "step": 410 }, { "epoch": 0.14, "learning_rate": 2.378255945639864e-07, "logits/chosen": 0.664097785949707, "logits/rejected": 2.845303535461426, "logps/chosen": -320.03125, "logps/rejected": -667.8426513671875, "loss": 0.1677, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7594044208526611, "rewards/margins": 2.9025471210479736, "rewards/rejected": -1.143142819404602, "step": 420 }, { "epoch": 0.15, "learning_rate": 2.434881087202718e-07, "logits/chosen": 0.7501753568649292, "logits/rejected": 2.8510653972625732, "logps/chosen": -307.3896179199219, "logps/rejected": -599.6353149414062, "loss": 0.1465, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7693700790405273, "rewards/margins": 3.032701015472412, "rewards/rejected": -1.2633306980133057, "step": 430 }, { "epoch": 0.15, "learning_rate": 2.491506228765572e-07, "logits/chosen": 1.5089292526245117, "logits/rejected": 3.524827241897583, "logps/chosen": -321.5747375488281, "logps/rejected": -450.64276123046875, "loss": 0.1466, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5898950099945068, "rewards/margins": 3.1321098804473877, "rewards/rejected": -1.5422146320343018, "step": 440 }, { "epoch": 0.15, "learning_rate": 2.548131370328426e-07, "logits/chosen": 1.3345590829849243, "logits/rejected": 3.7219035625457764, "logps/chosen": -352.8118896484375, "logps/rejected": -376.58013916015625, "loss": 0.1484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4617602825164795, "rewards/margins": 3.2981064319610596, "rewards/rejected": -1.8363460302352905, "step": 450 }, { "epoch": 0.16, "learning_rate": 2.6047565118912797e-07, "logits/chosen": 1.7875549793243408, "logits/rejected": 3.7318198680877686, "logps/chosen": -387.7720947265625, "logps/rejected": -427.65972900390625, "loss": 0.1293, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8360264301300049, "rewards/margins": 3.6591992378234863, "rewards/rejected": -1.8231725692749023, "step": 460 }, { "epoch": 0.16, "learning_rate": 2.6613816534541335e-07, "logits/chosen": 1.857513189315796, "logits/rejected": 3.7900021076202393, "logps/chosen": -471.33551025390625, "logps/rejected": -291.6858825683594, "loss": 0.1331, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.937786340713501, "rewards/margins": 3.6518795490264893, "rewards/rejected": -1.7140929698944092, "step": 470 }, { "epoch": 0.16, "learning_rate": 2.7180067950169874e-07, "logits/chosen": 0.5211232304573059, "logits/rejected": 2.575617790222168, "logps/chosen": -314.490966796875, "logps/rejected": -596.9625244140625, "loss": 0.1188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9314266443252563, "rewards/margins": 3.5651352405548096, "rewards/rejected": -1.6337085962295532, "step": 480 }, { "epoch": 0.17, "learning_rate": 2.7746319365798413e-07, "logits/chosen": 1.2549169063568115, "logits/rejected": 3.3965110778808594, "logps/chosen": -345.92669677734375, "logps/rejected": -505.69659423828125, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 2.377042531967163, "rewards/margins": 4.275637149810791, "rewards/rejected": -1.8985941410064697, "step": 490 }, { "epoch": 0.17, "learning_rate": 2.831257078142695e-07, "logits/chosen": 1.7851215600967407, "logits/rejected": 3.7306265830993652, "logps/chosen": -459.65802001953125, "logps/rejected": -326.4261169433594, "loss": 0.1179, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8807199001312256, "rewards/margins": 3.8889358043670654, "rewards/rejected": -2.008216142654419, "step": 500 }, { "epoch": 0.17, "eval_logits/chosen": 0.9103542566299438, "eval_logits/rejected": 3.292522430419922, "eval_logps/chosen": -370.5672607421875, "eval_logps/rejected": -523.4216918945312, "eval_loss": 0.12153849750757217, "eval_rewards/accuracies": 0.9587541818618774, "eval_rewards/chosen": 1.8422629833221436, "eval_rewards/margins": 3.9213974475860596, "eval_rewards/rejected": -2.079134225845337, "eval_runtime": 268.6563, "eval_samples_per_second": 35.361, "eval_steps_per_second": 1.106, "step": 500 }, { "epoch": 0.17, "learning_rate": 2.887882219705549e-07, "logits/chosen": 2.08186936378479, "logits/rejected": 2.738398313522339, "logps/chosen": -426.4551696777344, "logps/rejected": -635.5200805664062, "loss": 0.1274, "rewards/accuracies": 0.9375, "rewards/chosen": 2.002917766571045, "rewards/margins": 4.0474138259887695, "rewards/rejected": -2.0444960594177246, "step": 510 }, { "epoch": 0.18, "learning_rate": 2.944507361268403e-07, "logits/chosen": 1.487585186958313, "logits/rejected": 2.9141831398010254, "logps/chosen": -328.9551086425781, "logps/rejected": -544.206787109375, "loss": 0.112, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.959592580795288, "rewards/margins": 4.269417762756348, "rewards/rejected": -2.3098244667053223, "step": 520 }, { "epoch": 0.18, "learning_rate": 3.001132502831257e-07, "logits/chosen": 1.0084563493728638, "logits/rejected": 3.2186686992645264, "logps/chosen": -324.2669677734375, "logps/rejected": -610.30029296875, "loss": 0.1176, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2090611457824707, "rewards/margins": 4.53490686416626, "rewards/rejected": -2.32584547996521, "step": 530 }, { "epoch": 0.18, "learning_rate": 3.057757644394111e-07, "logits/chosen": 1.4372812509536743, "logits/rejected": 3.4026083946228027, "logps/chosen": -311.79840087890625, "logps/rejected": -425.08453369140625, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": 2.122832775115967, "rewards/margins": 4.550196647644043, "rewards/rejected": -2.427363872528076, "step": 540 }, { "epoch": 0.19, "learning_rate": 3.114382785956965e-07, "logits/chosen": 0.9511749148368835, "logits/rejected": 3.254910707473755, "logps/chosen": -321.9422302246094, "logps/rejected": -476.6572265625, "loss": 0.1174, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.133894681930542, "rewards/margins": 4.333982944488525, "rewards/rejected": -2.2000882625579834, "step": 550 }, { "epoch": 0.19, "learning_rate": 3.171007927519819e-07, "logits/chosen": 1.9490430355072021, "logits/rejected": 3.237525224685669, "logps/chosen": -447.9163513183594, "logps/rejected": -467.66162109375, "loss": 0.1015, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.141505718231201, "rewards/margins": 4.317712306976318, "rewards/rejected": -2.17620587348938, "step": 560 }, { "epoch": 0.19, "learning_rate": 3.227633069082673e-07, "logits/chosen": 0.9235776662826538, "logits/rejected": 3.400035858154297, "logps/chosen": -471.5634765625, "logps/rejected": -351.24774169921875, "loss": 0.0831, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.087817668914795, "rewards/margins": 4.746884346008301, "rewards/rejected": -2.659067153930664, "step": 570 }, { "epoch": 0.2, "learning_rate": 3.284258210645526e-07, "logits/chosen": 1.2091686725616455, "logits/rejected": 3.3013522624969482, "logps/chosen": -441.1993713378906, "logps/rejected": -504.54559326171875, "loss": 0.1315, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.3476431369781494, "rewards/margins": 4.925909996032715, "rewards/rejected": -2.5782666206359863, "step": 580 }, { "epoch": 0.2, "learning_rate": 3.34088335220838e-07, "logits/chosen": 1.3331282138824463, "logits/rejected": 3.4447312355041504, "logps/chosen": -329.92047119140625, "logps/rejected": -407.18438720703125, "loss": 0.1286, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.297781467437744, "rewards/margins": 5.266199111938477, "rewards/rejected": -2.9684176445007324, "step": 590 }, { "epoch": 0.2, "learning_rate": 3.3975084937712344e-07, "logits/chosen": 1.0378614664077759, "logits/rejected": 2.619415760040283, "logps/chosen": -421.515625, "logps/rejected": -664.7316284179688, "loss": 0.1032, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.105104446411133, "rewards/margins": 4.875788688659668, "rewards/rejected": -2.7706847190856934, "step": 600 }, { "epoch": 0.2, "eval_logits/chosen": 0.9179551005363464, "eval_logits/rejected": 3.257366895675659, "eval_logps/chosen": -368.0875549316406, "eval_logps/rejected": -530.27734375, "eval_loss": 0.10775981098413467, "eval_rewards/accuracies": 0.9595959782600403, "eval_rewards/chosen": 2.090228796005249, "eval_rewards/margins": 4.854931831359863, "eval_rewards/rejected": -2.7647030353546143, "eval_runtime": 268.0718, "eval_samples_per_second": 35.438, "eval_steps_per_second": 1.108, "step": 600 }, { "epoch": 0.21, "learning_rate": 3.454133635334088e-07, "logits/chosen": 1.8005974292755127, "logits/rejected": 3.3652682304382324, "logps/chosen": -355.82696533203125, "logps/rejected": -472.01287841796875, "loss": 0.1071, "rewards/accuracies": 0.9375, "rewards/chosen": 1.616498351097107, "rewards/margins": 4.316979885101318, "rewards/rejected": -2.70048189163208, "step": 610 }, { "epoch": 0.21, "learning_rate": 3.510758776896942e-07, "logits/chosen": 1.010353684425354, "logits/rejected": 2.7700181007385254, "logps/chosen": -447.08050537109375, "logps/rejected": -578.945556640625, "loss": 0.1084, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.266951322555542, "rewards/margins": 4.7797393798828125, "rewards/rejected": -2.5127882957458496, "step": 620 }, { "epoch": 0.21, "learning_rate": 3.567383918459796e-07, "logits/chosen": 1.316406488418579, "logits/rejected": 3.448246717453003, "logps/chosen": -392.95458984375, "logps/rejected": -317.03570556640625, "loss": 0.0983, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9485342502593994, "rewards/margins": 5.007516384124756, "rewards/rejected": -3.0589828491210938, "step": 630 }, { "epoch": 0.22, "learning_rate": 3.62400906002265e-07, "logits/chosen": 1.4376599788665771, "logits/rejected": 3.0495493412017822, "logps/chosen": -385.1656799316406, "logps/rejected": -570.9940185546875, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": 2.372509479522705, "rewards/margins": 5.976849555969238, "rewards/rejected": -3.6043407917022705, "step": 640 }, { "epoch": 0.22, "learning_rate": 3.6806342015855037e-07, "logits/chosen": 1.7421470880508423, "logits/rejected": 3.6961002349853516, "logps/chosen": -357.5590515136719, "logps/rejected": -390.57305908203125, "loss": 0.0852, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.394005298614502, "rewards/margins": 5.477439880371094, "rewards/rejected": -3.083434820175171, "step": 650 }, { "epoch": 0.22, "learning_rate": 3.737259343148358e-07, "logits/chosen": 1.2248786687850952, "logits/rejected": 3.567431926727295, "logps/chosen": -317.44512939453125, "logps/rejected": -538.3814697265625, "loss": 0.0836, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.2307286262512207, "rewards/margins": 5.791689872741699, "rewards/rejected": -3.560961961746216, "step": 660 }, { "epoch": 0.23, "learning_rate": 3.7938844847112115e-07, "logits/chosen": 1.7818076610565186, "logits/rejected": 3.1708149909973145, "logps/chosen": -360.680908203125, "logps/rejected": -613.5748901367188, "loss": 0.0958, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9897119998931885, "rewards/margins": 5.3165106773376465, "rewards/rejected": -3.326798677444458, "step": 670 }, { "epoch": 0.23, "learning_rate": 3.8505096262740653e-07, "logits/chosen": 1.89913010597229, "logits/rejected": 3.2814056873321533, "logps/chosen": -339.29925537109375, "logps/rejected": -550.3546752929688, "loss": 0.0629, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3413264751434326, "rewards/margins": 5.58084774017334, "rewards/rejected": -3.23952054977417, "step": 680 }, { "epoch": 0.23, "learning_rate": 3.907134767836919e-07, "logits/chosen": 1.546518325805664, "logits/rejected": 2.939469337463379, "logps/chosen": -416.63336181640625, "logps/rejected": -570.0182495117188, "loss": 0.0906, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.569227933883667, "rewards/margins": 6.09161901473999, "rewards/rejected": -3.5223910808563232, "step": 690 }, { "epoch": 0.24, "learning_rate": 3.963759909399773e-07, "logits/chosen": 0.5802558660507202, "logits/rejected": 2.9904799461364746, "logps/chosen": -362.68701171875, "logps/rejected": -586.5096435546875, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 2.8239052295684814, "rewards/margins": 6.065105438232422, "rewards/rejected": -3.2412002086639404, "step": 700 }, { "epoch": 0.24, "eval_logits/chosen": 0.9190412759780884, "eval_logits/rejected": 3.2242751121520996, "eval_logps/chosen": -366.1594543457031, "eval_logps/rejected": -536.8206787109375, "eval_loss": 0.08807818591594696, "eval_rewards/accuracies": 0.9638047218322754, "eval_rewards/chosen": 2.28304123878479, "eval_rewards/margins": 5.702078342437744, "eval_rewards/rejected": -3.419036865234375, "eval_runtime": 268.6383, "eval_samples_per_second": 35.364, "eval_steps_per_second": 1.106, "step": 700 }, { "epoch": 0.24, "learning_rate": 4.0203850509626275e-07, "logits/chosen": 1.4401183128356934, "logits/rejected": 3.3762612342834473, "logps/chosen": -354.66778564453125, "logps/rejected": -428.1043395996094, "loss": 0.0789, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.3884358406066895, "rewards/margins": 6.028875827789307, "rewards/rejected": -3.640439510345459, "step": 710 }, { "epoch": 0.24, "learning_rate": 4.0770101925254814e-07, "logits/chosen": 1.7469635009765625, "logits/rejected": 3.5658583641052246, "logps/chosen": -344.3053894042969, "logps/rejected": -453.9859924316406, "loss": 0.103, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.06357741355896, "rewards/margins": 5.505257606506348, "rewards/rejected": -3.4416797161102295, "step": 720 }, { "epoch": 0.25, "learning_rate": 4.133635334088335e-07, "logits/chosen": 1.3185302019119263, "logits/rejected": 3.143467903137207, "logps/chosen": -398.46685791015625, "logps/rejected": -499.00762939453125, "loss": 0.0714, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.513793706893921, "rewards/margins": 5.737306594848633, "rewards/rejected": -3.2235121726989746, "step": 730 }, { "epoch": 0.25, "learning_rate": 4.190260475651189e-07, "logits/chosen": 1.3261969089508057, "logits/rejected": 3.325129270553589, "logps/chosen": -395.0682678222656, "logps/rejected": -463.11102294921875, "loss": 0.0837, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9389358758926392, "rewards/margins": 5.200533866882324, "rewards/rejected": -3.2615981101989746, "step": 740 }, { "epoch": 0.25, "learning_rate": 4.2468856172140424e-07, "logits/chosen": 0.8084484338760376, "logits/rejected": 3.0004987716674805, "logps/chosen": -308.0705871582031, "logps/rejected": -545.3846435546875, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 2.676938772201538, "rewards/margins": 6.314852237701416, "rewards/rejected": -3.637913227081299, "step": 750 }, { "epoch": 0.26, "learning_rate": 4.3035107587768963e-07, "logits/chosen": 1.5113528966903687, "logits/rejected": 3.3475310802459717, "logps/chosen": -314.2061462402344, "logps/rejected": -437.4307556152344, "loss": 0.0932, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2036123275756836, "rewards/margins": 5.510008811950684, "rewards/rejected": -3.306396484375, "step": 760 }, { "epoch": 0.26, "learning_rate": 4.3601359003397507e-07, "logits/chosen": 1.2984874248504639, "logits/rejected": 2.807908058166504, "logps/chosen": -425.2962951660156, "logps/rejected": -527.3756713867188, "loss": 0.0761, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.349900722503662, "rewards/margins": 5.7003302574157715, "rewards/rejected": -3.350429058074951, "step": 770 }, { "epoch": 0.27, "learning_rate": 4.4167610419026046e-07, "logits/chosen": 0.9681515693664551, "logits/rejected": 2.8194892406463623, "logps/chosen": -438.4591369628906, "logps/rejected": -627.0498046875, "loss": 0.0831, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.592127561569214, "rewards/margins": 6.795496940612793, "rewards/rejected": -4.203368663787842, "step": 780 }, { "epoch": 0.27, "learning_rate": 4.4733861834654585e-07, "logits/chosen": 1.1889218091964722, "logits/rejected": 3.3345398902893066, "logps/chosen": -363.73291015625, "logps/rejected": -372.0538635253906, "loss": 0.0681, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4557104110717773, "rewards/margins": 6.361659526824951, "rewards/rejected": -3.905949115753174, "step": 790 }, { "epoch": 0.27, "learning_rate": 4.5300113250283123e-07, "logits/chosen": 1.7186403274536133, "logits/rejected": 2.917168140411377, "logps/chosen": -373.5525207519531, "logps/rejected": -480.1240234375, "loss": 0.0666, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.4754912853240967, "rewards/margins": 5.41542387008667, "rewards/rejected": -2.939932346343994, "step": 800 }, { "epoch": 0.27, "eval_logits/chosen": 0.9024965167045593, "eval_logits/rejected": 3.1787774562835693, "eval_logps/chosen": -365.29949951171875, "eval_logps/rejected": -543.2213745117188, "eval_loss": 0.07510381191968918, "eval_rewards/accuracies": 0.9688552021980286, "eval_rewards/chosen": 2.3690366744995117, "eval_rewards/margins": 6.428139686584473, "eval_rewards/rejected": -4.059103012084961, "eval_runtime": 268.4853, "eval_samples_per_second": 35.384, "eval_steps_per_second": 1.106, "step": 800 }, { "epoch": 0.28, "learning_rate": 4.586636466591166e-07, "logits/chosen": 1.7102317810058594, "logits/rejected": 3.0732152462005615, "logps/chosen": -386.9404296875, "logps/rejected": -596.4518432617188, "loss": 0.0662, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.70186185836792, "rewards/margins": 6.8812575340271, "rewards/rejected": -4.179396152496338, "step": 810 }, { "epoch": 0.28, "learning_rate": 4.64326160815402e-07, "logits/chosen": 0.8180482983589172, "logits/rejected": 3.4353580474853516, "logps/chosen": -461.099609375, "logps/rejected": -415.77276611328125, "loss": 0.0646, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.202852249145508, "rewards/margins": 5.79763126373291, "rewards/rejected": -3.594778060913086, "step": 820 }, { "epoch": 0.28, "learning_rate": 4.6998867497168745e-07, "logits/chosen": 1.0256288051605225, "logits/rejected": 3.0926876068115234, "logps/chosen": -298.9940490722656, "logps/rejected": -572.1815185546875, "loss": 0.0808, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.5179805755615234, "rewards/margins": 6.361120700836182, "rewards/rejected": -3.843140125274658, "step": 830 }, { "epoch": 0.29, "learning_rate": 4.756511891279728e-07, "logits/chosen": 1.130673885345459, "logits/rejected": 3.3938686847686768, "logps/chosen": -388.12799072265625, "logps/rejected": -495.4458923339844, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 2.9046478271484375, "rewards/margins": 7.362362861633301, "rewards/rejected": -4.457714557647705, "step": 840 }, { "epoch": 0.29, "learning_rate": 4.813137032842582e-07, "logits/chosen": 0.9663546681404114, "logits/rejected": 2.575662612915039, "logps/chosen": -392.9145812988281, "logps/rejected": -729.2246704101562, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 2.4775688648223877, "rewards/margins": 6.4355573654174805, "rewards/rejected": -3.9579882621765137, "step": 850 }, { "epoch": 0.29, "learning_rate": 4.869762174405436e-07, "logits/chosen": 1.4549670219421387, "logits/rejected": 3.0449929237365723, "logps/chosen": -364.0623779296875, "logps/rejected": -563.6417236328125, "loss": 0.0749, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.2536864280700684, "rewards/margins": 6.307879447937012, "rewards/rejected": -4.054192543029785, "step": 860 }, { "epoch": 0.3, "learning_rate": 4.92638731596829e-07, "logits/chosen": 1.299466848373413, "logits/rejected": 2.705226421356201, "logps/chosen": -460.9673767089844, "logps/rejected": -606.0197143554688, "loss": 0.0624, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.1807148456573486, "rewards/margins": 6.8419928550720215, "rewards/rejected": -3.6612777709960938, "step": 870 }, { "epoch": 0.3, "learning_rate": 4.983012457531144e-07, "logits/chosen": 0.784410834312439, "logits/rejected": 3.4160079956054688, "logps/chosen": -290.4139099121094, "logps/rejected": -488.72674560546875, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 2.693528890609741, "rewards/margins": 7.0350165367126465, "rewards/rejected": -4.341487884521484, "step": 880 }, { "epoch": 0.3, "learning_rate": 4.995593604431575e-07, "logits/chosen": 1.353736162185669, "logits/rejected": 3.3012149333953857, "logps/chosen": -353.3387145996094, "logps/rejected": -380.4452819824219, "loss": 0.0599, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2113845348358154, "rewards/margins": 6.058213710784912, "rewards/rejected": -3.8468291759490967, "step": 890 }, { "epoch": 0.31, "learning_rate": 4.989298753619539e-07, "logits/chosen": 1.7279170751571655, "logits/rejected": 3.2604126930236816, "logps/chosen": -329.1844482421875, "logps/rejected": -455.5565490722656, "loss": 0.0706, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.6200737953186035, "rewards/margins": 7.700595855712891, "rewards/rejected": -5.080521583557129, "step": 900 }, { "epoch": 0.31, "eval_logits/chosen": 0.9102387428283691, "eval_logits/rejected": 3.162445306777954, "eval_logps/chosen": -364.9873962402344, "eval_logps/rejected": -547.8843383789062, "eval_loss": 0.06624022871255875, "eval_rewards/accuracies": 0.9722222089767456, "eval_rewards/chosen": 2.4002487659454346, "eval_rewards/margins": 6.92565393447876, "eval_rewards/rejected": -4.525404453277588, "eval_runtime": 268.0833, "eval_samples_per_second": 35.437, "eval_steps_per_second": 1.108, "step": 900 }, { "epoch": 0.31, "learning_rate": 4.983003902807503e-07, "logits/chosen": 1.7389227151870728, "logits/rejected": 3.2229621410369873, "logps/chosen": -427.50872802734375, "logps/rejected": -388.01153564453125, "loss": 0.0658, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2974724769592285, "rewards/margins": 6.737030029296875, "rewards/rejected": -4.439557075500488, "step": 910 }, { "epoch": 0.31, "learning_rate": 4.976709051995467e-07, "logits/chosen": 1.2352038621902466, "logits/rejected": 3.0501484870910645, "logps/chosen": -321.5035705566406, "logps/rejected": -480.87176513671875, "loss": 0.0651, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.5568785667419434, "rewards/margins": 7.235832214355469, "rewards/rejected": -4.678953170776367, "step": 920 }, { "epoch": 0.32, "learning_rate": 4.970414201183432e-07, "logits/chosen": 0.8484708070755005, "logits/rejected": 3.094775438308716, "logps/chosen": -347.93670654296875, "logps/rejected": -578.5217895507812, "loss": 0.0647, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.865225076675415, "rewards/margins": 7.946264743804932, "rewards/rejected": -5.081039905548096, "step": 930 }, { "epoch": 0.32, "learning_rate": 4.964119350371396e-07, "logits/chosen": 1.2264028787612915, "logits/rejected": 2.9254095554351807, "logps/chosen": -314.49755859375, "logps/rejected": -542.8404541015625, "loss": 0.053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.582854747772217, "rewards/margins": 7.226229667663574, "rewards/rejected": -4.643374443054199, "step": 940 }, { "epoch": 0.32, "learning_rate": 4.95782449955936e-07, "logits/chosen": 1.222130537033081, "logits/rejected": 3.148733139038086, "logps/chosen": -374.54144287109375, "logps/rejected": -497.5044860839844, "loss": 0.0576, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.738006353378296, "rewards/margins": 7.113809108734131, "rewards/rejected": -4.375802993774414, "step": 950 }, { "epoch": 0.33, "learning_rate": 4.951529648747325e-07, "logits/chosen": 1.75167715549469, "logits/rejected": 3.0500741004943848, "logps/chosen": -428.451416015625, "logps/rejected": -546.074462890625, "loss": 0.0441, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.750443935394287, "rewards/margins": 7.7640814781188965, "rewards/rejected": -5.013637065887451, "step": 960 }, { "epoch": 0.33, "learning_rate": 4.945234797935289e-07, "logits/chosen": 0.9173381924629211, "logits/rejected": 2.5977022647857666, "logps/chosen": -297.50433349609375, "logps/rejected": -687.2318115234375, "loss": 0.0657, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.5338656902313232, "rewards/margins": 7.702880859375, "rewards/rejected": -5.1690144538879395, "step": 970 }, { "epoch": 0.33, "learning_rate": 4.938939947123252e-07, "logits/chosen": 0.8320339918136597, "logits/rejected": 2.6507489681243896, "logps/chosen": -423.51678466796875, "logps/rejected": -640.7545776367188, "loss": 0.087, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.4527504444122314, "rewards/margins": 7.721698760986328, "rewards/rejected": -5.268949031829834, "step": 980 }, { "epoch": 0.34, "learning_rate": 4.932645096311217e-07, "logits/chosen": 1.5152180194854736, "logits/rejected": 3.082857608795166, "logps/chosen": -295.34088134765625, "logps/rejected": -608.37353515625, "loss": 0.0672, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.7915849685668945, "rewards/margins": 7.854758262634277, "rewards/rejected": -5.063173770904541, "step": 990 }, { "epoch": 0.34, "learning_rate": 4.926350245499181e-07, "logits/chosen": 1.2758054733276367, "logits/rejected": 3.3109130859375, "logps/chosen": -388.2832946777344, "logps/rejected": -477.9232482910156, "loss": 0.0711, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.6032989025115967, "rewards/margins": 7.5214715003967285, "rewards/rejected": -4.918172359466553, "step": 1000 }, { "epoch": 0.34, "eval_logits/chosen": 0.9093015789985657, "eval_logits/rejected": 3.146707534790039, "eval_logps/chosen": -364.759765625, "eval_logps/rejected": -551.8095703125, "eval_loss": 0.05768350511789322, "eval_rewards/accuracies": 0.9764309525489807, "eval_rewards/chosen": 2.4230129718780518, "eval_rewards/margins": 7.340935230255127, "eval_rewards/rejected": -4.917922496795654, "eval_runtime": 267.7884, "eval_samples_per_second": 35.476, "eval_steps_per_second": 1.109, "step": 1000 }, { "epoch": 0.34, "learning_rate": 4.920055394687146e-07, "logits/chosen": 1.5080076456069946, "logits/rejected": 3.27386474609375, "logps/chosen": -398.11077880859375, "logps/rejected": -458.70404052734375, "loss": 0.0625, "rewards/accuracies": 1.0, "rewards/chosen": 2.3705155849456787, "rewards/margins": 6.589612007141113, "rewards/rejected": -4.2190961837768555, "step": 1010 }, { "epoch": 0.35, "learning_rate": 4.91376054387511e-07, "logits/chosen": 0.7365673780441284, "logits/rejected": 3.152676820755005, "logps/chosen": -289.18304443359375, "logps/rejected": -490.31829833984375, "loss": 0.0608, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.4276957511901855, "rewards/margins": 6.978020668029785, "rewards/rejected": -4.550324440002441, "step": 1020 }, { "epoch": 0.35, "learning_rate": 4.907465693063074e-07, "logits/chosen": 1.0980937480926514, "logits/rejected": 3.2199409008026123, "logps/chosen": -302.3829040527344, "logps/rejected": -502.25421142578125, "loss": 0.0566, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.611393451690674, "rewards/margins": 7.998917579650879, "rewards/rejected": -5.387524604797363, "step": 1030 }, { "epoch": 0.35, "learning_rate": 4.901170842251039e-07, "logits/chosen": 1.529950737953186, "logits/rejected": 2.4496376514434814, "logps/chosen": -472.7461853027344, "logps/rejected": -645.9083251953125, "loss": 0.0607, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9878193140029907, "rewards/margins": 6.836523532867432, "rewards/rejected": -4.848703861236572, "step": 1040 }, { "epoch": 0.36, "learning_rate": 4.894875991439003e-07, "logits/chosen": 1.2354804277420044, "logits/rejected": 3.264209032058716, "logps/chosen": -457.37066650390625, "logps/rejected": -482.914306640625, "loss": 0.06, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3142616748809814, "rewards/margins": 7.839533805847168, "rewards/rejected": -5.525272369384766, "step": 1050 }, { "epoch": 0.36, "learning_rate": 4.888581140626966e-07, "logits/chosen": 0.9020367860794067, "logits/rejected": 2.951887607574463, "logps/chosen": -372.59735107421875, "logps/rejected": -528.4854125976562, "loss": 0.0754, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2299747467041016, "rewards/margins": 7.801189422607422, "rewards/rejected": -5.57121467590332, "step": 1060 }, { "epoch": 0.36, "learning_rate": 4.882286289814931e-07, "logits/chosen": 1.0016891956329346, "logits/rejected": 3.1277599334716797, "logps/chosen": -420.5859375, "logps/rejected": -437.7632751464844, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 2.437075138092041, "rewards/margins": 7.129094123840332, "rewards/rejected": -4.692019462585449, "step": 1070 }, { "epoch": 0.37, "learning_rate": 4.875991439002896e-07, "logits/chosen": 1.0937416553497314, "logits/rejected": 3.4849390983581543, "logps/chosen": -369.060791015625, "logps/rejected": -446.8038635253906, "loss": 0.0509, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.3342339992523193, "rewards/margins": 7.783272743225098, "rewards/rejected": -5.449038505554199, "step": 1080 }, { "epoch": 0.37, "learning_rate": 4.869696588190859e-07, "logits/chosen": 2.1527936458587646, "logits/rejected": 3.5624840259552, "logps/chosen": -362.0032653808594, "logps/rejected": -347.994140625, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 2.3183753490448, "rewards/margins": 7.678530693054199, "rewards/rejected": -5.360154628753662, "step": 1090 }, { "epoch": 0.37, "learning_rate": 4.863401737378824e-07, "logits/chosen": 1.1598182916641235, "logits/rejected": 3.118835926055908, "logps/chosen": -367.0357666015625, "logps/rejected": -561.3696899414062, "loss": 0.0623, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.534534215927124, "rewards/margins": 8.346112251281738, "rewards/rejected": -5.811577796936035, "step": 1100 }, { "epoch": 0.37, "eval_logits/chosen": 0.901133120059967, "eval_logits/rejected": 3.118581533432007, "eval_logps/chosen": -364.1503601074219, "eval_logps/rejected": -556.2498779296875, "eval_loss": 0.05724797025322914, "eval_rewards/accuracies": 0.9772727489471436, "eval_rewards/chosen": 2.4839539527893066, "eval_rewards/margins": 7.845913410186768, "eval_rewards/rejected": -5.361959934234619, "eval_runtime": 268.446, "eval_samples_per_second": 35.389, "eval_steps_per_second": 1.106, "step": 1100 }, { "epoch": 0.38, "learning_rate": 4.857106886566788e-07, "logits/chosen": 0.25722193717956543, "logits/rejected": 2.8666605949401855, "logps/chosen": -269.8269958496094, "logps/rejected": -616.5667114257812, "loss": 0.0638, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2851195335388184, "rewards/margins": 8.181058883666992, "rewards/rejected": -5.895938873291016, "step": 1110 }, { "epoch": 0.38, "learning_rate": 4.850812035754753e-07, "logits/chosen": 1.0288280248641968, "logits/rejected": 3.402256727218628, "logps/chosen": -324.79913330078125, "logps/rejected": -548.8817138671875, "loss": 0.0428, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.5030677318573, "rewards/margins": 7.847231864929199, "rewards/rejected": -5.344164848327637, "step": 1120 }, { "epoch": 0.38, "learning_rate": 4.844517184942716e-07, "logits/chosen": 1.3817869424819946, "logits/rejected": 3.4411487579345703, "logps/chosen": -389.9842529296875, "logps/rejected": -385.0381774902344, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": 2.078273057937622, "rewards/margins": 7.2798590660095215, "rewards/rejected": -5.201586723327637, "step": 1130 }, { "epoch": 0.39, "learning_rate": 4.838222334130681e-07, "logits/chosen": 1.1128469705581665, "logits/rejected": 3.301898241043091, "logps/chosen": -366.19134521484375, "logps/rejected": -476.60205078125, "loss": 0.051, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.424424648284912, "rewards/margins": 8.37041187286377, "rewards/rejected": -5.945986270904541, "step": 1140 }, { "epoch": 0.39, "learning_rate": 4.831927483318645e-07, "logits/chosen": 1.1458241939544678, "logits/rejected": 3.3127903938293457, "logps/chosen": -397.9763488769531, "logps/rejected": -356.6136169433594, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 2.525275230407715, "rewards/margins": 7.989278316497803, "rewards/rejected": -5.464003562927246, "step": 1150 }, { "epoch": 0.39, "learning_rate": 4.82563263250661e-07, "logits/chosen": 1.3799407482147217, "logits/rejected": 3.288724422454834, "logps/chosen": -439.81427001953125, "logps/rejected": -482.54461669921875, "loss": 0.0472, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0920233726501465, "rewards/margins": 7.863224983215332, "rewards/rejected": -5.771200656890869, "step": 1160 }, { "epoch": 0.4, "learning_rate": 4.819337781694573e-07, "logits/chosen": 1.8935362100601196, "logits/rejected": 3.2015902996063232, "logps/chosen": -301.77191162109375, "logps/rejected": -543.3331298828125, "loss": 0.0508, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1477038860321045, "rewards/margins": 7.75766658782959, "rewards/rejected": -5.609963893890381, "step": 1170 }, { "epoch": 0.4, "learning_rate": 4.813042930882538e-07, "logits/chosen": 1.364073634147644, "logits/rejected": 2.3843796253204346, "logps/chosen": -350.85760498046875, "logps/rejected": -687.0422973632812, "loss": 0.0436, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.2611756324768066, "rewards/margins": 7.602535247802734, "rewards/rejected": -5.341360092163086, "step": 1180 }, { "epoch": 0.4, "learning_rate": 4.806748080070503e-07, "logits/chosen": 1.4911861419677734, "logits/rejected": 3.1939921379089355, "logps/chosen": -389.1288146972656, "logps/rejected": -405.0220947265625, "loss": 0.0607, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1533682346343994, "rewards/margins": 7.3057379722595215, "rewards/rejected": -5.152369022369385, "step": 1190 }, { "epoch": 0.41, "learning_rate": 4.800453229258466e-07, "logits/chosen": 1.3050758838653564, "logits/rejected": 2.954599380493164, "logps/chosen": -454.8966369628906, "logps/rejected": -481.5711975097656, "loss": 0.0443, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1215577125549316, "rewards/margins": 7.0846848487854, "rewards/rejected": -4.963128089904785, "step": 1200 }, { "epoch": 0.41, "eval_logits/chosen": 0.8960862159729004, "eval_logits/rejected": 3.119628667831421, "eval_logps/chosen": -364.7530212402344, "eval_logps/rejected": -557.41455078125, "eval_loss": 0.052623916417360306, "eval_rewards/accuracies": 0.9797979593276978, "eval_rewards/chosen": 2.4236865043640137, "eval_rewards/margins": 7.902113437652588, "eval_rewards/rejected": -5.478426933288574, "eval_runtime": 267.6269, "eval_samples_per_second": 35.497, "eval_steps_per_second": 1.11, "step": 1200 }, { "epoch": 0.41, "learning_rate": 4.79415837844643e-07, "logits/chosen": 1.5837571620941162, "logits/rejected": 3.5294737815856934, "logps/chosen": -378.23101806640625, "logps/rejected": -340.2025451660156, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 2.2081446647644043, "rewards/margins": 8.476436614990234, "rewards/rejected": -6.268291473388672, "step": 1210 }, { "epoch": 0.41, "learning_rate": 4.787863527634395e-07, "logits/chosen": 1.1652976274490356, "logits/rejected": 2.906097173690796, "logps/chosen": -320.04486083984375, "logps/rejected": -624.504150390625, "loss": 0.0438, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.833052396774292, "rewards/margins": 7.845742225646973, "rewards/rejected": -6.012689113616943, "step": 1220 }, { "epoch": 0.42, "learning_rate": 4.781568676822359e-07, "logits/chosen": 0.824341893196106, "logits/rejected": 2.7557225227355957, "logps/chosen": -301.5578308105469, "logps/rejected": -791.7188720703125, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 2.675546884536743, "rewards/margins": 8.524243354797363, "rewards/rejected": -5.848695755004883, "step": 1230 }, { "epoch": 0.42, "learning_rate": 4.775273826010323e-07, "logits/chosen": 1.4236054420471191, "logits/rejected": 3.063476085662842, "logps/chosen": -381.88348388671875, "logps/rejected": -580.1704711914062, "loss": 0.0398, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.311436176300049, "rewards/margins": 8.216426849365234, "rewards/rejected": -5.9049906730651855, "step": 1240 }, { "epoch": 0.42, "learning_rate": 4.768978975198288e-07, "logits/chosen": 1.1934541463851929, "logits/rejected": 2.6553595066070557, "logps/chosen": -375.6439208984375, "logps/rejected": -746.5518798828125, "loss": 0.0377, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.372680187225342, "rewards/margins": 7.412602424621582, "rewards/rejected": -5.03992223739624, "step": 1250 }, { "epoch": 0.43, "learning_rate": 4.762684124386252e-07, "logits/chosen": 1.407812237739563, "logits/rejected": 2.5921523571014404, "logps/chosen": -382.3465881347656, "logps/rejected": -705.896484375, "loss": 0.0441, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.423621654510498, "rewards/margins": 8.631671905517578, "rewards/rejected": -6.208049297332764, "step": 1260 }, { "epoch": 0.43, "learning_rate": 4.756389273574216e-07, "logits/chosen": 1.3090200424194336, "logits/rejected": 3.249882459640503, "logps/chosen": -420.4686584472656, "logps/rejected": -606.3863525390625, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 2.314528465270996, "rewards/margins": 8.804101943969727, "rewards/rejected": -6.489573001861572, "step": 1270 }, { "epoch": 0.44, "learning_rate": 4.7500944227621803e-07, "logits/chosen": 1.533039927482605, "logits/rejected": 3.1679673194885254, "logps/chosen": -336.41033935546875, "logps/rejected": -435.3956604003906, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 2.788140058517456, "rewards/margins": 9.346943855285645, "rewards/rejected": -6.558804512023926, "step": 1280 }, { "epoch": 0.44, "learning_rate": 4.7437995719501445e-07, "logits/chosen": 0.8365401029586792, "logits/rejected": 3.102825164794922, "logps/chosen": -311.44677734375, "logps/rejected": -572.3267211914062, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 2.1671829223632812, "rewards/margins": 8.971689224243164, "rewards/rejected": -6.804505825042725, "step": 1290 }, { "epoch": 0.44, "learning_rate": 4.737504721138109e-07, "logits/chosen": 1.3206243515014648, "logits/rejected": 2.648153781890869, "logps/chosen": -361.6004333496094, "logps/rejected": -589.2341918945312, "loss": 0.0416, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8718124628067017, "rewards/margins": 7.338369846343994, "rewards/rejected": -5.466557502746582, "step": 1300 }, { "epoch": 0.44, "eval_logits/chosen": 0.872046709060669, "eval_logits/rejected": 3.0682923793792725, "eval_logps/chosen": -365.11627197265625, "eval_logps/rejected": -564.8768310546875, "eval_loss": 0.04774455353617668, "eval_rewards/accuracies": 0.9823232293128967, "eval_rewards/chosen": 2.3873627185821533, "eval_rewards/margins": 8.612015724182129, "eval_rewards/rejected": -6.2246527671813965, "eval_runtime": 267.762, "eval_samples_per_second": 35.479, "eval_steps_per_second": 1.109, "step": 1300 }, { "epoch": 0.45, "learning_rate": 4.7312098703260735e-07, "logits/chosen": 1.2614483833312988, "logits/rejected": 3.360116958618164, "logps/chosen": -307.0997619628906, "logps/rejected": -477.2865295410156, "loss": 0.0437, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.121788263320923, "rewards/margins": 8.030182838439941, "rewards/rejected": -5.908394813537598, "step": 1310 }, { "epoch": 0.45, "learning_rate": 4.724915019514038e-07, "logits/chosen": 1.0011619329452515, "logits/rejected": 2.776240348815918, "logps/chosen": -362.7132568359375, "logps/rejected": -661.7002563476562, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 2.4083709716796875, "rewards/margins": 8.502340316772461, "rewards/rejected": -6.093968391418457, "step": 1320 }, { "epoch": 0.45, "learning_rate": 4.7186201687020014e-07, "logits/chosen": 1.6232942342758179, "logits/rejected": 3.438117504119873, "logps/chosen": -356.1873474121094, "logps/rejected": -431.338134765625, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 2.2725751399993896, "rewards/margins": 8.91747760772705, "rewards/rejected": -6.64490270614624, "step": 1330 }, { "epoch": 0.46, "learning_rate": 4.7123253178899657e-07, "logits/chosen": 0.9556019902229309, "logits/rejected": 3.3059210777282715, "logps/chosen": -296.5425109863281, "logps/rejected": -507.9095764160156, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 3.0043280124664307, "rewards/margins": 9.976794242858887, "rewards/rejected": -6.972466945648193, "step": 1340 }, { "epoch": 0.46, "learning_rate": 4.70603046707793e-07, "logits/chosen": 1.2038887739181519, "logits/rejected": 2.7011523246765137, "logps/chosen": -311.3517150878906, "logps/rejected": -632.9171752929688, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 2.609565258026123, "rewards/margins": 9.721056938171387, "rewards/rejected": -7.111492156982422, "step": 1350 }, { "epoch": 0.46, "learning_rate": 4.699735616265894e-07, "logits/chosen": 0.7160284519195557, "logits/rejected": 3.156071186065674, "logps/chosen": -371.474365234375, "logps/rejected": -511.2286682128906, "loss": 0.1187, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.7736752033233643, "rewards/margins": 8.387275695800781, "rewards/rejected": -5.613600730895996, "step": 1360 }, { "epoch": 0.47, "learning_rate": 4.693440765453859e-07, "logits/chosen": 1.0091116428375244, "logits/rejected": 2.463162660598755, "logps/chosen": -320.53118896484375, "logps/rejected": -748.8678588867188, "loss": 0.1145, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6646820306777954, "rewards/margins": 8.154109954833984, "rewards/rejected": -6.4894280433654785, "step": 1370 }, { "epoch": 0.47, "learning_rate": 4.687145914641823e-07, "logits/chosen": 2.027181625366211, "logits/rejected": 3.0980985164642334, "logps/chosen": -349.1291198730469, "logps/rejected": -616.3356323242188, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 2.5397064685821533, "rewards/margins": 9.038924217224121, "rewards/rejected": -6.499217987060547, "step": 1380 }, { "epoch": 0.47, "learning_rate": 4.6808510638297873e-07, "logits/chosen": 1.762028694152832, "logits/rejected": 2.9455161094665527, "logps/chosen": -345.4776306152344, "logps/rejected": -617.8087768554688, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 2.7131543159484863, "rewards/margins": 8.694609642028809, "rewards/rejected": -5.9814558029174805, "step": 1390 }, { "epoch": 0.48, "learning_rate": 4.674556213017751e-07, "logits/chosen": 1.4126743078231812, "logits/rejected": 3.22453236579895, "logps/chosen": -295.3870849609375, "logps/rejected": -528.4091186523438, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1942496299743652, "rewards/margins": 10.106409072875977, "rewards/rejected": -7.912158966064453, "step": 1400 }, { "epoch": 0.48, "eval_logits/chosen": 0.8667415380477905, "eval_logits/rejected": 3.049123525619507, "eval_logps/chosen": -366.1031494140625, "eval_logps/rejected": -570.9899291992188, "eval_loss": 0.04477392137050629, "eval_rewards/accuracies": 0.9806397557258606, "eval_rewards/chosen": 2.2886757850646973, "eval_rewards/margins": 9.124639511108398, "eval_rewards/rejected": -6.835964202880859, "eval_runtime": 267.6793, "eval_samples_per_second": 35.49, "eval_steps_per_second": 1.11, "step": 1400 }, { "epoch": 0.48, "learning_rate": 4.668261362205715e-07, "logits/chosen": 1.426150918006897, "logits/rejected": 3.05987811088562, "logps/chosen": -397.50238037109375, "logps/rejected": -614.9202270507812, "loss": 0.0244, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.230624198913574, "rewards/margins": 10.895345687866211, "rewards/rejected": -8.664721488952637, "step": 1410 }, { "epoch": 0.48, "learning_rate": 4.6619665113936795e-07, "logits/chosen": 1.4373013973236084, "logits/rejected": 2.838214159011841, "logps/chosen": -378.0550231933594, "logps/rejected": -618.8988037109375, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2754616737365723, "rewards/margins": 8.592214584350586, "rewards/rejected": -6.31675386428833, "step": 1420 }, { "epoch": 0.49, "learning_rate": 4.6556716605816437e-07, "logits/chosen": 1.1580406427383423, "logits/rejected": 3.0366053581237793, "logps/chosen": -366.52362060546875, "logps/rejected": -564.1707153320312, "loss": 0.0379, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.398660182952881, "rewards/margins": 8.526018142700195, "rewards/rejected": -6.127357006072998, "step": 1430 }, { "epoch": 0.49, "learning_rate": 4.6493768097696085e-07, "logits/chosen": 1.256246566772461, "logits/rejected": 3.201850175857544, "logps/chosen": -451.3427734375, "logps/rejected": -479.74505615234375, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 2.0574326515197754, "rewards/margins": 8.310302734375, "rewards/rejected": -6.252870082855225, "step": 1440 }, { "epoch": 0.49, "learning_rate": 4.6430819589575727e-07, "logits/chosen": 1.2370970249176025, "logits/rejected": 3.4503719806671143, "logps/chosen": -319.02691650390625, "logps/rejected": -411.275146484375, "loss": 0.036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.039693832397461, "rewards/margins": 9.272503852844238, "rewards/rejected": -7.232810020446777, "step": 1450 }, { "epoch": 0.5, "learning_rate": 4.636787108145537e-07, "logits/chosen": 1.3211932182312012, "logits/rejected": 2.6074912548065186, "logps/chosen": -429.85015869140625, "logps/rejected": -610.8492431640625, "loss": 0.0301, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3383851051330566, "rewards/margins": 8.625310897827148, "rewards/rejected": -6.286925792694092, "step": 1460 }, { "epoch": 0.5, "learning_rate": 4.630492257333501e-07, "logits/chosen": 1.121718168258667, "logits/rejected": 3.2523887157440186, "logps/chosen": -302.6455078125, "logps/rejected": -466.39752197265625, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 1.9783689975738525, "rewards/margins": 8.877293586730957, "rewards/rejected": -6.898924827575684, "step": 1470 }, { "epoch": 0.5, "learning_rate": 4.624197406521465e-07, "logits/chosen": 1.5639989376068115, "logits/rejected": 3.146419048309326, "logps/chosen": -286.6660461425781, "logps/rejected": -513.6187744140625, "loss": 0.0427, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8529775142669678, "rewards/margins": 9.158210754394531, "rewards/rejected": -7.305233955383301, "step": 1480 }, { "epoch": 0.51, "learning_rate": 4.617902555709429e-07, "logits/chosen": 0.8676093816757202, "logits/rejected": 3.324131488800049, "logps/chosen": -344.65277099609375, "logps/rejected": -363.31390380859375, "loss": 0.0459, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1465797424316406, "rewards/margins": 8.678162574768066, "rewards/rejected": -6.531582832336426, "step": 1490 }, { "epoch": 0.51, "learning_rate": 4.611607704897394e-07, "logits/chosen": 1.0654547214508057, "logits/rejected": 3.0400424003601074, "logps/chosen": -427.28790283203125, "logps/rejected": -542.1488647460938, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 2.3886258602142334, "rewards/margins": 8.62549877166748, "rewards/rejected": -6.236873626708984, "step": 1500 }, { "epoch": 0.51, "eval_logits/chosen": 0.8500487208366394, "eval_logits/rejected": 3.0298707485198975, "eval_logps/chosen": -366.1944580078125, "eval_logps/rejected": -572.177734375, "eval_loss": 0.04415823146700859, "eval_rewards/accuracies": 0.9848484992980957, "eval_rewards/chosen": 2.2795426845550537, "eval_rewards/margins": 9.234275817871094, "eval_rewards/rejected": -6.954732894897461, "eval_runtime": 267.8215, "eval_samples_per_second": 35.471, "eval_steps_per_second": 1.109, "step": 1500 }, { "epoch": 0.51, "learning_rate": 4.605312854085358e-07, "logits/chosen": 1.42298424243927, "logits/rejected": 2.857264518737793, "logps/chosen": -413.36065673828125, "logps/rejected": -489.81353759765625, "loss": 0.0406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.0433743000030518, "rewards/margins": 8.779324531555176, "rewards/rejected": -6.7359514236450195, "step": 1510 }, { "epoch": 0.52, "learning_rate": 4.5990180032733223e-07, "logits/chosen": 1.4577559232711792, "logits/rejected": 2.931697368621826, "logps/chosen": -374.41046142578125, "logps/rejected": -628.4669799804688, "loss": 0.0432, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2504830360412598, "rewards/margins": 9.084136009216309, "rewards/rejected": -6.833653450012207, "step": 1520 }, { "epoch": 0.52, "learning_rate": 4.5927231524612865e-07, "logits/chosen": 1.394997000694275, "logits/rejected": 3.512721538543701, "logps/chosen": -348.52667236328125, "logps/rejected": -423.2193298339844, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 2.209585666656494, "rewards/margins": 9.598394393920898, "rewards/rejected": -7.388807773590088, "step": 1530 }, { "epoch": 0.52, "learning_rate": 4.586428301649251e-07, "logits/chosen": 0.8830512762069702, "logits/rejected": 2.386176347732544, "logps/chosen": -316.9195861816406, "logps/rejected": -686.3156127929688, "loss": 0.0385, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.390429735183716, "rewards/margins": 7.798093318939209, "rewards/rejected": -5.407662391662598, "step": 1540 }, { "epoch": 0.53, "learning_rate": 4.5801334508372145e-07, "logits/chosen": 1.4992390871047974, "logits/rejected": 3.2531185150146484, "logps/chosen": -381.68865966796875, "logps/rejected": -466.6773376464844, "loss": 0.0408, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.968583345413208, "rewards/margins": 8.427129745483398, "rewards/rejected": -6.4585466384887695, "step": 1550 }, { "epoch": 0.53, "learning_rate": 4.573838600025179e-07, "logits/chosen": 1.1836860179901123, "logits/rejected": 2.7051196098327637, "logps/chosen": -460.22515869140625, "logps/rejected": -650.5518798828125, "loss": 0.0425, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.747108817100525, "rewards/margins": 8.638761520385742, "rewards/rejected": -6.891653537750244, "step": 1560 }, { "epoch": 0.53, "learning_rate": 4.5675437492131434e-07, "logits/chosen": 1.3394988775253296, "logits/rejected": 2.7925000190734863, "logps/chosen": -351.8819274902344, "logps/rejected": -588.3709106445312, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 2.231447219848633, "rewards/margins": 8.942026138305664, "rewards/rejected": -6.710578918457031, "step": 1570 }, { "epoch": 0.54, "learning_rate": 4.5612488984011077e-07, "logits/chosen": 1.5489451885223389, "logits/rejected": 3.2238707542419434, "logps/chosen": -369.0827331542969, "logps/rejected": -516.4092407226562, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 2.270219326019287, "rewards/margins": 8.84095287322998, "rewards/rejected": -6.570733070373535, "step": 1580 }, { "epoch": 0.54, "learning_rate": 4.554954047589072e-07, "logits/chosen": 1.0998046398162842, "logits/rejected": 3.2007484436035156, "logps/chosen": -366.46356201171875, "logps/rejected": -464.4613342285156, "loss": 0.0357, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3737833499908447, "rewards/margins": 10.289929389953613, "rewards/rejected": -7.916145324707031, "step": 1590 }, { "epoch": 0.54, "learning_rate": 4.548659196777036e-07, "logits/chosen": 1.2878637313842773, "logits/rejected": 3.2345385551452637, "logps/chosen": -499.41259765625, "logps/rejected": -477.4854431152344, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 2.7783913612365723, "rewards/margins": 8.754561424255371, "rewards/rejected": -5.976171016693115, "step": 1600 }, { "epoch": 0.54, "eval_logits/chosen": 0.844233512878418, "eval_logits/rejected": 3.043703317642212, "eval_logps/chosen": -368.0941162109375, "eval_logps/rejected": -572.6334228515625, "eval_loss": 0.04141601547598839, "eval_rewards/accuracies": 0.9848484992980957, "eval_rewards/chosen": 2.0895750522613525, "eval_rewards/margins": 9.089882850646973, "eval_rewards/rejected": -7.000306606292725, "eval_runtime": 268.1349, "eval_samples_per_second": 35.43, "eval_steps_per_second": 1.108, "step": 1600 }, { "epoch": 0.55, "learning_rate": 4.5423643459650003e-07, "logits/chosen": 0.930519700050354, "logits/rejected": 2.513514518737793, "logps/chosen": -379.267333984375, "logps/rejected": -638.7659912109375, "loss": 0.0365, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8533432483673096, "rewards/margins": 8.584343910217285, "rewards/rejected": -6.731001377105713, "step": 1610 }, { "epoch": 0.55, "learning_rate": 4.536069495152965e-07, "logits/chosen": 0.8711696863174438, "logits/rejected": 2.6304049491882324, "logps/chosen": -361.69207763671875, "logps/rejected": -652.2589721679688, "loss": 0.0412, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.895737648010254, "rewards/margins": 9.80798625946045, "rewards/rejected": -7.912248134613037, "step": 1620 }, { "epoch": 0.55, "learning_rate": 4.529774644340929e-07, "logits/chosen": 1.4021892547607422, "logits/rejected": 3.2112183570861816, "logps/chosen": -386.8944091796875, "logps/rejected": -473.6744079589844, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 2.199531078338623, "rewards/margins": 8.601357460021973, "rewards/rejected": -6.401825904846191, "step": 1630 }, { "epoch": 0.56, "learning_rate": 4.523479793528893e-07, "logits/chosen": 1.5126926898956299, "logits/rejected": 2.907017946243286, "logps/chosen": -352.89801025390625, "logps/rejected": -503.09149169921875, "loss": 0.0419, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7960695028305054, "rewards/margins": 9.080196380615234, "rewards/rejected": -7.284126281738281, "step": 1640 }, { "epoch": 0.56, "learning_rate": 4.517184942716857e-07, "logits/chosen": 0.9331147074699402, "logits/rejected": 2.9222187995910645, "logps/chosen": -430.34521484375, "logps/rejected": -444.293212890625, "loss": 0.0345, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6567729711532593, "rewards/margins": 8.3241605758667, "rewards/rejected": -6.667386531829834, "step": 1650 }, { "epoch": 0.56, "learning_rate": 4.5108900919048215e-07, "logits/chosen": 1.6180213689804077, "logits/rejected": 2.923030376434326, "logps/chosen": -390.9251403808594, "logps/rejected": -516.4937744140625, "loss": 0.0343, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9237686395645142, "rewards/margins": 9.0204496383667, "rewards/rejected": -7.096681118011475, "step": 1660 }, { "epoch": 0.57, "learning_rate": 4.5045952410927857e-07, "logits/chosen": 1.0812413692474365, "logits/rejected": 2.9793589115142822, "logps/chosen": -359.8444519042969, "logps/rejected": -535.794677734375, "loss": 0.0289, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.535153865814209, "rewards/margins": 9.450161933898926, "rewards/rejected": -6.915008544921875, "step": 1670 }, { "epoch": 0.57, "learning_rate": 4.4983003902807505e-07, "logits/chosen": 1.24253249168396, "logits/rejected": 2.6760640144348145, "logps/chosen": -463.6806640625, "logps/rejected": -732.2550048828125, "loss": 0.023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.381948947906494, "rewards/margins": 9.9815034866333, "rewards/rejected": -7.59955358505249, "step": 1680 }, { "epoch": 0.57, "learning_rate": 4.4920055394687147e-07, "logits/chosen": 1.4033453464508057, "logits/rejected": 3.3531792163848877, "logps/chosen": -316.72991943359375, "logps/rejected": -479.224853515625, "loss": 0.0209, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9447095394134521, "rewards/margins": 8.633768081665039, "rewards/rejected": -6.689058780670166, "step": 1690 }, { "epoch": 0.58, "learning_rate": 4.485710688656679e-07, "logits/chosen": 1.282178282737732, "logits/rejected": 2.8951597213745117, "logps/chosen": -346.11138916015625, "logps/rejected": -551.4237060546875, "loss": 0.0427, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1353259086608887, "rewards/margins": 10.34667682647705, "rewards/rejected": -8.21135139465332, "step": 1700 }, { "epoch": 0.58, "eval_logits/chosen": 0.8383491039276123, "eval_logits/rejected": 3.045766830444336, "eval_logps/chosen": -368.6102294921875, "eval_logps/rejected": -573.771240234375, "eval_loss": 0.03865913301706314, "eval_rewards/accuracies": 0.9856902360916138, "eval_rewards/chosen": 2.037966728210449, "eval_rewards/margins": 9.152060508728027, "eval_rewards/rejected": -7.114094257354736, "eval_runtime": 267.881, "eval_samples_per_second": 35.464, "eval_steps_per_second": 1.109, "step": 1700 }, { "epoch": 0.58, "learning_rate": 4.4794158378446426e-07, "logits/chosen": 0.8792635798454285, "logits/rejected": 2.5632190704345703, "logps/chosen": -436.8245544433594, "logps/rejected": -714.1575927734375, "loss": 0.0476, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0016674995422363, "rewards/margins": 8.103446960449219, "rewards/rejected": -6.101779460906982, "step": 1710 }, { "epoch": 0.58, "learning_rate": 4.473120987032607e-07, "logits/chosen": 1.1539610624313354, "logits/rejected": 2.0238335132598877, "logps/chosen": -366.057861328125, "logps/rejected": -899.9832763671875, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 2.1319379806518555, "rewards/margins": 9.034749984741211, "rewards/rejected": -6.902812957763672, "step": 1720 }, { "epoch": 0.59, "learning_rate": 4.466826136220571e-07, "logits/chosen": 1.4467999935150146, "logits/rejected": 3.262604236602783, "logps/chosen": -328.6813049316406, "logps/rejected": -455.77520751953125, "loss": 0.0315, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3364861011505127, "rewards/margins": 9.599719047546387, "rewards/rejected": -7.263233184814453, "step": 1730 }, { "epoch": 0.59, "learning_rate": 4.460531285408536e-07, "logits/chosen": 1.053497552871704, "logits/rejected": 2.9759325981140137, "logps/chosen": -494.638671875, "logps/rejected": -478.7875061035156, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 1.7100532054901123, "rewards/margins": 9.044734954833984, "rewards/rejected": -7.334682464599609, "step": 1740 }, { "epoch": 0.59, "learning_rate": 4.4542364345965e-07, "logits/chosen": 1.587790608406067, "logits/rejected": 3.05256986618042, "logps/chosen": -425.80120849609375, "logps/rejected": -446.116455078125, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 2.355698347091675, "rewards/margins": 10.500600814819336, "rewards/rejected": -8.144902229309082, "step": 1750 }, { "epoch": 0.6, "learning_rate": 4.4479415837844643e-07, "logits/chosen": 0.9711793065071106, "logits/rejected": 3.033139705657959, "logps/chosen": -419.15093994140625, "logps/rejected": -527.05908203125, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 1.8909587860107422, "rewards/margins": 9.026007652282715, "rewards/rejected": -7.135048866271973, "step": 1760 }, { "epoch": 0.6, "learning_rate": 4.4416467329724285e-07, "logits/chosen": 1.252915620803833, "logits/rejected": 2.7287914752960205, "logps/chosen": -316.9794921875, "logps/rejected": -696.9098510742188, "loss": 0.025, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.2421488761901855, "rewards/margins": 9.83061408996582, "rewards/rejected": -7.588465213775635, "step": 1770 }, { "epoch": 0.61, "learning_rate": 4.435351882160392e-07, "logits/chosen": 1.7731847763061523, "logits/rejected": 3.256561756134033, "logps/chosen": -392.49127197265625, "logps/rejected": -497.767822265625, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 1.7031304836273193, "rewards/margins": 8.996789932250977, "rewards/rejected": -7.293660640716553, "step": 1780 }, { "epoch": 0.61, "learning_rate": 4.4290570313483564e-07, "logits/chosen": 1.0066537857055664, "logits/rejected": 2.923328161239624, "logps/chosen": -497.8016662597656, "logps/rejected": -501.8356018066406, "loss": 0.0407, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5872151851654053, "rewards/margins": 10.02701187133789, "rewards/rejected": -8.439796447753906, "step": 1790 }, { "epoch": 0.61, "learning_rate": 4.422762180536321e-07, "logits/chosen": 0.7579858899116516, "logits/rejected": 2.7158029079437256, "logps/chosen": -332.276123046875, "logps/rejected": -624.5234375, "loss": 0.0225, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.291430950164795, "rewards/margins": 8.889410018920898, "rewards/rejected": -6.597978115081787, "step": 1800 }, { "epoch": 0.61, "eval_logits/chosen": 0.8362001776695251, "eval_logits/rejected": 3.0442867279052734, "eval_logps/chosen": -366.83953857421875, "eval_logps/rejected": -573.6825561523438, "eval_loss": 0.04210779815912247, "eval_rewards/accuracies": 0.9890572428703308, "eval_rewards/chosen": 2.2150368690490723, "eval_rewards/margins": 9.32026195526123, "eval_rewards/rejected": -7.105224609375, "eval_runtime": 267.6481, "eval_samples_per_second": 35.494, "eval_steps_per_second": 1.11, "step": 1800 }, { "epoch": 0.62, "learning_rate": 4.4164673297242854e-07, "logits/chosen": 1.06367027759552, "logits/rejected": 2.7651069164276123, "logps/chosen": -433.72210693359375, "logps/rejected": -608.6502075195312, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 2.3269495964050293, "rewards/margins": 10.384018898010254, "rewards/rejected": -8.057069778442383, "step": 1810 }, { "epoch": 0.62, "learning_rate": 4.4101724789122497e-07, "logits/chosen": 1.0821582078933716, "logits/rejected": 2.331120491027832, "logps/chosen": -378.4905090332031, "logps/rejected": -800.6824340820312, "loss": 0.026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1085920333862305, "rewards/margins": 8.5978422164917, "rewards/rejected": -6.489250183105469, "step": 1820 }, { "epoch": 0.62, "learning_rate": 4.403877628100214e-07, "logits/chosen": 1.1348297595977783, "logits/rejected": 3.279541015625, "logps/chosen": -381.3242492675781, "logps/rejected": -529.54052734375, "loss": 0.0213, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.102947950363159, "rewards/margins": 8.581428527832031, "rewards/rejected": -6.478480339050293, "step": 1830 }, { "epoch": 0.63, "learning_rate": 4.397582777288178e-07, "logits/chosen": 1.0020978450775146, "logits/rejected": 2.961073875427246, "logps/chosen": -446.6199645996094, "logps/rejected": -650.9901123046875, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 2.3660645484924316, "rewards/margins": 9.614389419555664, "rewards/rejected": -7.248324394226074, "step": 1840 }, { "epoch": 0.63, "learning_rate": 4.3912879264761423e-07, "logits/chosen": 1.8732408285140991, "logits/rejected": 2.957521915435791, "logps/chosen": -387.747802734375, "logps/rejected": -548.1691284179688, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 1.9975941181182861, "rewards/margins": 8.945771217346191, "rewards/rejected": -6.948177337646484, "step": 1850 }, { "epoch": 0.63, "learning_rate": 4.3849930756641066e-07, "logits/chosen": 0.8730059862136841, "logits/rejected": 2.674221992492676, "logps/chosen": -351.84716796875, "logps/rejected": -652.6336669921875, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": 2.034841537475586, "rewards/margins": 9.151686668395996, "rewards/rejected": -7.116845607757568, "step": 1860 }, { "epoch": 0.64, "learning_rate": 4.378698224852071e-07, "logits/chosen": 1.5931994915008545, "logits/rejected": 3.244697093963623, "logps/chosen": -370.19097900390625, "logps/rejected": -500.9292907714844, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 1.7228028774261475, "rewards/margins": 9.614213943481445, "rewards/rejected": -7.891410827636719, "step": 1870 }, { "epoch": 0.64, "learning_rate": 4.372403374040035e-07, "logits/chosen": 0.840854287147522, "logits/rejected": 3.0632858276367188, "logps/chosen": -317.24822998046875, "logps/rejected": -540.1351928710938, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 1.6507081985473633, "rewards/margins": 9.825498580932617, "rewards/rejected": -8.174789428710938, "step": 1880 }, { "epoch": 0.64, "learning_rate": 4.366108523227999e-07, "logits/chosen": 1.8016700744628906, "logits/rejected": 2.8221445083618164, "logps/chosen": -406.5692138671875, "logps/rejected": -540.0714111328125, "loss": 0.0376, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.989933967590332, "rewards/margins": 9.233619689941406, "rewards/rejected": -7.243685722351074, "step": 1890 }, { "epoch": 0.65, "learning_rate": 4.3598136724159635e-07, "logits/chosen": 1.2036279439926147, "logits/rejected": 3.0226616859436035, "logps/chosen": -446.08294677734375, "logps/rejected": -568.0128784179688, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 2.3940460681915283, "rewards/margins": 9.725221633911133, "rewards/rejected": -7.331175804138184, "step": 1900 }, { "epoch": 0.65, "eval_logits/chosen": 0.8392346501350403, "eval_logits/rejected": 3.0305709838867188, "eval_logps/chosen": -368.1361389160156, "eval_logps/rejected": -579.7667846679688, "eval_loss": 0.03639867901802063, "eval_rewards/accuracies": 0.9882155060768127, "eval_rewards/chosen": 2.085376262664795, "eval_rewards/margins": 9.799016952514648, "eval_rewards/rejected": -7.713641166687012, "eval_runtime": 267.3409, "eval_samples_per_second": 35.535, "eval_steps_per_second": 1.111, "step": 1900 }, { "epoch": 0.65, "learning_rate": 4.3535188216039277e-07, "logits/chosen": 1.0752043724060059, "logits/rejected": 2.855588674545288, "logps/chosen": -346.389404296875, "logps/rejected": -544.1331787109375, "loss": 0.0359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9727954864501953, "rewards/margins": 9.62835693359375, "rewards/rejected": -7.655562400817871, "step": 1910 }, { "epoch": 0.65, "learning_rate": 4.3472239707918925e-07, "logits/chosen": 0.7856771945953369, "logits/rejected": 3.2946181297302246, "logps/chosen": -305.7819519042969, "logps/rejected": -429.08038330078125, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 2.1681599617004395, "rewards/margins": 9.955987930297852, "rewards/rejected": -7.787827968597412, "step": 1920 }, { "epoch": 0.66, "learning_rate": 4.3409291199798567e-07, "logits/chosen": 1.8215280771255493, "logits/rejected": 2.743664264678955, "logps/chosen": -331.0490417480469, "logps/rejected": -699.108154296875, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": 2.219869375228882, "rewards/margins": 8.896055221557617, "rewards/rejected": -6.67618465423584, "step": 1930 }, { "epoch": 0.66, "learning_rate": 4.3346342691678204e-07, "logits/chosen": 1.0438158512115479, "logits/rejected": 2.8148791790008545, "logps/chosen": -324.4605407714844, "logps/rejected": -731.7349853515625, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 1.9487861394882202, "rewards/margins": 9.898294448852539, "rewards/rejected": -7.9495062828063965, "step": 1940 }, { "epoch": 0.66, "learning_rate": 4.3283394183557846e-07, "logits/chosen": 1.4545763731002808, "logits/rejected": 2.9583208560943604, "logps/chosen": -527.7854614257812, "logps/rejected": -516.2374877929688, "loss": 0.0415, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.4202537536621094, "rewards/margins": 9.624616622924805, "rewards/rejected": -7.204362392425537, "step": 1950 }, { "epoch": 0.67, "learning_rate": 4.322044567543749e-07, "logits/chosen": 1.4765589237213135, "logits/rejected": 3.1827173233032227, "logps/chosen": -379.47216796875, "logps/rejected": -503.27569580078125, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 2.047214984893799, "rewards/margins": 9.799707412719727, "rewards/rejected": -7.752492427825928, "step": 1960 }, { "epoch": 0.67, "learning_rate": 4.315749716731713e-07, "logits/chosen": 1.7842438220977783, "logits/rejected": 2.418262481689453, "logps/chosen": -389.3029479980469, "logps/rejected": -714.0911254882812, "loss": 0.0325, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.008929967880249, "rewards/margins": 8.412801742553711, "rewards/rejected": -6.403871059417725, "step": 1970 }, { "epoch": 0.67, "learning_rate": 4.309454865919678e-07, "logits/chosen": 1.215798258781433, "logits/rejected": 3.197004795074463, "logps/chosen": -390.9176940917969, "logps/rejected": -528.724609375, "loss": 0.0248, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9435479640960693, "rewards/margins": 8.589653968811035, "rewards/rejected": -6.6461052894592285, "step": 1980 }, { "epoch": 0.68, "learning_rate": 4.303160015107642e-07, "logits/chosen": 1.5803320407867432, "logits/rejected": 3.488564968109131, "logps/chosen": -419.08111572265625, "logps/rejected": -409.6341247558594, "loss": 0.0232, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.560288906097412, "rewards/margins": 8.865260124206543, "rewards/rejected": -7.304971218109131, "step": 1990 }, { "epoch": 0.68, "learning_rate": 4.2968651642956063e-07, "logits/chosen": 1.274867057800293, "logits/rejected": 2.914517879486084, "logps/chosen": -367.41143798828125, "logps/rejected": -487.2129821777344, "loss": 0.0255, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8918441534042358, "rewards/margins": 9.418905258178711, "rewards/rejected": -7.527061462402344, "step": 2000 }, { "epoch": 0.68, "eval_logits/chosen": 0.8292204737663269, "eval_logits/rejected": 3.020355701446533, "eval_logps/chosen": -367.63873291015625, "eval_logps/rejected": -579.4823608398438, "eval_loss": 0.035267189145088196, "eval_rewards/accuracies": 0.9907407164573669, "eval_rewards/chosen": 2.135115623474121, "eval_rewards/margins": 9.820322036743164, "eval_rewards/rejected": -7.685206413269043, "eval_runtime": 267.0722, "eval_samples_per_second": 35.571, "eval_steps_per_second": 1.112, "step": 2000 }, { "epoch": 0.68, "learning_rate": 4.29057031348357e-07, "logits/chosen": 1.5710475444793701, "logits/rejected": 3.378007411956787, "logps/chosen": -315.57305908203125, "logps/rejected": -575.1492919921875, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 1.9405778646469116, "rewards/margins": 9.704689025878906, "rewards/rejected": -7.764111518859863, "step": 2010 }, { "epoch": 0.69, "learning_rate": 4.284275462671534e-07, "logits/chosen": 1.243847131729126, "logits/rejected": 3.022942304611206, "logps/chosen": -323.0140075683594, "logps/rejected": -645.990478515625, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 2.4514198303222656, "rewards/margins": 10.53857135772705, "rewards/rejected": -8.087152481079102, "step": 2020 }, { "epoch": 0.69, "learning_rate": 4.2779806118594984e-07, "logits/chosen": 0.9828505516052246, "logits/rejected": 2.906207323074341, "logps/chosen": -411.5979919433594, "logps/rejected": -462.4324645996094, "loss": 0.0244, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9616676568984985, "rewards/margins": 8.81393051147461, "rewards/rejected": -6.852262020111084, "step": 2030 }, { "epoch": 0.69, "learning_rate": 4.271685761047463e-07, "logits/chosen": 0.7124877572059631, "logits/rejected": 3.1481852531433105, "logps/chosen": -312.6952819824219, "logps/rejected": -522.9578857421875, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 2.42478346824646, "rewards/margins": 11.089437484741211, "rewards/rejected": -8.664652824401855, "step": 2040 }, { "epoch": 0.7, "learning_rate": 4.2653909102354274e-07, "logits/chosen": 1.7187106609344482, "logits/rejected": 3.0672192573547363, "logps/chosen": -375.66827392578125, "logps/rejected": -492.21087646484375, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 1.80177903175354, "rewards/margins": 9.827333450317383, "rewards/rejected": -8.025555610656738, "step": 2050 }, { "epoch": 0.7, "learning_rate": 4.2590960594233917e-07, "logits/chosen": 1.6581732034683228, "logits/rejected": 3.062495231628418, "logps/chosen": -395.61846923828125, "logps/rejected": -509.40985107421875, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 1.8729314804077148, "rewards/margins": 9.967903137207031, "rewards/rejected": -8.094970703125, "step": 2060 }, { "epoch": 0.7, "learning_rate": 4.252801208611356e-07, "logits/chosen": 1.193673849105835, "logits/rejected": 3.2088775634765625, "logps/chosen": -363.78912353515625, "logps/rejected": -526.5405883789062, "loss": 0.0265, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.700319290161133, "rewards/margins": 9.979622840881348, "rewards/rejected": -7.279304504394531, "step": 2070 }, { "epoch": 0.71, "learning_rate": 4.24650635779932e-07, "logits/chosen": 1.1417957544326782, "logits/rejected": 3.5138957500457764, "logps/chosen": -296.9524841308594, "logps/rejected": -465.9798889160156, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 2.034841775894165, "rewards/margins": 11.99071979522705, "rewards/rejected": -9.955877304077148, "step": 2080 }, { "epoch": 0.71, "learning_rate": 4.240211506987284e-07, "logits/chosen": 0.8898839950561523, "logits/rejected": 2.8017234802246094, "logps/chosen": -394.20703125, "logps/rejected": -560.9544067382812, "loss": 0.0232, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.06630277633667, "rewards/margins": 11.01480770111084, "rewards/rejected": -8.948505401611328, "step": 2090 }, { "epoch": 0.71, "learning_rate": 4.233916656175248e-07, "logits/chosen": 1.0146639347076416, "logits/rejected": 2.3725476264953613, "logps/chosen": -412.14776611328125, "logps/rejected": -777.4273681640625, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 2.079822063446045, "rewards/margins": 10.065254211425781, "rewards/rejected": -7.985433101654053, "step": 2100 }, { "epoch": 0.71, "eval_logits/chosen": 0.8412470817565918, "eval_logits/rejected": 3.0052082538604736, "eval_logps/chosen": -367.7745361328125, "eval_logps/rejected": -584.4203491210938, "eval_loss": 0.029633017256855965, "eval_rewards/accuracies": 0.9915825128555298, "eval_rewards/chosen": 2.1215357780456543, "eval_rewards/margins": 10.300537109375, "eval_rewards/rejected": -8.179000854492188, "eval_runtime": 267.8451, "eval_samples_per_second": 35.468, "eval_steps_per_second": 1.109, "step": 2100 }, { "epoch": 0.72, "learning_rate": 4.227621805363213e-07, "logits/chosen": 1.0731689929962158, "logits/rejected": 2.9808132648468018, "logps/chosen": -345.23016357421875, "logps/rejected": -500.6625061035156, "loss": 0.0349, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9625818729400635, "rewards/margins": 10.356372833251953, "rewards/rejected": -8.393791198730469, "step": 2110 }, { "epoch": 0.72, "learning_rate": 4.221326954551177e-07, "logits/chosen": 1.323359727859497, "logits/rejected": 2.431626081466675, "logps/chosen": -340.8362731933594, "logps/rejected": -664.531494140625, "loss": 0.026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.480254650115967, "rewards/margins": 10.480634689331055, "rewards/rejected": -8.00037956237793, "step": 2120 }, { "epoch": 0.72, "learning_rate": 4.215032103739141e-07, "logits/chosen": 1.2426482439041138, "logits/rejected": 2.7286458015441895, "logps/chosen": -382.8769836425781, "logps/rejected": -631.4144287109375, "loss": 0.0187, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8545036315917969, "rewards/margins": 10.803675651550293, "rewards/rejected": -8.949172019958496, "step": 2130 }, { "epoch": 0.73, "learning_rate": 4.2087372529271055e-07, "logits/chosen": 1.8994210958480835, "logits/rejected": 3.1800224781036377, "logps/chosen": -349.4104919433594, "logps/rejected": -475.03564453125, "loss": 0.0814, "rewards/accuracies": 1.0, "rewards/chosen": 1.8960243463516235, "rewards/margins": 10.89116096496582, "rewards/rejected": -8.995137214660645, "step": 2140 }, { "epoch": 0.73, "learning_rate": 4.2024424021150697e-07, "logits/chosen": 1.1531842947006226, "logits/rejected": 2.8384928703308105, "logps/chosen": -323.14703369140625, "logps/rejected": -614.2777709960938, "loss": 0.0295, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.258028030395508, "rewards/margins": 10.949602127075195, "rewards/rejected": -8.691572189331055, "step": 2150 }, { "epoch": 0.73, "learning_rate": 4.1961475513030334e-07, "logits/chosen": 1.6405277252197266, "logits/rejected": 3.201380968093872, "logps/chosen": -454.3759765625, "logps/rejected": -358.6201171875, "loss": 0.031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8849576711654663, "rewards/margins": 10.349023818969727, "rewards/rejected": -8.464067459106445, "step": 2160 }, { "epoch": 0.74, "learning_rate": 4.189852700490998e-07, "logits/chosen": 1.3615721464157104, "logits/rejected": 3.039472818374634, "logps/chosen": -371.7926025390625, "logps/rejected": -482.4500427246094, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": 1.9604123830795288, "rewards/margins": 11.068208694458008, "rewards/rejected": -9.107797622680664, "step": 2170 }, { "epoch": 0.74, "learning_rate": 4.1835578496789624e-07, "logits/chosen": 1.3801050186157227, "logits/rejected": 2.936621904373169, "logps/chosen": -394.67620849609375, "logps/rejected": -551.686767578125, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 2.2390217781066895, "rewards/margins": 10.892252922058105, "rewards/rejected": -8.653233528137207, "step": 2180 }, { "epoch": 0.74, "learning_rate": 4.1772629988669266e-07, "logits/chosen": 1.5016125440597534, "logits/rejected": 2.9172017574310303, "logps/chosen": -386.29168701171875, "logps/rejected": -602.5443725585938, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 2.164307117462158, "rewards/margins": 12.321807861328125, "rewards/rejected": -10.157500267028809, "step": 2190 }, { "epoch": 0.75, "learning_rate": 4.170968148054891e-07, "logits/chosen": 1.2123631238937378, "logits/rejected": 2.915442705154419, "logps/chosen": -608.3240966796875, "logps/rejected": -524.0863037109375, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 1.4522815942764282, "rewards/margins": 10.55662727355957, "rewards/rejected": -9.10434627532959, "step": 2200 }, { "epoch": 0.75, "eval_logits/chosen": 0.8183408975601196, "eval_logits/rejected": 2.9877638816833496, "eval_logps/chosen": -367.7718811035156, "eval_logps/rejected": -586.932373046875, "eval_loss": 0.024771658703684807, "eval_rewards/accuracies": 0.9907407164573669, "eval_rewards/chosen": 2.12180233001709, "eval_rewards/margins": 10.552002906799316, "eval_rewards/rejected": -8.43019962310791, "eval_runtime": 267.5281, "eval_samples_per_second": 35.51, "eval_steps_per_second": 1.11, "step": 2200 }, { "epoch": 0.75, "learning_rate": 4.164673297242855e-07, "logits/chosen": 1.6179412603378296, "logits/rejected": 2.405421733856201, "logps/chosen": -524.8575439453125, "logps/rejected": -612.1927490234375, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 1.593804121017456, "rewards/margins": 9.6146240234375, "rewards/rejected": -8.020818710327148, "step": 2210 }, { "epoch": 0.75, "learning_rate": 4.1583784464308193e-07, "logits/chosen": 0.659404456615448, "logits/rejected": 2.946692943572998, "logps/chosen": -289.25628662109375, "logps/rejected": -510.9583435058594, "loss": 0.0309, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.205263376235962, "rewards/margins": 10.87122917175293, "rewards/rejected": -8.665966033935547, "step": 2220 }, { "epoch": 0.76, "learning_rate": 4.152083595618784e-07, "logits/chosen": 1.4942001104354858, "logits/rejected": 3.1428303718566895, "logps/chosen": -299.8153991699219, "logps/rejected": -480.1278381347656, "loss": 0.0201, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1747162342071533, "rewards/margins": 10.812664985656738, "rewards/rejected": -8.637947082519531, "step": 2230 }, { "epoch": 0.76, "learning_rate": 4.145788744806748e-07, "logits/chosen": 1.2512485980987549, "logits/rejected": 2.7009167671203613, "logps/chosen": -316.3193054199219, "logps/rejected": -531.200439453125, "loss": 0.0124, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.160095691680908, "rewards/margins": 9.601961135864258, "rewards/rejected": -7.441866397857666, "step": 2240 }, { "epoch": 0.76, "learning_rate": 4.139493893994712e-07, "logits/chosen": 1.755748987197876, "logits/rejected": 3.5284976959228516, "logps/chosen": -345.405029296875, "logps/rejected": -425.1649475097656, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 2.156236171722412, "rewards/margins": 10.638303756713867, "rewards/rejected": -8.482067108154297, "step": 2250 }, { "epoch": 0.77, "learning_rate": 4.133199043182676e-07, "logits/chosen": 1.370705008506775, "logits/rejected": 3.1592979431152344, "logps/chosen": -323.472900390625, "logps/rejected": -570.658203125, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 2.0871829986572266, "rewards/margins": 11.190869331359863, "rewards/rejected": -9.10368537902832, "step": 2260 }, { "epoch": 0.77, "learning_rate": 4.1269041923706404e-07, "logits/chosen": 0.9058796167373657, "logits/rejected": 3.13399076461792, "logps/chosen": -415.1564025878906, "logps/rejected": -534.84521484375, "loss": 0.0214, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.332183837890625, "rewards/margins": 11.294748306274414, "rewards/rejected": -8.962564468383789, "step": 2270 }, { "epoch": 0.77, "learning_rate": 4.1206093415586047e-07, "logits/chosen": 1.493805170059204, "logits/rejected": 2.8733553886413574, "logps/chosen": -335.87750244140625, "logps/rejected": -545.4400024414062, "loss": 0.0257, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.600989580154419, "rewards/margins": 10.510766983032227, "rewards/rejected": -7.909777641296387, "step": 2280 }, { "epoch": 0.78, "learning_rate": 4.1143144907465694e-07, "logits/chosen": 1.5079195499420166, "logits/rejected": 2.865042209625244, "logps/chosen": -333.11895751953125, "logps/rejected": -583.3126831054688, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 2.3659024238586426, "rewards/margins": 10.046285629272461, "rewards/rejected": -7.680383205413818, "step": 2290 }, { "epoch": 0.78, "learning_rate": 4.1080196399345336e-07, "logits/chosen": 1.1294147968292236, "logits/rejected": 2.9211654663085938, "logps/chosen": -356.6783447265625, "logps/rejected": -602.4715576171875, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": 2.0863723754882812, "rewards/margins": 9.908502578735352, "rewards/rejected": -7.822129726409912, "step": 2300 }, { "epoch": 0.78, "eval_logits/chosen": 0.7942115664482117, "eval_logits/rejected": 2.975782871246338, "eval_logps/chosen": -368.0401916503906, "eval_logps/rejected": -585.2183837890625, "eval_loss": 0.023808879777789116, "eval_rewards/accuracies": 0.9924242496490479, "eval_rewards/chosen": 2.094971179962158, "eval_rewards/margins": 10.353780746459961, "eval_rewards/rejected": -8.258810043334961, "eval_runtime": 268.1286, "eval_samples_per_second": 35.431, "eval_steps_per_second": 1.108, "step": 2300 }, { "epoch": 0.79, "learning_rate": 4.101724789122498e-07, "logits/chosen": 0.789573073387146, "logits/rejected": 2.8354058265686035, "logps/chosen": -455.05389404296875, "logps/rejected": -584.878173828125, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 2.0611634254455566, "rewards/margins": 9.882638931274414, "rewards/rejected": -7.821475982666016, "step": 2310 }, { "epoch": 0.79, "learning_rate": 4.0954299383104616e-07, "logits/chosen": 1.5280182361602783, "logits/rejected": 3.006089210510254, "logps/chosen": -358.0874328613281, "logps/rejected": -526.4508666992188, "loss": 0.0194, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.2880072593688965, "rewards/margins": 9.189247131347656, "rewards/rejected": -6.901240348815918, "step": 2320 }, { "epoch": 0.79, "learning_rate": 4.089135087498426e-07, "logits/chosen": 1.7580665349960327, "logits/rejected": 3.0975661277770996, "logps/chosen": -551.3494873046875, "logps/rejected": -511.8233947753906, "loss": 0.0276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6591579914093018, "rewards/margins": 10.34562873840332, "rewards/rejected": -8.686470031738281, "step": 2330 }, { "epoch": 0.8, "learning_rate": 4.08284023668639e-07, "logits/chosen": 1.2768250703811646, "logits/rejected": 1.9886287450790405, "logps/chosen": -363.474853515625, "logps/rejected": -855.5643310546875, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 2.3451056480407715, "rewards/margins": 10.982662200927734, "rewards/rejected": -8.637556076049805, "step": 2340 }, { "epoch": 0.8, "learning_rate": 4.076545385874355e-07, "logits/chosen": 1.0143083333969116, "logits/rejected": 3.3220772743225098, "logps/chosen": -345.3077087402344, "logps/rejected": -444.23480224609375, "loss": 0.0172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1893310546875, "rewards/margins": 11.086088180541992, "rewards/rejected": -8.896757125854492, "step": 2350 }, { "epoch": 0.8, "learning_rate": 4.070250535062319e-07, "logits/chosen": 0.9742077589035034, "logits/rejected": 3.147322177886963, "logps/chosen": -305.42431640625, "logps/rejected": -496.04156494140625, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3513169288635254, "rewards/margins": 11.449823379516602, "rewards/rejected": -9.098505020141602, "step": 2360 }, { "epoch": 0.81, "learning_rate": 4.063955684250283e-07, "logits/chosen": 1.33650803565979, "logits/rejected": 2.385652542114258, "logps/chosen": -355.97296142578125, "logps/rejected": -673.2279663085938, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 1.8403629064559937, "rewards/margins": 10.767081260681152, "rewards/rejected": -8.926717758178711, "step": 2370 }, { "epoch": 0.81, "learning_rate": 4.0576608334382475e-07, "logits/chosen": 1.350214958190918, "logits/rejected": 3.0495901107788086, "logps/chosen": -378.8364562988281, "logps/rejected": -482.08209228515625, "loss": 0.016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1247801780700684, "rewards/margins": 11.251646041870117, "rewards/rejected": -9.126866340637207, "step": 2380 }, { "epoch": 0.81, "learning_rate": 4.051365982626211e-07, "logits/chosen": 0.7518723011016846, "logits/rejected": 2.554800271987915, "logps/chosen": -378.2152404785156, "logps/rejected": -664.992919921875, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 2.0884647369384766, "rewards/margins": 10.981764793395996, "rewards/rejected": -8.893301010131836, "step": 2390 }, { "epoch": 0.82, "learning_rate": 4.0450711318141754e-07, "logits/chosen": 0.7830019593238831, "logits/rejected": 2.8458642959594727, "logps/chosen": -339.97833251953125, "logps/rejected": -581.98095703125, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 2.2123093605041504, "rewards/margins": 10.689801216125488, "rewards/rejected": -8.477492332458496, "step": 2400 }, { "epoch": 0.82, "eval_logits/chosen": 0.8048932552337646, "eval_logits/rejected": 2.971872091293335, "eval_logps/chosen": -367.2885437011719, "eval_logps/rejected": -589.0294799804688, "eval_loss": 0.02128712832927704, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 2.170135021209717, "eval_rewards/margins": 10.810053825378418, "eval_rewards/rejected": -8.63991928100586, "eval_runtime": 268.8871, "eval_samples_per_second": 35.331, "eval_steps_per_second": 1.105, "step": 2400 }, { "epoch": 0.82, "learning_rate": 4.03877628100214e-07, "logits/chosen": 1.4170640707015991, "logits/rejected": 3.0084924697875977, "logps/chosen": -338.78497314453125, "logps/rejected": -485.98419189453125, "loss": 0.017, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.2607309818267822, "rewards/margins": 10.983499526977539, "rewards/rejected": -8.722768783569336, "step": 2410 }, { "epoch": 0.82, "learning_rate": 4.0324814301901044e-07, "logits/chosen": 1.5939487218856812, "logits/rejected": 2.8795523643493652, "logps/chosen": -412.08203125, "logps/rejected": -499.1363220214844, "loss": 0.0188, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2640907764434814, "rewards/margins": 10.028116226196289, "rewards/rejected": -7.7640252113342285, "step": 2420 }, { "epoch": 0.83, "learning_rate": 4.0261865793780686e-07, "logits/chosen": 1.2058733701705933, "logits/rejected": 2.6473612785339355, "logps/chosen": -375.0848693847656, "logps/rejected": -608.9097290039062, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 2.1971919536590576, "rewards/margins": 10.215367317199707, "rewards/rejected": -8.01817512512207, "step": 2430 }, { "epoch": 0.83, "learning_rate": 4.019891728566033e-07, "logits/chosen": 1.4290034770965576, "logits/rejected": 2.5581865310668945, "logps/chosen": -384.47479248046875, "logps/rejected": -717.1683349609375, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 1.8649975061416626, "rewards/margins": 11.204587936401367, "rewards/rejected": -9.339591026306152, "step": 2440 }, { "epoch": 0.83, "learning_rate": 4.013596877753997e-07, "logits/chosen": 0.9973786473274231, "logits/rejected": 2.5864202976226807, "logps/chosen": -328.24285888671875, "logps/rejected": -633.4420166015625, "loss": 0.0219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9440271854400635, "rewards/margins": 10.755793571472168, "rewards/rejected": -8.81176471710205, "step": 2450 }, { "epoch": 0.84, "learning_rate": 4.0073020269419613e-07, "logits/chosen": 1.055772066116333, "logits/rejected": 2.9290902614593506, "logps/chosen": -300.1732177734375, "logps/rejected": -595.3922119140625, "loss": 0.0268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.2839577198028564, "rewards/margins": 11.178118705749512, "rewards/rejected": -8.894161224365234, "step": 2460 }, { "epoch": 0.84, "learning_rate": 4.0010071761299255e-07, "logits/chosen": 1.3420003652572632, "logits/rejected": 2.7169554233551025, "logps/chosen": -376.03997802734375, "logps/rejected": -562.87939453125, "loss": 0.0239, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4601471424102783, "rewards/margins": 12.127756118774414, "rewards/rejected": -9.667607307434082, "step": 2470 }, { "epoch": 0.84, "learning_rate": 3.99471232531789e-07, "logits/chosen": 1.2516037225723267, "logits/rejected": 3.1634230613708496, "logps/chosen": -407.861328125, "logps/rejected": -549.5359497070312, "loss": 0.0963, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2946739196777344, "rewards/margins": 10.591371536254883, "rewards/rejected": -9.296697616577148, "step": 2480 }, { "epoch": 0.85, "learning_rate": 3.988417474505854e-07, "logits/chosen": 1.7729272842407227, "logits/rejected": 3.2436935901641846, "logps/chosen": -366.273681640625, "logps/rejected": -456.9263610839844, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 2.1124117374420166, "rewards/margins": 10.958487510681152, "rewards/rejected": -8.846076965332031, "step": 2490 }, { "epoch": 0.85, "learning_rate": 3.982122623693818e-07, "logits/chosen": 1.80498468875885, "logits/rejected": 3.289989471435547, "logps/chosen": -427.5309143066406, "logps/rejected": -459.2511291503906, "loss": 0.0215, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.114067792892456, "rewards/margins": 12.101420402526855, "rewards/rejected": -9.987353324890137, "step": 2500 }, { "epoch": 0.85, "eval_logits/chosen": 0.7799403071403503, "eval_logits/rejected": 2.9391396045684814, "eval_logps/chosen": -367.76953125, "eval_logps/rejected": -594.5902099609375, "eval_loss": 0.022420957684516907, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": 2.1220319271087646, "eval_rewards/margins": 11.318024635314941, "eval_rewards/rejected": -9.195991516113281, "eval_runtime": 268.5512, "eval_samples_per_second": 35.375, "eval_steps_per_second": 1.106, "step": 2500 }, { "epoch": 0.85, "learning_rate": 3.9758277728817824e-07, "logits/chosen": 1.6378087997436523, "logits/rejected": 2.7543601989746094, "logps/chosen": -395.02276611328125, "logps/rejected": -580.5901489257812, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 2.1616625785827637, "rewards/margins": 11.545696258544922, "rewards/rejected": -9.384033203125, "step": 2510 }, { "epoch": 0.86, "learning_rate": 3.9695329220697467e-07, "logits/chosen": 0.9824737310409546, "logits/rejected": 2.4738383293151855, "logps/chosen": -315.70977783203125, "logps/rejected": -785.3726806640625, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 2.2579476833343506, "rewards/margins": 11.292442321777344, "rewards/rejected": -9.034494400024414, "step": 2520 }, { "epoch": 0.86, "learning_rate": 3.9632380712577114e-07, "logits/chosen": 1.0176562070846558, "logits/rejected": 2.7639870643615723, "logps/chosen": -386.0313415527344, "logps/rejected": -576.0577392578125, "loss": 0.0247, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4897618293762207, "rewards/margins": 10.882287979125977, "rewards/rejected": -8.392526626586914, "step": 2530 }, { "epoch": 0.86, "learning_rate": 3.9569432204456756e-07, "logits/chosen": 1.5484209060668945, "logits/rejected": 2.956909656524658, "logps/chosen": -316.6213684082031, "logps/rejected": -477.16070556640625, "loss": 0.0275, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1205673217773438, "rewards/margins": 10.799238204956055, "rewards/rejected": -8.678671836853027, "step": 2540 }, { "epoch": 0.87, "learning_rate": 3.9506483696336393e-07, "logits/chosen": 1.5496317148208618, "logits/rejected": 2.9354426860809326, "logps/chosen": -373.88018798828125, "logps/rejected": -591.4130859375, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 2.0508036613464355, "rewards/margins": 11.654314041137695, "rewards/rejected": -9.603509902954102, "step": 2550 }, { "epoch": 0.87, "learning_rate": 3.9443535188216036e-07, "logits/chosen": 0.9137738347053528, "logits/rejected": 2.731851577758789, "logps/chosen": -320.07855224609375, "logps/rejected": -649.6725463867188, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 2.17970609664917, "rewards/margins": 12.163008689880371, "rewards/rejected": -9.98330307006836, "step": 2560 }, { "epoch": 0.87, "learning_rate": 3.938058668009568e-07, "logits/chosen": 1.3828331232070923, "logits/rejected": 2.774026870727539, "logps/chosen": -388.3182067871094, "logps/rejected": -615.1720581054688, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 1.6940110921859741, "rewards/margins": 11.10916519165039, "rewards/rejected": -9.415154457092285, "step": 2570 }, { "epoch": 0.88, "learning_rate": 3.931763817197532e-07, "logits/chosen": 1.4449328184127808, "logits/rejected": 3.2123007774353027, "logps/chosen": -364.09442138671875, "logps/rejected": -411.33203125, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 2.192070960998535, "rewards/margins": 10.379565238952637, "rewards/rejected": -8.187494277954102, "step": 2580 }, { "epoch": 0.88, "learning_rate": 3.925468966385497e-07, "logits/chosen": 1.5864359140396118, "logits/rejected": 2.989715337753296, "logps/chosen": -428.8077087402344, "logps/rejected": -388.0070495605469, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 2.4646167755126953, "rewards/margins": 11.220399856567383, "rewards/rejected": -8.755781173706055, "step": 2590 }, { "epoch": 0.88, "learning_rate": 3.919174115573461e-07, "logits/chosen": 1.2315651178359985, "logits/rejected": 2.7592966556549072, "logps/chosen": -322.261962890625, "logps/rejected": -582.4110717773438, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": 2.216446876525879, "rewards/margins": 12.121309280395508, "rewards/rejected": -9.904863357543945, "step": 2600 }, { "epoch": 0.88, "eval_logits/chosen": 0.7932816743850708, "eval_logits/rejected": 2.929717540740967, "eval_logps/chosen": -368.6217346191406, "eval_logps/rejected": -596.0587158203125, "eval_loss": 0.019284222275018692, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": 2.036813259124756, "eval_rewards/margins": 11.379647254943848, "eval_rewards/rejected": -9.34283447265625, "eval_runtime": 268.0206, "eval_samples_per_second": 35.445, "eval_steps_per_second": 1.108, "step": 2600 }, { "epoch": 0.89, "learning_rate": 3.912879264761425e-07, "logits/chosen": 0.6048256158828735, "logits/rejected": 2.5253026485443115, "logps/chosen": -369.83587646484375, "logps/rejected": -640.2315673828125, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 2.063535690307617, "rewards/margins": 10.798006057739258, "rewards/rejected": -8.73447036743164, "step": 2610 }, { "epoch": 0.89, "learning_rate": 3.906584413949389e-07, "logits/chosen": 1.3932554721832275, "logits/rejected": 2.831205368041992, "logps/chosen": -462.0, "logps/rejected": -515.1595458984375, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 1.8407312631607056, "rewards/margins": 9.663253784179688, "rewards/rejected": -7.8225226402282715, "step": 2620 }, { "epoch": 0.89, "learning_rate": 3.900289563137353e-07, "logits/chosen": 1.561541199684143, "logits/rejected": 2.811318874359131, "logps/chosen": -347.90399169921875, "logps/rejected": -610.0369873046875, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 1.943642020225525, "rewards/margins": 12.20551586151123, "rewards/rejected": -10.261874198913574, "step": 2630 }, { "epoch": 0.9, "learning_rate": 3.8939947123253174e-07, "logits/chosen": 1.2890950441360474, "logits/rejected": 3.1064820289611816, "logps/chosen": -348.0389709472656, "logps/rejected": -499.8296813964844, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 1.4777417182922363, "rewards/margins": 9.751089096069336, "rewards/rejected": -8.273346900939941, "step": 2640 }, { "epoch": 0.9, "learning_rate": 3.887699861513282e-07, "logits/chosen": 1.138880729675293, "logits/rejected": 3.108267068862915, "logps/chosen": -288.1042785644531, "logps/rejected": -444.10870361328125, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 1.9868762493133545, "rewards/margins": 11.673996925354004, "rewards/rejected": -9.687118530273438, "step": 2650 }, { "epoch": 0.9, "learning_rate": 3.8814050107012464e-07, "logits/chosen": 1.364803671836853, "logits/rejected": 3.290356159210205, "logps/chosen": -384.6515197753906, "logps/rejected": -439.424072265625, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 2.177926540374756, "rewards/margins": 12.535216331481934, "rewards/rejected": -10.357290267944336, "step": 2660 }, { "epoch": 0.91, "learning_rate": 3.8751101598892106e-07, "logits/chosen": 0.7594862580299377, "logits/rejected": 2.463059186935425, "logps/chosen": -334.9971008300781, "logps/rejected": -688.7122192382812, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.329500436782837, "rewards/margins": 11.81839656829834, "rewards/rejected": -9.488895416259766, "step": 2670 }, { "epoch": 0.91, "learning_rate": 3.868815309077175e-07, "logits/chosen": 1.1251529455184937, "logits/rejected": 2.556021213531494, "logps/chosen": -383.8106994628906, "logps/rejected": -728.5634765625, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 1.8957366943359375, "rewards/margins": 11.471336364746094, "rewards/rejected": -9.575600624084473, "step": 2680 }, { "epoch": 0.91, "learning_rate": 3.862520458265139e-07, "logits/chosen": 1.1408156156539917, "logits/rejected": 2.342296838760376, "logps/chosen": -417.59320068359375, "logps/rejected": -750.766357421875, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 1.7239834070205688, "rewards/margins": 12.186110496520996, "rewards/rejected": -10.462127685546875, "step": 2690 }, { "epoch": 0.92, "learning_rate": 3.856225607453103e-07, "logits/chosen": 0.2998413145542145, "logits/rejected": 3.279633045196533, "logps/chosen": -267.75390625, "logps/rejected": -492.7950134277344, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 1.8813514709472656, "rewards/margins": 11.568644523620605, "rewards/rejected": -9.687294006347656, "step": 2700 }, { "epoch": 0.92, "eval_logits/chosen": 0.7627749443054199, "eval_logits/rejected": 2.911400079727173, "eval_logps/chosen": -369.9328308105469, "eval_logps/rejected": -597.5867309570312, "eval_loss": 0.017963021993637085, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 1.9057058095932007, "eval_rewards/margins": 11.401346206665039, "eval_rewards/rejected": -9.49563980102539, "eval_runtime": 268.4558, "eval_samples_per_second": 35.388, "eval_steps_per_second": 1.106, "step": 2700 }, { "epoch": 0.92, "learning_rate": 3.8499307566410675e-07, "logits/chosen": 1.146585464477539, "logits/rejected": 2.8043265342712402, "logps/chosen": -323.42303466796875, "logps/rejected": -566.1694946289062, "loss": 0.0192, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6829410791397095, "rewards/margins": 12.46528148651123, "rewards/rejected": -10.782341003417969, "step": 2710 }, { "epoch": 0.92, "learning_rate": 3.843635905829032e-07, "logits/chosen": 0.9065067172050476, "logits/rejected": 2.6599488258361816, "logps/chosen": -466.5685119628906, "logps/rejected": -674.6013793945312, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": 1.813049077987671, "rewards/margins": 10.931371688842773, "rewards/rejected": -9.118322372436523, "step": 2720 }, { "epoch": 0.93, "learning_rate": 3.837341055016996e-07, "logits/chosen": 1.059467077255249, "logits/rejected": 2.8486685752868652, "logps/chosen": -314.97503662109375, "logps/rejected": -662.3733520507812, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 1.9159355163574219, "rewards/margins": 10.187853813171387, "rewards/rejected": -8.271917343139648, "step": 2730 }, { "epoch": 0.93, "learning_rate": 3.83104620420496e-07, "logits/chosen": 0.7870198488235474, "logits/rejected": 2.1669564247131348, "logps/chosen": -379.6060485839844, "logps/rejected": -886.0685424804688, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 1.5106027126312256, "rewards/margins": 10.739408493041992, "rewards/rejected": -9.228806495666504, "step": 2740 }, { "epoch": 0.93, "learning_rate": 3.8247513533929244e-07, "logits/chosen": 0.8939107060432434, "logits/rejected": 2.5925869941711426, "logps/chosen": -322.8708190917969, "logps/rejected": -716.4050903320312, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 1.69145929813385, "rewards/margins": 11.752408981323242, "rewards/rejected": -10.060951232910156, "step": 2750 }, { "epoch": 0.94, "learning_rate": 3.8184565025808887e-07, "logits/chosen": 1.5585074424743652, "logits/rejected": 2.8523619174957275, "logps/chosen": -417.64892578125, "logps/rejected": -644.0786743164062, "loss": 0.0207, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8149226903915405, "rewards/margins": 11.294268608093262, "rewards/rejected": -9.47934627532959, "step": 2760 }, { "epoch": 0.94, "learning_rate": 3.8121616517688534e-07, "logits/chosen": 0.7556962370872498, "logits/rejected": 2.8186545372009277, "logps/chosen": -309.7710266113281, "logps/rejected": -595.9371337890625, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 2.438164234161377, "rewards/margins": 11.598024368286133, "rewards/rejected": -9.159860610961914, "step": 2770 }, { "epoch": 0.94, "learning_rate": 3.805866800956817e-07, "logits/chosen": 1.1116828918457031, "logits/rejected": 3.601109027862549, "logps/chosen": -313.4578552246094, "logps/rejected": -368.9190368652344, "loss": 0.0204, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6321697235107422, "rewards/margins": 11.596505165100098, "rewards/rejected": -9.964335441589355, "step": 2780 }, { "epoch": 0.95, "learning_rate": 3.7995719501447813e-07, "logits/chosen": 1.1926578283309937, "logits/rejected": 2.93463134765625, "logps/chosen": -332.8338928222656, "logps/rejected": -521.95361328125, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 2.0567336082458496, "rewards/margins": 12.912908554077148, "rewards/rejected": -10.856175422668457, "step": 2790 }, { "epoch": 0.95, "learning_rate": 3.7932770993327456e-07, "logits/chosen": 1.2860372066497803, "logits/rejected": 2.5439960956573486, "logps/chosen": -330.3837890625, "logps/rejected": -797.3041381835938, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 1.8657745122909546, "rewards/margins": 11.384241104125977, "rewards/rejected": -9.51846694946289, "step": 2800 }, { "epoch": 0.95, "eval_logits/chosen": 0.7736470699310303, "eval_logits/rejected": 2.922307014465332, "eval_logps/chosen": -369.0752258300781, "eval_logps/rejected": -596.8948974609375, "eval_loss": 0.019366171211004257, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": 1.9914653301239014, "eval_rewards/margins": 11.417922973632812, "eval_rewards/rejected": -9.426457405090332, "eval_runtime": 268.0209, "eval_samples_per_second": 35.445, "eval_steps_per_second": 1.108, "step": 2800 }, { "epoch": 0.96, "learning_rate": 3.78698224852071e-07, "logits/chosen": 1.0611159801483154, "logits/rejected": 3.3828582763671875, "logps/chosen": -308.9765319824219, "logps/rejected": -434.8480529785156, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 1.962587594985962, "rewards/margins": 11.504419326782227, "rewards/rejected": -9.54183292388916, "step": 2810 }, { "epoch": 0.96, "learning_rate": 3.780687397708674e-07, "logits/chosen": 1.3641759157180786, "logits/rejected": 3.2257022857666016, "logps/chosen": -328.934814453125, "logps/rejected": -443.7102966308594, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 1.9324061870574951, "rewards/margins": 10.669090270996094, "rewards/rejected": -8.736684799194336, "step": 2820 }, { "epoch": 0.96, "learning_rate": 3.774392546896638e-07, "logits/chosen": 1.0812848806381226, "logits/rejected": 2.389310598373413, "logps/chosen": -402.924072265625, "logps/rejected": -687.3925170898438, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 2.1839425563812256, "rewards/margins": 10.521379470825195, "rewards/rejected": -8.33743667602539, "step": 2830 }, { "epoch": 0.97, "learning_rate": 3.768097696084603e-07, "logits/chosen": 1.0410970449447632, "logits/rejected": 2.740626335144043, "logps/chosen": -360.38861083984375, "logps/rejected": -626.6319580078125, "loss": 0.021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4108242988586426, "rewards/margins": 13.338290214538574, "rewards/rejected": -10.92746639251709, "step": 2840 }, { "epoch": 0.97, "learning_rate": 3.761802845272567e-07, "logits/chosen": 0.6179854273796082, "logits/rejected": 2.8828322887420654, "logps/chosen": -372.5259094238281, "logps/rejected": -498.89093017578125, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 1.887270212173462, "rewards/margins": 11.659965515136719, "rewards/rejected": -9.772695541381836, "step": 2850 }, { "epoch": 0.97, "learning_rate": 3.755507994460531e-07, "logits/chosen": 1.0825567245483398, "logits/rejected": 3.1808676719665527, "logps/chosen": -344.6096496582031, "logps/rejected": -501.6756896972656, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 2.016951084136963, "rewards/margins": 11.96239185333252, "rewards/rejected": -9.945440292358398, "step": 2860 }, { "epoch": 0.98, "learning_rate": 3.749213143648495e-07, "logits/chosen": 0.966810405254364, "logits/rejected": 2.5417160987854004, "logps/chosen": -432.48321533203125, "logps/rejected": -644.724365234375, "loss": 0.0302, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2609174251556396, "rewards/margins": 10.42978286743164, "rewards/rejected": -8.168864250183105, "step": 2870 }, { "epoch": 0.98, "learning_rate": 3.7429182928364594e-07, "logits/chosen": 0.7974327206611633, "logits/rejected": 2.231346368789673, "logps/chosen": -384.548828125, "logps/rejected": -807.3264770507812, "loss": 0.0185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.849522352218628, "rewards/margins": 10.164338111877441, "rewards/rejected": -8.314815521240234, "step": 2880 }, { "epoch": 0.98, "learning_rate": 3.7366234420244236e-07, "logits/chosen": 1.5715937614440918, "logits/rejected": 2.6558785438537598, "logps/chosen": -506.68157958984375, "logps/rejected": -646.1708374023438, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 1.9481022357940674, "rewards/margins": 11.122952461242676, "rewards/rejected": -9.174850463867188, "step": 2890 }, { "epoch": 0.99, "learning_rate": 3.7303285912123884e-07, "logits/chosen": 1.315989375114441, "logits/rejected": 2.911738157272339, "logps/chosen": -338.9214782714844, "logps/rejected": -540.4888916015625, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 2.19695782661438, "rewards/margins": 11.435901641845703, "rewards/rejected": -9.238944053649902, "step": 2900 }, { "epoch": 0.99, "eval_logits/chosen": 0.759222686290741, "eval_logits/rejected": 2.918637275695801, "eval_logps/chosen": -368.2200927734375, "eval_logps/rejected": -594.5847778320312, "eval_loss": 0.0181864183396101, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 2.0769829750061035, "eval_rewards/margins": 11.272428512573242, "eval_rewards/rejected": -9.195446014404297, "eval_runtime": 267.5725, "eval_samples_per_second": 35.504, "eval_steps_per_second": 1.11, "step": 2900 }, { "epoch": 0.99, "learning_rate": 3.7240337404003526e-07, "logits/chosen": 1.4009299278259277, "logits/rejected": 2.185304880142212, "logps/chosen": -324.7784729003906, "logps/rejected": -688.9271240234375, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 1.8561160564422607, "rewards/margins": 10.968622207641602, "rewards/rejected": -9.112505912780762, "step": 2910 }, { "epoch": 0.99, "learning_rate": 3.717738889588317e-07, "logits/chosen": 1.8506414890289307, "logits/rejected": 2.459937572479248, "logps/chosen": -335.97039794921875, "logps/rejected": -724.4388427734375, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 1.7944343090057373, "rewards/margins": 11.212178230285645, "rewards/rejected": -9.417744636535645, "step": 2920 }, { "epoch": 1.0, "learning_rate": 3.7114440387762805e-07, "logits/chosen": 1.602473497390747, "logits/rejected": 3.060894727706909, "logps/chosen": -315.49273681640625, "logps/rejected": -431.2628479003906, "loss": 0.0145, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9308958053588867, "rewards/margins": 11.219779968261719, "rewards/rejected": -9.288885116577148, "step": 2930 }, { "epoch": 1.0, "learning_rate": 3.705149187964245e-07, "logits/chosen": 1.5184214115142822, "logits/rejected": 2.9138710498809814, "logps/chosen": -386.93157958984375, "logps/rejected": -530.5115966796875, "loss": 0.0155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.803008794784546, "rewards/margins": 11.285566329956055, "rewards/rejected": -9.48255729675293, "step": 2940 }, { "epoch": 1.0, "learning_rate": 3.698854337152209e-07, "logits/chosen": 1.690629005432129, "logits/rejected": 3.274028778076172, "logps/chosen": -315.9340515136719, "logps/rejected": -507.6627502441406, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 2.0165982246398926, "rewards/margins": 11.94556999206543, "rewards/rejected": -9.928972244262695, "step": 2950 }, { "epoch": 1.01, "learning_rate": 3.692559486340174e-07, "logits/chosen": 0.7777345180511475, "logits/rejected": 2.230090856552124, "logps/chosen": -353.92840576171875, "logps/rejected": -908.7879028320312, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 1.9050586223602295, "rewards/margins": 12.09577465057373, "rewards/rejected": -10.190717697143555, "step": 2960 }, { "epoch": 1.01, "learning_rate": 3.686264635528138e-07, "logits/chosen": 0.5998127460479736, "logits/rejected": 3.0432848930358887, "logps/chosen": -305.62451171875, "logps/rejected": -328.70220947265625, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 1.7932469844818115, "rewards/margins": 10.898086547851562, "rewards/rejected": -9.104839324951172, "step": 2970 }, { "epoch": 1.01, "learning_rate": 3.679969784716102e-07, "logits/chosen": 0.6457049250602722, "logits/rejected": 3.008762836456299, "logps/chosen": -289.96527099609375, "logps/rejected": -502.66107177734375, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 1.8909308910369873, "rewards/margins": 11.495240211486816, "rewards/rejected": -9.604308128356934, "step": 2980 }, { "epoch": 1.02, "learning_rate": 3.6736749339040664e-07, "logits/chosen": 1.1077024936676025, "logits/rejected": 2.665431499481201, "logps/chosen": -381.0195617675781, "logps/rejected": -638.0323486328125, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 2.257869005203247, "rewards/margins": 12.202370643615723, "rewards/rejected": -9.944503784179688, "step": 2990 }, { "epoch": 1.02, "learning_rate": 3.6673800830920307e-07, "logits/chosen": 1.476149320602417, "logits/rejected": 2.7508435249328613, "logps/chosen": -362.0536804199219, "logps/rejected": -539.8151245117188, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 2.177727222442627, "rewards/margins": 10.220754623413086, "rewards/rejected": -8.0430269241333, "step": 3000 }, { "epoch": 1.02, "eval_logits/chosen": 0.7556570768356323, "eval_logits/rejected": 2.895693063735962, "eval_logps/chosen": -369.8956604003906, "eval_logps/rejected": -597.5946655273438, "eval_loss": 0.017979048192501068, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 1.909419298171997, "eval_rewards/margins": 11.405853271484375, "eval_rewards/rejected": -9.496432304382324, "eval_runtime": 268.1886, "eval_samples_per_second": 35.423, "eval_steps_per_second": 1.107, "step": 3000 }, { "epoch": 1.02, "learning_rate": 3.6610852322799943e-07, "logits/chosen": 0.7976253628730774, "logits/rejected": 2.7230780124664307, "logps/chosen": -431.26348876953125, "logps/rejected": -602.6639404296875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 2.011127471923828, "rewards/margins": 11.370102882385254, "rewards/rejected": -9.358975410461426, "step": 3010 }, { "epoch": 1.03, "learning_rate": 3.654790381467959e-07, "logits/chosen": 1.573567271232605, "logits/rejected": 3.549926280975342, "logps/chosen": -337.05633544921875, "logps/rejected": -439.34820556640625, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 2.0311856269836426, "rewards/margins": 12.150315284729004, "rewards/rejected": -10.11913013458252, "step": 3020 }, { "epoch": 1.03, "learning_rate": 3.6484955306559233e-07, "logits/chosen": 1.0998715162277222, "logits/rejected": 2.7142605781555176, "logps/chosen": -445.3197326660156, "logps/rejected": -485.9790954589844, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 2.2856061458587646, "rewards/margins": 11.167757034301758, "rewards/rejected": -8.882150650024414, "step": 3030 }, { "epoch": 1.03, "learning_rate": 3.6422006798438876e-07, "logits/chosen": 0.7898514270782471, "logits/rejected": 2.9154164791107178, "logps/chosen": -340.6654357910156, "logps/rejected": -562.8787841796875, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 2.05285906791687, "rewards/margins": 11.81180477142334, "rewards/rejected": -9.758944511413574, "step": 3040 }, { "epoch": 1.04, "learning_rate": 3.635905829031852e-07, "logits/chosen": 0.8585756421089172, "logits/rejected": 3.1079821586608887, "logps/chosen": -429.490478515625, "logps/rejected": -481.5127868652344, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 1.7858238220214844, "rewards/margins": 11.667909622192383, "rewards/rejected": -9.882083892822266, "step": 3050 }, { "epoch": 1.04, "learning_rate": 3.629610978219816e-07, "logits/chosen": 1.2229456901550293, "logits/rejected": 2.210904836654663, "logps/chosen": -349.9075012207031, "logps/rejected": -760.1993408203125, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 2.1999588012695312, "rewards/margins": 11.409255981445312, "rewards/rejected": -9.209296226501465, "step": 3060 }, { "epoch": 1.04, "learning_rate": 3.62331612740778e-07, "logits/chosen": 1.2499350309371948, "logits/rejected": 2.8214731216430664, "logps/chosen": -432.19989013671875, "logps/rejected": -587.6690673828125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 2.0051960945129395, "rewards/margins": 12.072220802307129, "rewards/rejected": -10.067024230957031, "step": 3070 }, { "epoch": 1.05, "learning_rate": 3.617021276595745e-07, "logits/chosen": 1.251936912536621, "logits/rejected": 2.393967390060425, "logps/chosen": -538.280029296875, "logps/rejected": -499.31427001953125, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 1.7456916570663452, "rewards/margins": 10.943967819213867, "rewards/rejected": -9.198275566101074, "step": 3080 }, { "epoch": 1.05, "learning_rate": 3.6107264257837087e-07, "logits/chosen": 0.43956202268600464, "logits/rejected": 2.4506821632385254, "logps/chosen": -369.17193603515625, "logps/rejected": -669.3594970703125, "loss": 0.0104, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8963171243667603, "rewards/margins": 11.685440063476562, "rewards/rejected": -9.78912353515625, "step": 3090 }, { "epoch": 1.05, "learning_rate": 3.604431574971673e-07, "logits/chosen": 1.2288469076156616, "logits/rejected": 2.752350091934204, "logps/chosen": -352.03094482421875, "logps/rejected": -578.940673828125, "loss": 0.011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6200469732284546, "rewards/margins": 10.573201179504395, "rewards/rejected": -8.953152656555176, "step": 3100 }, { "epoch": 1.05, "eval_logits/chosen": 0.7294398546218872, "eval_logits/rejected": 2.855963945388794, "eval_logps/chosen": -368.9812316894531, "eval_logps/rejected": -601.975830078125, "eval_loss": 0.015040040947496891, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 2.0008652210235596, "eval_rewards/margins": 11.935413360595703, "eval_rewards/rejected": -9.934547424316406, "eval_runtime": 267.2447, "eval_samples_per_second": 35.548, "eval_steps_per_second": 1.111, "step": 3100 }, { "epoch": 1.06, "learning_rate": 3.598136724159637e-07, "logits/chosen": 1.2908390760421753, "logits/rejected": 3.2633769512176514, "logps/chosen": -426.2572326660156, "logps/rejected": -449.2496032714844, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 2.091630458831787, "rewards/margins": 12.465272903442383, "rewards/rejected": -10.37364387512207, "step": 3110 }, { "epoch": 1.06, "learning_rate": 3.5918418733476014e-07, "logits/chosen": 1.0254614353179932, "logits/rejected": 2.644998788833618, "logps/chosen": -444.62115478515625, "logps/rejected": -557.4613037109375, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 2.416001081466675, "rewards/margins": 13.090082168579102, "rewards/rejected": -10.674080848693848, "step": 3120 }, { "epoch": 1.06, "learning_rate": 3.5855470225355656e-07, "logits/chosen": 0.3688656985759735, "logits/rejected": 2.787039279937744, "logps/chosen": -371.48199462890625, "logps/rejected": -606.9403076171875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.235377788543701, "rewards/margins": 11.422839164733887, "rewards/rejected": -9.187460899353027, "step": 3130 }, { "epoch": 1.07, "learning_rate": 3.5792521717235304e-07, "logits/chosen": 1.1069527864456177, "logits/rejected": 2.7388956546783447, "logps/chosen": -383.25970458984375, "logps/rejected": -508.56884765625, "loss": 0.0196, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.3463032245635986, "rewards/margins": 13.309300422668457, "rewards/rejected": -10.962995529174805, "step": 3140 }, { "epoch": 1.07, "learning_rate": 3.5729573209114946e-07, "logits/chosen": 1.341090202331543, "logits/rejected": 3.0517337322235107, "logps/chosen": -318.0455627441406, "logps/rejected": -569.5386962890625, "loss": 0.0121, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2804172039031982, "rewards/margins": 10.993358612060547, "rewards/rejected": -8.71294116973877, "step": 3150 }, { "epoch": 1.07, "learning_rate": 3.5666624700994583e-07, "logits/chosen": 0.7650747299194336, "logits/rejected": 2.015530824661255, "logps/chosen": -323.05804443359375, "logps/rejected": -855.6818237304688, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 2.203866481781006, "rewards/margins": 12.014130592346191, "rewards/rejected": -9.810262680053711, "step": 3160 }, { "epoch": 1.08, "learning_rate": 3.5603676192874225e-07, "logits/chosen": 0.6404491066932678, "logits/rejected": 2.8913769721984863, "logps/chosen": -307.3809509277344, "logps/rejected": -477.2265625, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 2.644552230834961, "rewards/margins": 11.691588401794434, "rewards/rejected": -9.047036170959473, "step": 3170 }, { "epoch": 1.08, "learning_rate": 3.554072768475387e-07, "logits/chosen": 1.2438385486602783, "logits/rejected": 2.9186463356018066, "logps/chosen": -448.6307067871094, "logps/rejected": -547.8207397460938, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 2.0710911750793457, "rewards/margins": 10.842884063720703, "rewards/rejected": -8.7717924118042, "step": 3180 }, { "epoch": 1.08, "learning_rate": 3.547777917663351e-07, "logits/chosen": 0.6184150576591492, "logits/rejected": 2.433743953704834, "logps/chosen": -344.1460876464844, "logps/rejected": -703.8804931640625, "loss": 0.0122, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.277899742126465, "rewards/margins": 13.781367301940918, "rewards/rejected": -11.503466606140137, "step": 3190 }, { "epoch": 1.09, "learning_rate": 3.5414830668513157e-07, "logits/chosen": 0.9604349136352539, "logits/rejected": 2.8598134517669678, "logps/chosen": -297.17474365234375, "logps/rejected": -472.0577697753906, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 2.357182025909424, "rewards/margins": 12.045616149902344, "rewards/rejected": -9.688433647155762, "step": 3200 }, { "epoch": 1.09, "eval_logits/chosen": 0.7070604562759399, "eval_logits/rejected": 2.8564767837524414, "eval_logps/chosen": -368.12896728515625, "eval_logps/rejected": -598.782958984375, "eval_loss": 0.01390204019844532, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 2.0860908031463623, "eval_rewards/margins": 11.701354026794434, "eval_rewards/rejected": -9.615262985229492, "eval_runtime": 267.9523, "eval_samples_per_second": 35.454, "eval_steps_per_second": 1.108, "step": 3200 }, { "epoch": 1.09, "learning_rate": 3.53518821603928e-07, "logits/chosen": 1.42811918258667, "logits/rejected": 3.1818394660949707, "logps/chosen": -312.5201110839844, "logps/rejected": -479.3251953125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 1.7172324657440186, "rewards/margins": 11.587867736816406, "rewards/rejected": -9.870635986328125, "step": 3210 }, { "epoch": 1.09, "learning_rate": 3.528893365227244e-07, "logits/chosen": 1.412206768989563, "logits/rejected": 2.872518301010132, "logps/chosen": -452.25439453125, "logps/rejected": -519.6819458007812, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 1.7541221380233765, "rewards/margins": 11.772988319396973, "rewards/rejected": -10.018865585327148, "step": 3220 }, { "epoch": 1.1, "learning_rate": 3.5225985144152084e-07, "logits/chosen": 0.8214865922927856, "logits/rejected": 3.258615493774414, "logps/chosen": -339.0177307128906, "logps/rejected": -487.43475341796875, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 2.4326157569885254, "rewards/margins": 13.359159469604492, "rewards/rejected": -10.926544189453125, "step": 3230 }, { "epoch": 1.1, "learning_rate": 3.516303663603172e-07, "logits/chosen": 1.7511094808578491, "logits/rejected": 3.0158190727233887, "logps/chosen": -378.07861328125, "logps/rejected": -562.0789794921875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 2.06204891204834, "rewards/margins": 11.8056001663208, "rewards/rejected": -9.743552207946777, "step": 3240 }, { "epoch": 1.1, "learning_rate": 3.5100088127911363e-07, "logits/chosen": 0.7212003469467163, "logits/rejected": 3.1654887199401855, "logps/chosen": -399.91644287109375, "logps/rejected": -462.4659118652344, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 2.0864148139953613, "rewards/margins": 13.867498397827148, "rewards/rejected": -11.781085014343262, "step": 3250 }, { "epoch": 1.11, "learning_rate": 3.503713961979101e-07, "logits/chosen": 1.574752688407898, "logits/rejected": 2.749525547027588, "logps/chosen": -357.22637939453125, "logps/rejected": -579.518310546875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 1.607234001159668, "rewards/margins": 11.814053535461426, "rewards/rejected": -10.206819534301758, "step": 3260 }, { "epoch": 1.11, "learning_rate": 3.4974191111670653e-07, "logits/chosen": 1.2592524290084839, "logits/rejected": 2.282480239868164, "logps/chosen": -393.34625244140625, "logps/rejected": -747.9046020507812, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 2.0157406330108643, "rewards/margins": 11.675127029418945, "rewards/rejected": -9.65938663482666, "step": 3270 }, { "epoch": 1.11, "learning_rate": 3.4911242603550296e-07, "logits/chosen": 1.7345688343048096, "logits/rejected": 2.5775959491729736, "logps/chosen": -336.7502136230469, "logps/rejected": -515.5429077148438, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 1.4989908933639526, "rewards/margins": 10.366995811462402, "rewards/rejected": -8.868005752563477, "step": 3280 }, { "epoch": 1.12, "learning_rate": 3.484829409542994e-07, "logits/chosen": 1.371095061302185, "logits/rejected": 2.897758960723877, "logps/chosen": -422.10595703125, "logps/rejected": -568.4336547851562, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 2.2941784858703613, "rewards/margins": 13.477294921875, "rewards/rejected": -11.183117866516113, "step": 3290 }, { "epoch": 1.12, "learning_rate": 3.478534558730958e-07, "logits/chosen": 1.2534377574920654, "logits/rejected": 2.912048816680908, "logps/chosen": -365.91363525390625, "logps/rejected": -521.9572143554688, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 1.7988227605819702, "rewards/margins": 11.863677024841309, "rewards/rejected": -10.064854621887207, "step": 3300 }, { "epoch": 1.12, "eval_logits/chosen": 0.7083035111427307, "eval_logits/rejected": 2.828965902328491, "eval_logps/chosen": -369.2344055175781, "eval_logps/rejected": -606.5661010742188, "eval_loss": 0.013400154188275337, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 1.9755483865737915, "eval_rewards/margins": 12.369123458862305, "eval_rewards/rejected": -10.393575668334961, "eval_runtime": 268.3764, "eval_samples_per_second": 35.398, "eval_steps_per_second": 1.107, "step": 3300 }, { "epoch": 1.13, "learning_rate": 3.4722397079189217e-07, "logits/chosen": 0.8336647748947144, "logits/rejected": 2.756704568862915, "logps/chosen": -324.1752624511719, "logps/rejected": -625.7420654296875, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8257815837860107, "rewards/margins": 12.731443405151367, "rewards/rejected": -10.905664443969727, "step": 3310 }, { "epoch": 1.13, "learning_rate": 3.4659448571068865e-07, "logits/chosen": 0.3902451694011688, "logits/rejected": 2.532710552215576, "logps/chosen": -473.7185974121094, "logps/rejected": -582.3638305664062, "loss": 0.0125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6804431676864624, "rewards/margins": 12.236088752746582, "rewards/rejected": -10.555645942687988, "step": 3320 }, { "epoch": 1.13, "learning_rate": 3.4596500062948507e-07, "logits/chosen": 0.8588132858276367, "logits/rejected": 2.571681261062622, "logps/chosen": -399.33587646484375, "logps/rejected": -604.447265625, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 2.3679261207580566, "rewards/margins": 13.345097541809082, "rewards/rejected": -10.97716999053955, "step": 3330 }, { "epoch": 1.14, "learning_rate": 3.453355155482815e-07, "logits/chosen": 0.8485054969787598, "logits/rejected": 2.0267395973205566, "logps/chosen": -366.140869140625, "logps/rejected": -839.7205200195312, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.0398154258728027, "rewards/margins": 12.325020790100098, "rewards/rejected": -10.28520393371582, "step": 3340 }, { "epoch": 1.14, "learning_rate": 3.447060304670779e-07, "logits/chosen": 0.7669464349746704, "logits/rejected": 2.867802143096924, "logps/chosen": -337.73590087890625, "logps/rejected": -584.9771728515625, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 2.1630008220672607, "rewards/margins": 13.043685913085938, "rewards/rejected": -10.880684852600098, "step": 3350 }, { "epoch": 1.14, "learning_rate": 3.4407654538587434e-07, "logits/chosen": 0.9161348342895508, "logits/rejected": 2.3917458057403564, "logps/chosen": -380.1439514160156, "logps/rejected": -691.7762451171875, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 1.82815420627594, "rewards/margins": 11.48265552520752, "rewards/rejected": -9.654500961303711, "step": 3360 }, { "epoch": 1.15, "learning_rate": 3.4344706030467076e-07, "logits/chosen": 0.9669869542121887, "logits/rejected": 2.661142349243164, "logps/chosen": -324.63262939453125, "logps/rejected": -747.2676391601562, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 1.553868055343628, "rewards/margins": 11.775548934936523, "rewards/rejected": -10.221680641174316, "step": 3370 }, { "epoch": 1.15, "learning_rate": 3.4281757522346724e-07, "logits/chosen": 1.1901271343231201, "logits/rejected": 2.964195966720581, "logps/chosen": -433.86602783203125, "logps/rejected": -518.0977783203125, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 2.0861451625823975, "rewards/margins": 12.089399337768555, "rewards/rejected": -10.003252029418945, "step": 3380 }, { "epoch": 1.15, "learning_rate": 3.421880901422636e-07, "logits/chosen": 0.620611310005188, "logits/rejected": 2.369555950164795, "logps/chosen": -368.126708984375, "logps/rejected": -743.3250122070312, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 1.87155020236969, "rewards/margins": 12.570436477661133, "rewards/rejected": -10.698884963989258, "step": 3390 }, { "epoch": 1.16, "learning_rate": 3.4155860506106003e-07, "logits/chosen": 0.7373756170272827, "logits/rejected": 2.399385452270508, "logps/chosen": -397.4166564941406, "logps/rejected": -620.8201904296875, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 1.5980751514434814, "rewards/margins": 11.54615306854248, "rewards/rejected": -9.948077201843262, "step": 3400 }, { "epoch": 1.16, "eval_logits/chosen": 0.7183800935745239, "eval_logits/rejected": 2.821152925491333, "eval_logps/chosen": -369.2712097167969, "eval_logps/rejected": -606.4811401367188, "eval_loss": 0.012870008125901222, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 1.9718691110610962, "eval_rewards/margins": 12.356949806213379, "eval_rewards/rejected": -10.38508129119873, "eval_runtime": 269.4638, "eval_samples_per_second": 35.255, "eval_steps_per_second": 1.102, "step": 3400 }, { "epoch": 1.16, "learning_rate": 3.4092911997985645e-07, "logits/chosen": 1.1883623600006104, "logits/rejected": 2.577510356903076, "logps/chosen": -362.335205078125, "logps/rejected": -606.2926025390625, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4554922580718994, "rewards/margins": 11.256914138793945, "rewards/rejected": -9.801423072814941, "step": 3410 }, { "epoch": 1.16, "learning_rate": 3.402996348986529e-07, "logits/chosen": 1.3287720680236816, "logits/rejected": 2.5388364791870117, "logps/chosen": -438.2972106933594, "logps/rejected": -667.9464111328125, "loss": 0.0185, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7842785120010376, "rewards/margins": 11.153425216674805, "rewards/rejected": -9.369147300720215, "step": 3420 }, { "epoch": 1.17, "learning_rate": 3.396701498174493e-07, "logits/chosen": 1.2645375728607178, "logits/rejected": 2.871066093444824, "logps/chosen": -398.85028076171875, "logps/rejected": -565.3001098632812, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 2.0519790649414062, "rewards/margins": 11.566902160644531, "rewards/rejected": -9.514923095703125, "step": 3430 }, { "epoch": 1.17, "learning_rate": 3.3904066473624577e-07, "logits/chosen": 0.7664368152618408, "logits/rejected": 2.894336223602295, "logps/chosen": -297.42779541015625, "logps/rejected": -642.1676635742188, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9993404150009155, "rewards/margins": 12.201227188110352, "rewards/rejected": -10.201887130737305, "step": 3440 }, { "epoch": 1.17, "learning_rate": 3.384111796550422e-07, "logits/chosen": 0.8389550447463989, "logits/rejected": 2.6036429405212402, "logps/chosen": -315.84979248046875, "logps/rejected": -680.8143920898438, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 1.8412582874298096, "rewards/margins": 11.332327842712402, "rewards/rejected": -9.491069793701172, "step": 3450 }, { "epoch": 1.18, "learning_rate": 3.377816945738386e-07, "logits/chosen": 0.4181036353111267, "logits/rejected": 2.9160115718841553, "logps/chosen": -286.0548095703125, "logps/rejected": -531.5001220703125, "loss": 0.013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6251122951507568, "rewards/margins": 11.32032585144043, "rewards/rejected": -9.695211410522461, "step": 3460 }, { "epoch": 1.18, "learning_rate": 3.37152209492635e-07, "logits/chosen": 0.6207982301712036, "logits/rejected": 2.664804458618164, "logps/chosen": -356.7328186035156, "logps/rejected": -650.884033203125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 1.8970571756362915, "rewards/margins": 12.091269493103027, "rewards/rejected": -10.1942138671875, "step": 3470 }, { "epoch": 1.18, "learning_rate": 3.365227244114314e-07, "logits/chosen": 0.7093061804771423, "logits/rejected": 2.628840684890747, "logps/chosen": -305.00457763671875, "logps/rejected": -528.3023681640625, "loss": 0.0095, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1378726959228516, "rewards/margins": 11.653341293334961, "rewards/rejected": -9.515467643737793, "step": 3480 }, { "epoch": 1.19, "learning_rate": 3.3589323933022783e-07, "logits/chosen": 1.2648298740386963, "logits/rejected": 2.892852783203125, "logps/chosen": -398.9606018066406, "logps/rejected": -603.8753662109375, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4770350456237793, "rewards/margins": 12.576154708862305, "rewards/rejected": -11.099119186401367, "step": 3490 }, { "epoch": 1.19, "learning_rate": 3.3526375424902426e-07, "logits/chosen": 1.1422218084335327, "logits/rejected": 3.253868818283081, "logps/chosen": -365.7401428222656, "logps/rejected": -566.3377685546875, "loss": 0.0152, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4305555820465088, "rewards/margins": 11.32600212097168, "rewards/rejected": -9.895447731018066, "step": 3500 }, { "epoch": 1.19, "eval_logits/chosen": 0.7139882445335388, "eval_logits/rejected": 2.8217320442199707, "eval_logps/chosen": -368.63287353515625, "eval_logps/rejected": -604.761474609375, "eval_loss": 0.012398996390402317, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 2.0357046127319336, "eval_rewards/margins": 12.248809814453125, "eval_rewards/rejected": -10.213105201721191, "eval_runtime": 268.6396, "eval_samples_per_second": 35.363, "eval_steps_per_second": 1.106, "step": 3500 }, { "epoch": 1.19, "learning_rate": 3.3463426916782073e-07, "logits/chosen": 0.7617170214653015, "logits/rejected": 2.90086030960083, "logps/chosen": -347.78912353515625, "logps/rejected": -474.829345703125, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9028854370117188, "rewards/margins": 11.47684383392334, "rewards/rejected": -9.573958396911621, "step": 3510 }, { "epoch": 1.2, "learning_rate": 3.3400478408661716e-07, "logits/chosen": 1.6692126989364624, "logits/rejected": 3.0957980155944824, "logps/chosen": -336.6679382324219, "logps/rejected": -442.8111877441406, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 2.1993305683135986, "rewards/margins": 13.359231948852539, "rewards/rejected": -11.15990161895752, "step": 3520 }, { "epoch": 1.2, "learning_rate": 3.333752990054136e-07, "logits/chosen": 0.9762203097343445, "logits/rejected": 2.1863927841186523, "logps/chosen": -366.7749938964844, "logps/rejected": -685.2867431640625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 2.2956480979919434, "rewards/margins": 13.168296813964844, "rewards/rejected": -10.872648239135742, "step": 3530 }, { "epoch": 1.2, "learning_rate": 3.3274581392420995e-07, "logits/chosen": 0.9696139097213745, "logits/rejected": 3.0056960582733154, "logps/chosen": -297.73333740234375, "logps/rejected": -467.8060607910156, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 2.282499074935913, "rewards/margins": 12.94469165802002, "rewards/rejected": -10.662192344665527, "step": 3540 }, { "epoch": 1.21, "learning_rate": 3.3211632884300637e-07, "logits/chosen": 1.7081263065338135, "logits/rejected": 2.8718748092651367, "logps/chosen": -421.0556640625, "logps/rejected": -577.2882080078125, "loss": 0.006, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.122880458831787, "rewards/margins": 12.477874755859375, "rewards/rejected": -10.354994773864746, "step": 3550 }, { "epoch": 1.21, "learning_rate": 3.314868437618028e-07, "logits/chosen": 1.3022390604019165, "logits/rejected": 2.8691840171813965, "logps/chosen": -433.652587890625, "logps/rejected": -585.9044189453125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 2.045884609222412, "rewards/margins": 11.981613159179688, "rewards/rejected": -9.935728073120117, "step": 3560 }, { "epoch": 1.21, "learning_rate": 3.3085735868059927e-07, "logits/chosen": 0.563024640083313, "logits/rejected": 2.9883615970611572, "logps/chosen": -486.9850158691406, "logps/rejected": -485.02978515625, "loss": 0.0108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0081732273101807, "rewards/margins": 13.054903984069824, "rewards/rejected": -11.046730995178223, "step": 3570 }, { "epoch": 1.22, "learning_rate": 3.302278735993957e-07, "logits/chosen": 1.2381908893585205, "logits/rejected": 2.346295118331909, "logps/chosen": -450.12139892578125, "logps/rejected": -788.5457763671875, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 1.7062536478042603, "rewards/margins": 11.615732192993164, "rewards/rejected": -9.909477233886719, "step": 3580 }, { "epoch": 1.22, "learning_rate": 3.295983885181921e-07, "logits/chosen": 0.8633956909179688, "logits/rejected": 2.1898419857025146, "logps/chosen": -524.0609741210938, "logps/rejected": -731.7201538085938, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 2.227761745452881, "rewards/margins": 12.04895305633545, "rewards/rejected": -9.821191787719727, "step": 3590 }, { "epoch": 1.22, "learning_rate": 3.2896890343698854e-07, "logits/chosen": 0.4451712965965271, "logits/rejected": 2.969043254852295, "logps/chosen": -298.8716125488281, "logps/rejected": -563.7342529296875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 1.909551978111267, "rewards/margins": 11.801454544067383, "rewards/rejected": -9.891901016235352, "step": 3600 }, { "epoch": 1.22, "eval_logits/chosen": 0.6516625881195068, "eval_logits/rejected": 2.758915424346924, "eval_logps/chosen": -368.8428039550781, "eval_logps/rejected": -611.873291015625, "eval_loss": 0.011604116298258305, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 2.0147087574005127, "eval_rewards/margins": 12.93900203704834, "eval_rewards/rejected": -10.924293518066406, "eval_runtime": 268.5101, "eval_samples_per_second": 35.38, "eval_steps_per_second": 1.106, "step": 3600 }, { "epoch": 1.23, "learning_rate": 3.2833941835578496e-07, "logits/chosen": 0.9128265380859375, "logits/rejected": 2.6559739112854004, "logps/chosen": -323.16656494140625, "logps/rejected": -685.8175659179688, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 2.358858108520508, "rewards/margins": 15.383926391601562, "rewards/rejected": -13.025070190429688, "step": 3610 }, { "epoch": 1.23, "learning_rate": 3.2770993327458133e-07, "logits/chosen": 1.0553185939788818, "logits/rejected": 3.042325973510742, "logps/chosen": -379.9535827636719, "logps/rejected": -480.6560974121094, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.9295809268951416, "rewards/margins": 15.311012268066406, "rewards/rejected": -13.381433486938477, "step": 3620 }, { "epoch": 1.23, "learning_rate": 3.270804481933778e-07, "logits/chosen": 1.2228381633758545, "logits/rejected": 2.805607557296753, "logps/chosen": -317.10711669921875, "logps/rejected": -619.1337280273438, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 2.014592409133911, "rewards/margins": 12.018266677856445, "rewards/rejected": -10.003674507141113, "step": 3630 }, { "epoch": 1.24, "learning_rate": 3.2645096311217423e-07, "logits/chosen": 1.6097183227539062, "logits/rejected": 3.0599427223205566, "logps/chosen": -386.4832763671875, "logps/rejected": -413.746826171875, "loss": 0.0108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8478138446807861, "rewards/margins": 11.943216323852539, "rewards/rejected": -10.095402717590332, "step": 3640 }, { "epoch": 1.24, "learning_rate": 3.2582147803097065e-07, "logits/chosen": 0.20550115406513214, "logits/rejected": 2.465472459793091, "logps/chosen": -335.13824462890625, "logps/rejected": -604.3788452148438, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 2.0051052570343018, "rewards/margins": 12.726202011108398, "rewards/rejected": -10.721096992492676, "step": 3650 }, { "epoch": 1.24, "learning_rate": 3.251919929497671e-07, "logits/chosen": 1.2652103900909424, "logits/rejected": 2.926816463470459, "logps/chosen": -407.6763610839844, "logps/rejected": -495.2499084472656, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.9935003519058228, "rewards/margins": 12.497861862182617, "rewards/rejected": -10.504361152648926, "step": 3660 }, { "epoch": 1.25, "learning_rate": 3.245625078685635e-07, "logits/chosen": 1.3886959552764893, "logits/rejected": 2.630584478378296, "logps/chosen": -481.4646911621094, "logps/rejected": -748.5326538085938, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 1.8012968301773071, "rewards/margins": 12.587298393249512, "rewards/rejected": -10.786002159118652, "step": 3670 }, { "epoch": 1.25, "learning_rate": 3.239330227873599e-07, "logits/chosen": 0.5694072842597961, "logits/rejected": 2.876537322998047, "logps/chosen": -342.1631774902344, "logps/rejected": -513.7351684570312, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 2.246203899383545, "rewards/margins": 13.117487907409668, "rewards/rejected": -10.871283531188965, "step": 3680 }, { "epoch": 1.25, "learning_rate": 3.233035377061564e-07, "logits/chosen": 1.219588041305542, "logits/rejected": 2.5020108222961426, "logps/chosen": -361.3206787109375, "logps/rejected": -556.8141479492188, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 1.6606366634368896, "rewards/margins": 12.658553123474121, "rewards/rejected": -10.997916221618652, "step": 3690 }, { "epoch": 1.26, "learning_rate": 3.2267405262495277e-07, "logits/chosen": 2.0143651962280273, "logits/rejected": 2.8961198329925537, "logps/chosen": -376.0616149902344, "logps/rejected": -571.3860473632812, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 1.6621707677841187, "rewards/margins": 12.810911178588867, "rewards/rejected": -11.148740768432617, "step": 3700 }, { "epoch": 1.26, "eval_logits/chosen": 0.70637047290802, "eval_logits/rejected": 2.8017258644104004, "eval_logps/chosen": -369.4627685546875, "eval_logps/rejected": -611.279541015625, "eval_loss": 0.011604195460677147, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 1.9527100324630737, "eval_rewards/margins": 12.817631721496582, "eval_rewards/rejected": -10.864921569824219, "eval_runtime": 268.6034, "eval_samples_per_second": 35.368, "eval_steps_per_second": 1.106, "step": 3700 }, { "epoch": 1.26, "learning_rate": 3.220445675437492e-07, "logits/chosen": 1.3589471578598022, "logits/rejected": 2.3055531978607178, "logps/chosen": -345.74774169921875, "logps/rejected": -734.9967041015625, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 1.7902826070785522, "rewards/margins": 13.758868217468262, "rewards/rejected": -11.968585968017578, "step": 3710 }, { "epoch": 1.26, "learning_rate": 3.214150824625456e-07, "logits/chosen": 0.9244592785835266, "logits/rejected": 2.705847978591919, "logps/chosen": -314.8524475097656, "logps/rejected": -643.5977783203125, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 1.7707325220108032, "rewards/margins": 12.588135719299316, "rewards/rejected": -10.817402839660645, "step": 3720 }, { "epoch": 1.27, "learning_rate": 3.2078559738134203e-07, "logits/chosen": 1.0180917978286743, "logits/rejected": 2.3311429023742676, "logps/chosen": -413.64862060546875, "logps/rejected": -681.5325927734375, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 1.2779762744903564, "rewards/margins": 13.67229175567627, "rewards/rejected": -12.394315719604492, "step": 3730 }, { "epoch": 1.27, "learning_rate": 3.2015611230013846e-07, "logits/chosen": 1.3414030075073242, "logits/rejected": 2.343313217163086, "logps/chosen": -337.6605529785156, "logps/rejected": -741.5252075195312, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 2.429905891418457, "rewards/margins": 12.461353302001953, "rewards/rejected": -10.031448364257812, "step": 3740 }, { "epoch": 1.27, "learning_rate": 3.1952662721893493e-07, "logits/chosen": 0.8920739889144897, "logits/rejected": 2.75960111618042, "logps/chosen": -292.0826416015625, "logps/rejected": -628.2627563476562, "loss": 0.0137, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9123607873916626, "rewards/margins": 12.0170316696167, "rewards/rejected": -10.104671478271484, "step": 3750 }, { "epoch": 1.28, "learning_rate": 3.1889714213773135e-07, "logits/chosen": 1.1719636917114258, "logits/rejected": 2.649860382080078, "logps/chosen": -366.6575927734375, "logps/rejected": -607.0701293945312, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.313616991043091, "rewards/margins": 12.286029815673828, "rewards/rejected": -9.972412109375, "step": 3760 }, { "epoch": 1.28, "learning_rate": 3.182676570565277e-07, "logits/chosen": 0.8673251867294312, "logits/rejected": 2.7998039722442627, "logps/chosen": -315.17510986328125, "logps/rejected": -605.2402954101562, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 1.8954683542251587, "rewards/margins": 14.503629684448242, "rewards/rejected": -12.608160018920898, "step": 3770 }, { "epoch": 1.28, "learning_rate": 3.1763817197532415e-07, "logits/chosen": 1.0487251281738281, "logits/rejected": 2.9428367614746094, "logps/chosen": -364.67401123046875, "logps/rejected": -538.7274169921875, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 2.0998024940490723, "rewards/margins": 13.77617359161377, "rewards/rejected": -11.676373481750488, "step": 3780 }, { "epoch": 1.29, "learning_rate": 3.1700868689412057e-07, "logits/chosen": 1.02610182762146, "logits/rejected": 2.5808565616607666, "logps/chosen": -461.45526123046875, "logps/rejected": -686.8546142578125, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.335012912750244, "rewards/margins": 14.393170356750488, "rewards/rejected": -12.05815601348877, "step": 3790 }, { "epoch": 1.29, "learning_rate": 3.16379201812917e-07, "logits/chosen": 0.750059962272644, "logits/rejected": 2.3385863304138184, "logps/chosen": -291.5013732910156, "logps/rejected": -727.7017822265625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.783456563949585, "rewards/margins": 12.742868423461914, "rewards/rejected": -10.959412574768066, "step": 3800 }, { "epoch": 1.29, "eval_logits/chosen": 0.6878785490989685, "eval_logits/rejected": 2.7622649669647217, "eval_logps/chosen": -371.6280517578125, "eval_logps/rejected": -618.2280883789062, "eval_loss": 0.011163265444338322, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 1.7361834049224854, "eval_rewards/margins": 13.295957565307617, "eval_rewards/rejected": -11.559774398803711, "eval_runtime": 268.3977, "eval_samples_per_second": 35.395, "eval_steps_per_second": 1.107, "step": 3800 }, { "epoch": 1.3, "learning_rate": 3.1574971673171347e-07, "logits/chosen": 0.9575430750846863, "logits/rejected": 2.964996814727783, "logps/chosen": -424.96856689453125, "logps/rejected": -482.3935546875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 1.4636036157608032, "rewards/margins": 14.330609321594238, "rewards/rejected": -12.867006301879883, "step": 3810 }, { "epoch": 1.3, "learning_rate": 3.151202316505099e-07, "logits/chosen": 0.723203182220459, "logits/rejected": 2.6531193256378174, "logps/chosen": -339.8141784667969, "logps/rejected": -731.8880615234375, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 2.1962404251098633, "rewards/margins": 15.106730461120605, "rewards/rejected": -12.910490036010742, "step": 3820 }, { "epoch": 1.3, "learning_rate": 3.144907465693063e-07, "logits/chosen": 0.9935673475265503, "logits/rejected": 2.505452871322632, "logps/chosen": -361.5821838378906, "logps/rejected": -588.7732543945312, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 1.2631385326385498, "rewards/margins": 11.31373405456543, "rewards/rejected": -10.050596237182617, "step": 3830 }, { "epoch": 1.31, "learning_rate": 3.1386126148810274e-07, "logits/chosen": 1.2363111972808838, "logits/rejected": 3.0512309074401855, "logps/chosen": -489.4908752441406, "logps/rejected": -554.0120239257812, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 1.5856379270553589, "rewards/margins": 13.790130615234375, "rewards/rejected": -12.204492568969727, "step": 3840 }, { "epoch": 1.31, "learning_rate": 3.132317764068991e-07, "logits/chosen": 1.086814045906067, "logits/rejected": 2.504542112350464, "logps/chosen": -333.9427185058594, "logps/rejected": -730.4444580078125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 1.555995225906372, "rewards/margins": 12.766497611999512, "rewards/rejected": -11.210501670837402, "step": 3850 }, { "epoch": 1.31, "learning_rate": 3.1260229132569553e-07, "logits/chosen": 1.3077691793441772, "logits/rejected": 2.9703030586242676, "logps/chosen": -427.02716064453125, "logps/rejected": -514.3189697265625, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 1.2280203104019165, "rewards/margins": 14.560205459594727, "rewards/rejected": -13.332185745239258, "step": 3860 }, { "epoch": 1.32, "learning_rate": 3.11972806244492e-07, "logits/chosen": 1.1198952198028564, "logits/rejected": 2.6473917961120605, "logps/chosen": -377.2010498046875, "logps/rejected": -621.55322265625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.3931939601898193, "rewards/margins": 12.233184814453125, "rewards/rejected": -10.839990615844727, "step": 3870 }, { "epoch": 1.32, "learning_rate": 3.1134332116328843e-07, "logits/chosen": 0.918321430683136, "logits/rejected": 2.5897388458251953, "logps/chosen": -374.2272033691406, "logps/rejected": -711.2734375, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 1.9573631286621094, "rewards/margins": 13.664125442504883, "rewards/rejected": -11.70676040649414, "step": 3880 }, { "epoch": 1.32, "learning_rate": 3.1071383608208485e-07, "logits/chosen": 1.1157448291778564, "logits/rejected": 2.8496346473693848, "logps/chosen": -408.95294189453125, "logps/rejected": -447.25384521484375, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 1.7573410272598267, "rewards/margins": 14.136436462402344, "rewards/rejected": -12.379095077514648, "step": 3890 }, { "epoch": 1.33, "learning_rate": 3.1008435100088127e-07, "logits/chosen": 1.2130072116851807, "logits/rejected": 2.2291102409362793, "logps/chosen": -332.9510192871094, "logps/rejected": -833.26025390625, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7235133647918701, "rewards/margins": 15.338064193725586, "rewards/rejected": -13.614550590515137, "step": 3900 }, { "epoch": 1.33, "eval_logits/chosen": 0.672809898853302, "eval_logits/rejected": 2.7616159915924072, "eval_logps/chosen": -370.6764831542969, "eval_logps/rejected": -616.296875, "eval_loss": 0.01061132363975048, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8313409090042114, "eval_rewards/margins": 13.197998046875, "eval_rewards/rejected": -11.366658210754395, "eval_runtime": 267.1258, "eval_samples_per_second": 35.564, "eval_steps_per_second": 1.112, "step": 3900 }, { "epoch": 1.33, "learning_rate": 3.094548659196777e-07, "logits/chosen": 1.6785743236541748, "logits/rejected": 2.301844358444214, "logps/chosen": -524.2055053710938, "logps/rejected": -739.4368286132812, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 1.7645349502563477, "rewards/margins": 12.248418807983398, "rewards/rejected": -10.483884811401367, "step": 3910 }, { "epoch": 1.33, "learning_rate": 3.0882538083847407e-07, "logits/chosen": 1.4525395631790161, "logits/rejected": 2.9662253856658936, "logps/chosen": -405.4144592285156, "logps/rejected": -511.0747985839844, "loss": 0.0056, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7320349216461182, "rewards/margins": 12.92414665222168, "rewards/rejected": -11.192111015319824, "step": 3920 }, { "epoch": 1.34, "learning_rate": 3.0819589575727054e-07, "logits/chosen": 1.3361444473266602, "logits/rejected": 2.703052520751953, "logps/chosen": -534.5265502929688, "logps/rejected": -433.0003967285156, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5415256023406982, "rewards/margins": 12.53891372680664, "rewards/rejected": -10.99738883972168, "step": 3930 }, { "epoch": 1.34, "learning_rate": 3.0756641067606696e-07, "logits/chosen": 1.4039337635040283, "logits/rejected": 2.760995388031006, "logps/chosen": -337.70904541015625, "logps/rejected": -551.9937744140625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 1.6379754543304443, "rewards/margins": 12.422950744628906, "rewards/rejected": -10.7849760055542, "step": 3940 }, { "epoch": 1.34, "learning_rate": 3.069369255948634e-07, "logits/chosen": 1.0360331535339355, "logits/rejected": 2.4552958011627197, "logps/chosen": -329.26422119140625, "logps/rejected": -655.5435791015625, "loss": 0.0113, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4171547889709473, "rewards/margins": 15.259183883666992, "rewards/rejected": -12.84202766418457, "step": 3950 }, { "epoch": 1.35, "learning_rate": 3.063074405136598e-07, "logits/chosen": 0.8999547958374023, "logits/rejected": 2.6917226314544678, "logps/chosen": -338.5829162597656, "logps/rejected": -592.2154541015625, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 2.708500385284424, "rewards/margins": 14.965312004089355, "rewards/rejected": -12.256811141967773, "step": 3960 }, { "epoch": 1.35, "learning_rate": 3.0567795543245623e-07, "logits/chosen": 0.8781677484512329, "logits/rejected": 2.3897573947906494, "logps/chosen": -380.5794372558594, "logps/rejected": -688.5714721679688, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 2.071242094039917, "rewards/margins": 14.585981369018555, "rewards/rejected": -12.514738082885742, "step": 3970 }, { "epoch": 1.35, "learning_rate": 3.0504847035125266e-07, "logits/chosen": 1.2194125652313232, "logits/rejected": 2.6145756244659424, "logps/chosen": -408.1900939941406, "logps/rejected": -582.1531372070312, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 2.0091769695281982, "rewards/margins": 13.346948623657227, "rewards/rejected": -11.337770462036133, "step": 3980 }, { "epoch": 1.36, "learning_rate": 3.0441898527004913e-07, "logits/chosen": 1.122051477432251, "logits/rejected": 2.9787373542785645, "logps/chosen": -309.53192138671875, "logps/rejected": -560.3518676757812, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.181328296661377, "rewards/margins": 13.925869941711426, "rewards/rejected": -11.74454116821289, "step": 3990 }, { "epoch": 1.36, "learning_rate": 3.037895001888455e-07, "logits/chosen": 0.9208853840827942, "logits/rejected": 2.722747325897217, "logps/chosen": -386.92669677734375, "logps/rejected": -594.4363403320312, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.3495748043060303, "rewards/margins": 13.152547836303711, "rewards/rejected": -10.802971839904785, "step": 4000 }, { "epoch": 1.36, "eval_logits/chosen": 0.6694169640541077, "eval_logits/rejected": 2.7534499168395996, "eval_logps/chosen": -369.82958984375, "eval_logps/rejected": -618.1146850585938, "eval_loss": 0.010111239738762379, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.9160256385803223, "eval_rewards/margins": 13.464466094970703, "eval_rewards/rejected": -11.548439979553223, "eval_runtime": 267.2091, "eval_samples_per_second": 35.553, "eval_steps_per_second": 1.111, "step": 4000 }, { "epoch": 1.36, "learning_rate": 3.031600151076419e-07, "logits/chosen": 1.105791449546814, "logits/rejected": 2.6624977588653564, "logps/chosen": -329.08905029296875, "logps/rejected": -635.0185546875, "loss": 0.0111, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.752402901649475, "rewards/margins": 14.056722640991211, "rewards/rejected": -12.304319381713867, "step": 4010 }, { "epoch": 1.37, "learning_rate": 3.0253053002643835e-07, "logits/chosen": 1.1525877714157104, "logits/rejected": 2.8269333839416504, "logps/chosen": -328.6435546875, "logps/rejected": -626.5671997070312, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.0284876823425293, "rewards/margins": 15.253156661987305, "rewards/rejected": -13.224668502807617, "step": 4020 }, { "epoch": 1.37, "learning_rate": 3.0190104494523477e-07, "logits/chosen": 1.3486201763153076, "logits/rejected": 2.1660232543945312, "logps/chosen": -320.6233215332031, "logps/rejected": -756.1076049804688, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 1.2695963382720947, "rewards/margins": 11.98314380645752, "rewards/rejected": -10.71354866027832, "step": 4030 }, { "epoch": 1.37, "learning_rate": 3.012715598640312e-07, "logits/chosen": 1.2375625371932983, "logits/rejected": 2.5896694660186768, "logps/chosen": -448.31658935546875, "logps/rejected": -623.9146728515625, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 2.4820594787597656, "rewards/margins": 15.964773178100586, "rewards/rejected": -13.482714653015137, "step": 4040 }, { "epoch": 1.38, "learning_rate": 3.0064207478282767e-07, "logits/chosen": 1.529763102531433, "logits/rejected": 2.7555646896362305, "logps/chosen": -449.84686279296875, "logps/rejected": -620.9822998046875, "loss": 0.0103, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5289344787597656, "rewards/margins": 13.241473197937012, "rewards/rejected": -11.712538719177246, "step": 4050 }, { "epoch": 1.38, "learning_rate": 3.000125897016241e-07, "logits/chosen": 1.3951947689056396, "logits/rejected": 2.7864015102386475, "logps/chosen": -348.7859802246094, "logps/rejected": -626.5861206054688, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.3628493547439575, "rewards/margins": 13.327616691589355, "rewards/rejected": -11.964765548706055, "step": 4060 }, { "epoch": 1.38, "learning_rate": 2.993831046204205e-07, "logits/chosen": 0.9522676467895508, "logits/rejected": 2.8154313564300537, "logps/chosen": -476.47412109375, "logps/rejected": -513.9017333984375, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 1.842169165611267, "rewards/margins": 13.47119426727295, "rewards/rejected": -11.62902545928955, "step": 4070 }, { "epoch": 1.39, "learning_rate": 2.987536195392169e-07, "logits/chosen": 1.5384814739227295, "logits/rejected": 2.9053680896759033, "logps/chosen": -417.6114807128906, "logps/rejected": -534.3853759765625, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 2.0078911781311035, "rewards/margins": 14.087924003601074, "rewards/rejected": -12.080032348632812, "step": 4080 }, { "epoch": 1.39, "learning_rate": 2.981241344580133e-07, "logits/chosen": 1.3096532821655273, "logits/rejected": 2.828274965286255, "logps/chosen": -393.8643493652344, "logps/rejected": -567.3588256835938, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 2.1737160682678223, "rewards/margins": 13.32548999786377, "rewards/rejected": -11.151773452758789, "step": 4090 }, { "epoch": 1.39, "learning_rate": 2.9749464937680973e-07, "logits/chosen": 1.554233193397522, "logits/rejected": 2.9938759803771973, "logps/chosen": -345.17071533203125, "logps/rejected": -388.05328369140625, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 2.1033272743225098, "rewards/margins": 13.004033088684082, "rewards/rejected": -10.900705337524414, "step": 4100 }, { "epoch": 1.39, "eval_logits/chosen": 0.6617211103439331, "eval_logits/rejected": 2.755298376083374, "eval_logps/chosen": -370.09149169921875, "eval_logps/rejected": -615.8171997070312, "eval_loss": 0.009758265689015388, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8898385763168335, "eval_rewards/margins": 13.208524703979492, "eval_rewards/rejected": -11.318686485290527, "eval_runtime": 267.8618, "eval_samples_per_second": 35.466, "eval_steps_per_second": 1.109, "step": 4100 }, { "epoch": 1.4, "learning_rate": 2.968651642956062e-07, "logits/chosen": 0.6344437003135681, "logits/rejected": 2.64906644821167, "logps/chosen": -306.6768798828125, "logps/rejected": -594.3409423828125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 1.623979926109314, "rewards/margins": 12.556502342224121, "rewards/rejected": -10.932523727416992, "step": 4110 }, { "epoch": 1.4, "learning_rate": 2.9623567921440263e-07, "logits/chosen": 0.9452501535415649, "logits/rejected": 2.6657590866088867, "logps/chosen": -320.43115234375, "logps/rejected": -478.85504150390625, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 1.9362001419067383, "rewards/margins": 12.062414169311523, "rewards/rejected": -10.126212120056152, "step": 4120 }, { "epoch": 1.4, "learning_rate": 2.9560619413319905e-07, "logits/chosen": 1.2213366031646729, "logits/rejected": 2.3294577598571777, "logps/chosen": -538.0032958984375, "logps/rejected": -738.4307861328125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 1.5684618949890137, "rewards/margins": 13.005151748657227, "rewards/rejected": -11.436688423156738, "step": 4130 }, { "epoch": 1.41, "learning_rate": 2.9497670905199547e-07, "logits/chosen": 1.44203782081604, "logits/rejected": 2.5604937076568604, "logps/chosen": -520.1005249023438, "logps/rejected": -640.8972778320312, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 2.205566644668579, "rewards/margins": 13.87377643585205, "rewards/rejected": -11.66821002960205, "step": 4140 }, { "epoch": 1.41, "learning_rate": 2.9434722397079184e-07, "logits/chosen": 1.284224271774292, "logits/rejected": 2.3379929065704346, "logps/chosen": -426.13909912109375, "logps/rejected": -782.0924072265625, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 1.9227638244628906, "rewards/margins": 13.643468856811523, "rewards/rejected": -11.72070598602295, "step": 4150 }, { "epoch": 1.41, "learning_rate": 2.9371773888958827e-07, "logits/chosen": 1.3185603618621826, "logits/rejected": 2.7050728797912598, "logps/chosen": -348.4053649902344, "logps/rejected": -583.4497680664062, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 2.2634260654449463, "rewards/margins": 14.19212818145752, "rewards/rejected": -11.92870044708252, "step": 4160 }, { "epoch": 1.42, "learning_rate": 2.9308825380838474e-07, "logits/chosen": 1.6224581003189087, "logits/rejected": 2.830731153488159, "logps/chosen": -413.7862854003906, "logps/rejected": -518.830322265625, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 2.2098381519317627, "rewards/margins": 14.230868339538574, "rewards/rejected": -12.021029472351074, "step": 4170 }, { "epoch": 1.42, "learning_rate": 2.9245876872718116e-07, "logits/chosen": 1.1401679515838623, "logits/rejected": 2.6897270679473877, "logps/chosen": -365.6610107421875, "logps/rejected": -596.2195434570312, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 2.139878749847412, "rewards/margins": 13.742103576660156, "rewards/rejected": -11.602225303649902, "step": 4180 }, { "epoch": 1.42, "learning_rate": 2.918292836459776e-07, "logits/chosen": 0.6644213795661926, "logits/rejected": 1.8898484706878662, "logps/chosen": -343.75909423828125, "logps/rejected": -814.2130737304688, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 2.0662014484405518, "rewards/margins": 13.086583137512207, "rewards/rejected": -11.020383834838867, "step": 4190 }, { "epoch": 1.43, "learning_rate": 2.91199798564774e-07, "logits/chosen": 0.5162476301193237, "logits/rejected": 2.710484266281128, "logps/chosen": -294.71026611328125, "logps/rejected": -589.5906982421875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.821197509765625, "rewards/margins": 13.46678352355957, "rewards/rejected": -11.645587921142578, "step": 4200 }, { "epoch": 1.43, "eval_logits/chosen": 0.6265316009521484, "eval_logits/rejected": 2.7234370708465576, "eval_logps/chosen": -368.2688903808594, "eval_logps/rejected": -619.3782348632812, "eval_loss": 0.009091639891266823, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 2.072103977203369, "eval_rewards/margins": 13.746898651123047, "eval_rewards/rejected": -11.674796104431152, "eval_runtime": 268.1863, "eval_samples_per_second": 35.423, "eval_steps_per_second": 1.107, "step": 4200 }, { "epoch": 1.43, "learning_rate": 2.9057031348357043e-07, "logits/chosen": 0.5974520444869995, "logits/rejected": 2.310032844543457, "logps/chosen": -325.3004150390625, "logps/rejected": -749.6110229492188, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 2.4108071327209473, "rewards/margins": 14.243759155273438, "rewards/rejected": -11.832952499389648, "step": 4210 }, { "epoch": 1.43, "learning_rate": 2.8994082840236686e-07, "logits/chosen": 0.5631410479545593, "logits/rejected": 2.089045286178589, "logps/chosen": -426.6354064941406, "logps/rejected": -681.105712890625, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.776482343673706, "rewards/margins": 13.32958698272705, "rewards/rejected": -11.553104400634766, "step": 4220 }, { "epoch": 1.44, "learning_rate": 2.893113433211632e-07, "logits/chosen": 0.693230926990509, "logits/rejected": 2.8969295024871826, "logps/chosen": -341.26263427734375, "logps/rejected": -527.8047485351562, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 2.348053216934204, "rewards/margins": 13.852984428405762, "rewards/rejected": -11.504932403564453, "step": 4230 }, { "epoch": 1.44, "learning_rate": 2.886818582399597e-07, "logits/chosen": 1.3099323511123657, "logits/rejected": 2.9541726112365723, "logps/chosen": -371.8271179199219, "logps/rejected": -486.86260986328125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 1.005937099456787, "rewards/margins": 12.468096733093262, "rewards/rejected": -11.462160110473633, "step": 4240 }, { "epoch": 1.44, "learning_rate": 2.880523731587561e-07, "logits/chosen": 0.6574260592460632, "logits/rejected": 1.5284206867218018, "logps/chosen": -478.7499084472656, "logps/rejected": -949.9332275390625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 1.561853051185608, "rewards/margins": 12.824320793151855, "rewards/rejected": -11.262468338012695, "step": 4250 }, { "epoch": 1.45, "learning_rate": 2.8742288807755255e-07, "logits/chosen": 0.3190085291862488, "logits/rejected": 2.2693285942077637, "logps/chosen": -366.81036376953125, "logps/rejected": -740.80322265625, "loss": 0.0072, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0779354572296143, "rewards/margins": 12.947741508483887, "rewards/rejected": -10.869806289672852, "step": 4260 }, { "epoch": 1.45, "learning_rate": 2.8679340299634897e-07, "logits/chosen": 0.3005984425544739, "logits/rejected": 1.9869167804718018, "logps/chosen": -310.21075439453125, "logps/rejected": -915.5408325195312, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 2.2902274131774902, "rewards/margins": 12.991412162780762, "rewards/rejected": -10.701186180114746, "step": 4270 }, { "epoch": 1.45, "learning_rate": 2.861639179151454e-07, "logits/chosen": 1.2294288873672485, "logits/rejected": 2.8581976890563965, "logps/chosen": -345.51202392578125, "logps/rejected": -665.0847778320312, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.7379974126815796, "rewards/margins": 14.367947578430176, "rewards/rejected": -12.629948616027832, "step": 4280 }, { "epoch": 1.46, "learning_rate": 2.855344328339418e-07, "logits/chosen": 1.3199851512908936, "logits/rejected": 2.684046745300293, "logps/chosen": -319.9084777832031, "logps/rejected": -645.8856201171875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 2.1213021278381348, "rewards/margins": 14.135685920715332, "rewards/rejected": -12.014384269714355, "step": 4290 }, { "epoch": 1.46, "learning_rate": 2.849049477527383e-07, "logits/chosen": 1.4351789951324463, "logits/rejected": 3.1019484996795654, "logps/chosen": -329.2397155761719, "logps/rejected": -460.0499572753906, "loss": 0.006, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.255934715270996, "rewards/margins": 14.838006973266602, "rewards/rejected": -12.582071304321289, "step": 4300 }, { "epoch": 1.46, "eval_logits/chosen": 0.6225088834762573, "eval_logits/rejected": 2.705798864364624, "eval_logps/chosen": -370.5738525390625, "eval_logps/rejected": -624.5148315429688, "eval_loss": 0.008837219327688217, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.841599941253662, "eval_rewards/margins": 14.030046463012695, "eval_rewards/rejected": -12.188445091247559, "eval_runtime": 268.1565, "eval_samples_per_second": 35.427, "eval_steps_per_second": 1.108, "step": 4300 }, { "epoch": 1.46, "learning_rate": 2.8427546267153466e-07, "logits/chosen": 0.8465960621833801, "logits/rejected": 2.358778476715088, "logps/chosen": -333.47857666015625, "logps/rejected": -711.8942260742188, "loss": 0.0124, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4469887018203735, "rewards/margins": 14.328710556030273, "rewards/rejected": -12.881719589233398, "step": 4310 }, { "epoch": 1.47, "learning_rate": 2.836459775903311e-07, "logits/chosen": 0.7382937669754028, "logits/rejected": 2.9191064834594727, "logps/chosen": -427.72296142578125, "logps/rejected": -541.3267822265625, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 1.5231516361236572, "rewards/margins": 13.919703483581543, "rewards/rejected": -12.396551132202148, "step": 4320 }, { "epoch": 1.47, "learning_rate": 2.830164925091275e-07, "logits/chosen": 1.1782336235046387, "logits/rejected": 2.832035541534424, "logps/chosen": -327.30865478515625, "logps/rejected": -554.0316162109375, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 1.5803128480911255, "rewards/margins": 13.803190231323242, "rewards/rejected": -12.222879409790039, "step": 4330 }, { "epoch": 1.48, "learning_rate": 2.8238700742792393e-07, "logits/chosen": 0.7898053526878357, "logits/rejected": 3.143005847930908, "logps/chosen": -373.89959716796875, "logps/rejected": -453.019775390625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 1.9527397155761719, "rewards/margins": 14.160921096801758, "rewards/rejected": -12.208181381225586, "step": 4340 }, { "epoch": 1.48, "learning_rate": 2.8175752234672035e-07, "logits/chosen": 1.5171926021575928, "logits/rejected": 2.6992270946502686, "logps/chosen": -432.3185119628906, "logps/rejected": -558.4224243164062, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 2.192749261856079, "rewards/margins": 12.624958992004395, "rewards/rejected": -10.432210922241211, "step": 4350 }, { "epoch": 1.48, "learning_rate": 2.8112803726551683e-07, "logits/chosen": 1.8292335271835327, "logits/rejected": 3.312920093536377, "logps/chosen": -466.8578186035156, "logps/rejected": -418.66680908203125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.5297528505325317, "rewards/margins": 10.969106674194336, "rewards/rejected": -9.439353942871094, "step": 4360 }, { "epoch": 1.49, "learning_rate": 2.8049855218431325e-07, "logits/chosen": 1.0215450525283813, "logits/rejected": 2.6072983741760254, "logps/chosen": -337.31915283203125, "logps/rejected": -533.4041748046875, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 2.3369078636169434, "rewards/margins": 12.58057689666748, "rewards/rejected": -10.243669509887695, "step": 4370 }, { "epoch": 1.49, "learning_rate": 2.7986906710310967e-07, "logits/chosen": 1.2219336032867432, "logits/rejected": 1.8741023540496826, "logps/chosen": -400.1982727050781, "logps/rejected": -858.0445556640625, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 2.0328240394592285, "rewards/margins": 14.159416198730469, "rewards/rejected": -12.126591682434082, "step": 4380 }, { "epoch": 1.49, "learning_rate": 2.7923958202190604e-07, "logits/chosen": 0.6931953430175781, "logits/rejected": 2.141230344772339, "logps/chosen": -295.72308349609375, "logps/rejected": -860.4769287109375, "loss": 0.0097, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6996008157730103, "rewards/margins": 14.573939323425293, "rewards/rejected": -12.87433910369873, "step": 4390 }, { "epoch": 1.5, "learning_rate": 2.7861009694070247e-07, "logits/chosen": 1.1773998737335205, "logits/rejected": 3.001295566558838, "logps/chosen": -404.4131774902344, "logps/rejected": -484.2671813964844, "loss": 0.0071, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9020360708236694, "rewards/margins": 13.961771011352539, "rewards/rejected": -12.059735298156738, "step": 4400 }, { "epoch": 1.5, "eval_logits/chosen": 0.6230810880661011, "eval_logits/rejected": 2.71238374710083, "eval_logps/chosen": -368.8385925292969, "eval_logps/rejected": -620.0233154296875, "eval_loss": 0.008326222188770771, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 2.0151307582855225, "eval_rewards/margins": 13.754433631896973, "eval_rewards/rejected": -11.739301681518555, "eval_runtime": 268.8898, "eval_samples_per_second": 35.33, "eval_steps_per_second": 1.105, "step": 4400 }, { "epoch": 1.5, "learning_rate": 2.779806118594989e-07, "logits/chosen": 0.8335103988647461, "logits/rejected": 2.1839518547058105, "logps/chosen": -392.66632080078125, "logps/rejected": -686.5694580078125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 2.3597278594970703, "rewards/margins": 13.914251327514648, "rewards/rejected": -11.554522514343262, "step": 4410 }, { "epoch": 1.5, "learning_rate": 2.7735112677829536e-07, "logits/chosen": 0.4120512902736664, "logits/rejected": 2.686591625213623, "logps/chosen": -308.36334228515625, "logps/rejected": -576.0289306640625, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 2.1544272899627686, "rewards/margins": 13.063441276550293, "rewards/rejected": -10.909013748168945, "step": 4420 }, { "epoch": 1.51, "learning_rate": 2.767216416970918e-07, "logits/chosen": 0.680091917514801, "logits/rejected": 2.9787096977233887, "logps/chosen": -311.6202087402344, "logps/rejected": -484.0185546875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 1.9193477630615234, "rewards/margins": 14.170591354370117, "rewards/rejected": -12.251241683959961, "step": 4430 }, { "epoch": 1.51, "learning_rate": 2.760921566158882e-07, "logits/chosen": 0.24553918838500977, "logits/rejected": 2.527836322784424, "logps/chosen": -356.7384948730469, "logps/rejected": -613.1387939453125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 2.243509531021118, "rewards/margins": 12.485934257507324, "rewards/rejected": -10.242424964904785, "step": 4440 }, { "epoch": 1.51, "learning_rate": 2.7546267153468463e-07, "logits/chosen": 1.4051676988601685, "logits/rejected": 2.4445412158966064, "logps/chosen": -537.8873291015625, "logps/rejected": -562.5985107421875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.1561007499694824, "rewards/margins": 13.01380729675293, "rewards/rejected": -10.857707023620605, "step": 4450 }, { "epoch": 1.52, "learning_rate": 2.74833186453481e-07, "logits/chosen": 1.222497582435608, "logits/rejected": 2.6060967445373535, "logps/chosen": -446.00848388671875, "logps/rejected": -619.6156616210938, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 2.138929843902588, "rewards/margins": 13.503074645996094, "rewards/rejected": -11.364145278930664, "step": 4460 }, { "epoch": 1.52, "learning_rate": 2.742037013722774e-07, "logits/chosen": 1.1242297887802124, "logits/rejected": 2.2714600563049316, "logps/chosen": -385.5606994628906, "logps/rejected": -600.1583251953125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 2.1965062618255615, "rewards/margins": 13.274136543273926, "rewards/rejected": -11.077630996704102, "step": 4470 }, { "epoch": 1.52, "learning_rate": 2.735742162910739e-07, "logits/chosen": 1.6765056848526, "logits/rejected": 3.165526866912842, "logps/chosen": -424.5562438964844, "logps/rejected": -477.0459899902344, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 2.1084578037261963, "rewards/margins": 13.339756965637207, "rewards/rejected": -11.23129940032959, "step": 4480 }, { "epoch": 1.53, "learning_rate": 2.729447312098703e-07, "logits/chosen": 0.5628236532211304, "logits/rejected": 2.7374966144561768, "logps/chosen": -480.1666564941406, "logps/rejected": -688.9700317382812, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 2.3280537128448486, "rewards/margins": 13.522354125976562, "rewards/rejected": -11.194300651550293, "step": 4490 }, { "epoch": 1.53, "learning_rate": 2.7231524612866675e-07, "logits/chosen": 0.8318904042243958, "logits/rejected": 2.778688907623291, "logps/chosen": -321.25396728515625, "logps/rejected": -660.5576171875, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 2.0317749977111816, "rewards/margins": 12.968721389770508, "rewards/rejected": -10.936944961547852, "step": 4500 }, { "epoch": 1.53, "eval_logits/chosen": 0.6407039761543274, "eval_logits/rejected": 2.7205677032470703, "eval_logps/chosen": -368.1263732910156, "eval_logps/rejected": -617.782958984375, "eval_loss": 0.008253191597759724, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 2.086351156234741, "eval_rewards/margins": 13.601614952087402, "eval_rewards/rejected": -11.515264511108398, "eval_runtime": 269.3071, "eval_samples_per_second": 35.276, "eval_steps_per_second": 1.103, "step": 4500 }, { "epoch": 1.53, "learning_rate": 2.7168576104746317e-07, "logits/chosen": 0.6592813730239868, "logits/rejected": 2.668743848800659, "logps/chosen": -340.2607116699219, "logps/rejected": -683.1118774414062, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 2.177586555480957, "rewards/margins": 14.914339065551758, "rewards/rejected": -12.736750602722168, "step": 4510 }, { "epoch": 1.54, "learning_rate": 2.710562759662596e-07, "logits/chosen": 0.7379637956619263, "logits/rejected": 2.5914015769958496, "logps/chosen": -390.2478942871094, "logps/rejected": -640.7365112304688, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2496771812438965, "rewards/margins": 13.077062606811523, "rewards/rejected": -10.827384948730469, "step": 4520 }, { "epoch": 1.54, "learning_rate": 2.70426790885056e-07, "logits/chosen": 0.9463948011398315, "logits/rejected": 2.068814754486084, "logps/chosen": -361.550048828125, "logps/rejected": -774.712158203125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 2.1057486534118652, "rewards/margins": 15.126858711242676, "rewards/rejected": -13.021112442016602, "step": 4530 }, { "epoch": 1.54, "learning_rate": 2.6979730580385244e-07, "logits/chosen": 1.441889762878418, "logits/rejected": 2.6717936992645264, "logps/chosen": -447.90020751953125, "logps/rejected": -669.8250732421875, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5580735206604004, "rewards/margins": 12.329019546508789, "rewards/rejected": -10.770946502685547, "step": 4540 }, { "epoch": 1.55, "learning_rate": 2.6916782072264886e-07, "logits/chosen": 0.7209632396697998, "logits/rejected": 2.7818751335144043, "logps/chosen": -372.5408630371094, "logps/rejected": -507.08544921875, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 2.4891321659088135, "rewards/margins": 13.217214584350586, "rewards/rejected": -10.728082656860352, "step": 4550 }, { "epoch": 1.55, "learning_rate": 2.685383356414453e-07, "logits/chosen": 1.1761372089385986, "logits/rejected": 2.7067036628723145, "logps/chosen": -387.07568359375, "logps/rejected": -529.3436279296875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 1.7527261972427368, "rewards/margins": 14.56840705871582, "rewards/rejected": -12.815679550170898, "step": 4560 }, { "epoch": 1.55, "learning_rate": 2.679088505602417e-07, "logits/chosen": 0.8417810201644897, "logits/rejected": 2.6834988594055176, "logps/chosen": -416.536865234375, "logps/rejected": -719.8585205078125, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 2.182037115097046, "rewards/margins": 14.083086967468262, "rewards/rejected": -11.901049613952637, "step": 4570 }, { "epoch": 1.56, "learning_rate": 2.6727936547903813e-07, "logits/chosen": 0.8248815536499023, "logits/rejected": 2.0804429054260254, "logps/chosen": -317.86749267578125, "logps/rejected": -642.9418334960938, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 1.9505695104599, "rewards/margins": 12.340843200683594, "rewards/rejected": -10.390274047851562, "step": 4580 }, { "epoch": 1.56, "learning_rate": 2.6664988039783455e-07, "logits/chosen": 1.028735876083374, "logits/rejected": 3.0316011905670166, "logps/chosen": -300.3289794921875, "logps/rejected": -475.13543701171875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.5529836416244507, "rewards/margins": 12.989659309387207, "rewards/rejected": -11.436675071716309, "step": 4590 }, { "epoch": 1.56, "learning_rate": 2.66020395316631e-07, "logits/chosen": 1.1963709592819214, "logits/rejected": 3.01017689704895, "logps/chosen": -311.89825439453125, "logps/rejected": -495.96893310546875, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5140174627304077, "rewards/margins": 13.036733627319336, "rewards/rejected": -11.522716522216797, "step": 4600 }, { "epoch": 1.56, "eval_logits/chosen": 0.6099021434783936, "eval_logits/rejected": 2.7246458530426025, "eval_logps/chosen": -369.0596618652344, "eval_logps/rejected": -616.05419921875, "eval_loss": 0.008312725462019444, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.9930243492126465, "eval_rewards/margins": 13.335411071777344, "eval_rewards/rejected": -11.342387199401855, "eval_runtime": 268.6015, "eval_samples_per_second": 35.368, "eval_steps_per_second": 1.106, "step": 4600 }, { "epoch": 1.57, "learning_rate": 2.6539091023542745e-07, "logits/chosen": 1.0765217542648315, "logits/rejected": 2.4071598052978516, "logps/chosen": -315.82122802734375, "logps/rejected": -714.0283203125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 2.855968952178955, "rewards/margins": 15.264370918273926, "rewards/rejected": -12.408401489257812, "step": 4610 }, { "epoch": 1.57, "learning_rate": 2.647614251542238e-07, "logits/chosen": 1.5065996646881104, "logits/rejected": 2.300039052963257, "logps/chosen": -351.83367919921875, "logps/rejected": -720.9744873046875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7324097156524658, "rewards/margins": 13.242719650268555, "rewards/rejected": -11.510311126708984, "step": 4620 }, { "epoch": 1.57, "learning_rate": 2.6413194007302024e-07, "logits/chosen": 1.575552225112915, "logits/rejected": 2.6150310039520264, "logps/chosen": -331.12310791015625, "logps/rejected": -626.9486694335938, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 1.5365428924560547, "rewards/margins": 11.85051441192627, "rewards/rejected": -10.313970565795898, "step": 4630 }, { "epoch": 1.58, "learning_rate": 2.6350245499181666e-07, "logits/chosen": 1.4718683958053589, "logits/rejected": 2.4823708534240723, "logps/chosen": -396.23748779296875, "logps/rejected": -616.1155395507812, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 2.2771496772766113, "rewards/margins": 13.974538803100586, "rewards/rejected": -11.697389602661133, "step": 4640 }, { "epoch": 1.58, "learning_rate": 2.628729699106131e-07, "logits/chosen": 0.8800075650215149, "logits/rejected": 3.012760877609253, "logps/chosen": -323.5577087402344, "logps/rejected": -583.1554565429688, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 2.246222496032715, "rewards/margins": 14.201080322265625, "rewards/rejected": -11.95485782623291, "step": 4650 }, { "epoch": 1.58, "learning_rate": 2.6224348482940956e-07, "logits/chosen": 0.7633311152458191, "logits/rejected": 2.2152318954467773, "logps/chosen": -466.2403869628906, "logps/rejected": -722.082763671875, "loss": 0.0159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.236987829208374, "rewards/margins": 13.460619926452637, "rewards/rejected": -11.2236328125, "step": 4660 }, { "epoch": 1.59, "learning_rate": 2.61613999748206e-07, "logits/chosen": 1.5745208263397217, "logits/rejected": 3.095592975616455, "logps/chosen": -404.6357116699219, "logps/rejected": -453.40130615234375, "loss": 0.0079, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7302166223526, "rewards/margins": 13.1705961227417, "rewards/rejected": -11.44037914276123, "step": 4670 }, { "epoch": 1.59, "learning_rate": 2.609845146670024e-07, "logits/chosen": 0.7493244409561157, "logits/rejected": 2.8759987354278564, "logps/chosen": -296.1628112792969, "logps/rejected": -457.446044921875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 2.227241277694702, "rewards/margins": 13.656562805175781, "rewards/rejected": -11.4293212890625, "step": 4680 }, { "epoch": 1.59, "learning_rate": 2.603550295857988e-07, "logits/chosen": 0.9803324937820435, "logits/rejected": 2.721207618713379, "logps/chosen": -336.9967346191406, "logps/rejected": -632.5643920898438, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 1.8605949878692627, "rewards/margins": 12.091894149780273, "rewards/rejected": -10.23129940032959, "step": 4690 }, { "epoch": 1.6, "learning_rate": 2.597255445045952e-07, "logits/chosen": 1.1940078735351562, "logits/rejected": 3.1348912715911865, "logps/chosen": -424.3228454589844, "logps/rejected": -489.9722595214844, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 1.4344812631607056, "rewards/margins": 13.698277473449707, "rewards/rejected": -12.263795852661133, "step": 4700 }, { "epoch": 1.6, "eval_logits/chosen": 0.6007880568504333, "eval_logits/rejected": 2.7200093269348145, "eval_logps/chosen": -369.6922607421875, "eval_logps/rejected": -615.797119140625, "eval_loss": 0.008013113401830196, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.929763913154602, "eval_rewards/margins": 13.246444702148438, "eval_rewards/rejected": -11.316682815551758, "eval_runtime": 268.0276, "eval_samples_per_second": 35.444, "eval_steps_per_second": 1.108, "step": 4700 }, { "epoch": 1.6, "learning_rate": 2.590960594233916e-07, "logits/chosen": 1.1567070484161377, "logits/rejected": 2.7277112007141113, "logps/chosen": -341.4814147949219, "logps/rejected": -638.3734130859375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 1.9156405925750732, "rewards/margins": 12.364795684814453, "rewards/rejected": -10.449155807495117, "step": 4710 }, { "epoch": 1.6, "learning_rate": 2.584665743421881e-07, "logits/chosen": 0.7439510822296143, "logits/rejected": 3.0841832160949707, "logps/chosen": -308.150634765625, "logps/rejected": -454.5015563964844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 2.00886869430542, "rewards/margins": 13.200039863586426, "rewards/rejected": -11.191170692443848, "step": 4720 }, { "epoch": 1.61, "learning_rate": 2.578370892609845e-07, "logits/chosen": 1.3363820314407349, "logits/rejected": 2.9260199069976807, "logps/chosen": -362.2496032714844, "logps/rejected": -518.860107421875, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.637078046798706, "rewards/margins": 13.22400188446045, "rewards/rejected": -11.58692455291748, "step": 4730 }, { "epoch": 1.61, "learning_rate": 2.5720760417978095e-07, "logits/chosen": 1.194959282875061, "logits/rejected": 2.6666481494903564, "logps/chosen": -426.196533203125, "logps/rejected": -553.5843505859375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 1.5456111431121826, "rewards/margins": 11.892793655395508, "rewards/rejected": -10.34718132019043, "step": 4740 }, { "epoch": 1.61, "learning_rate": 2.5657811909857737e-07, "logits/chosen": 1.7758777141571045, "logits/rejected": 2.655397653579712, "logps/chosen": -411.11578369140625, "logps/rejected": -534.5379028320312, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 1.7537777423858643, "rewards/margins": 11.740898132324219, "rewards/rejected": -9.987119674682617, "step": 4750 }, { "epoch": 1.62, "learning_rate": 2.559486340173738e-07, "logits/chosen": 0.6086001396179199, "logits/rejected": 2.924757242202759, "logps/chosen": -326.00897216796875, "logps/rejected": -520.2870483398438, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.8088676929473877, "rewards/margins": 14.248144149780273, "rewards/rejected": -12.439276695251465, "step": 4760 }, { "epoch": 1.62, "learning_rate": 2.5531914893617016e-07, "logits/chosen": 1.1811004877090454, "logits/rejected": 2.159785032272339, "logps/chosen": -328.2763977050781, "logps/rejected": -801.7093505859375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.4686853885650635, "rewards/margins": 12.940330505371094, "rewards/rejected": -11.471644401550293, "step": 4770 }, { "epoch": 1.62, "learning_rate": 2.5468966385496664e-07, "logits/chosen": 1.3045234680175781, "logits/rejected": 2.996737241744995, "logps/chosen": -318.265625, "logps/rejected": -461.1734313964844, "loss": 0.0084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1198580265045166, "rewards/margins": 13.443967819213867, "rewards/rejected": -11.32411003112793, "step": 4780 }, { "epoch": 1.63, "learning_rate": 2.5406017877376306e-07, "logits/chosen": 1.559531807899475, "logits/rejected": 2.582435131072998, "logps/chosen": -372.0412902832031, "logps/rejected": -553.0614624023438, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 1.4875469207763672, "rewards/margins": 13.850107192993164, "rewards/rejected": -12.362558364868164, "step": 4790 }, { "epoch": 1.63, "learning_rate": 2.534306936925595e-07, "logits/chosen": 1.2863677740097046, "logits/rejected": 2.7517781257629395, "logps/chosen": -382.712890625, "logps/rejected": -572.390869140625, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 2.2039921283721924, "rewards/margins": 13.863197326660156, "rewards/rejected": -11.659204483032227, "step": 4800 }, { "epoch": 1.63, "eval_logits/chosen": 0.5697704553604126, "eval_logits/rejected": 2.6917316913604736, "eval_logps/chosen": -370.18133544921875, "eval_logps/rejected": -617.3153686523438, "eval_loss": 0.007416225038468838, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.8808573484420776, "eval_rewards/margins": 13.349367141723633, "eval_rewards/rejected": -11.46850872039795, "eval_runtime": 267.3405, "eval_samples_per_second": 35.535, "eval_steps_per_second": 1.111, "step": 4800 }, { "epoch": 1.63, "learning_rate": 2.528012086113559e-07, "logits/chosen": 1.2882909774780273, "logits/rejected": 2.163443088531494, "logps/chosen": -332.18695068359375, "logps/rejected": -742.5114135742188, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.9539035558700562, "rewards/margins": 12.516752243041992, "rewards/rejected": -10.562848091125488, "step": 4810 }, { "epoch": 1.64, "learning_rate": 2.5217172353015233e-07, "logits/chosen": 0.6577471494674683, "logits/rejected": 2.275991916656494, "logps/chosen": -381.43731689453125, "logps/rejected": -725.3045043945312, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 2.548079252243042, "rewards/margins": 16.103429794311523, "rewards/rejected": -13.555349349975586, "step": 4820 }, { "epoch": 1.64, "learning_rate": 2.5154223844894875e-07, "logits/chosen": 1.2324239015579224, "logits/rejected": 3.194679021835327, "logps/chosen": -413.9017639160156, "logps/rejected": -476.4483947753906, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.6373205184936523, "rewards/margins": 12.455896377563477, "rewards/rejected": -10.818574905395508, "step": 4830 }, { "epoch": 1.65, "learning_rate": 2.509127533677452e-07, "logits/chosen": 0.9907386898994446, "logits/rejected": 2.874156951904297, "logps/chosen": -389.5166320800781, "logps/rejected": -418.999267578125, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 2.1311802864074707, "rewards/margins": 13.799692153930664, "rewards/rejected": -11.668512344360352, "step": 4840 }, { "epoch": 1.65, "learning_rate": 2.502832682865416e-07, "logits/chosen": 0.8386165499687195, "logits/rejected": 2.858229160308838, "logps/chosen": -393.86126708984375, "logps/rejected": -599.6715698242188, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 2.1954259872436523, "rewards/margins": 12.808425903320312, "rewards/rejected": -10.613000869750977, "step": 4850 }, { "epoch": 1.65, "learning_rate": 2.49653783205338e-07, "logits/chosen": 1.4464737176895142, "logits/rejected": 2.7185616493225098, "logps/chosen": -326.30816650390625, "logps/rejected": -562.0698852539062, "loss": 0.0074, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9854040145874023, "rewards/margins": 13.104391098022461, "rewards/rejected": -11.118986129760742, "step": 4860 }, { "epoch": 1.66, "learning_rate": 2.4902429812413444e-07, "logits/chosen": 0.9704988598823547, "logits/rejected": 2.861062526702881, "logps/chosen": -322.04351806640625, "logps/rejected": -590.7166137695312, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 1.773932695388794, "rewards/margins": 13.489224433898926, "rewards/rejected": -11.715291976928711, "step": 4870 }, { "epoch": 1.66, "learning_rate": 2.4839481304293086e-07, "logits/chosen": 0.7275189161300659, "logits/rejected": 2.4044909477233887, "logps/chosen": -359.2979431152344, "logps/rejected": -687.754150390625, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 1.7108738422393799, "rewards/margins": 13.818089485168457, "rewards/rejected": -12.107215881347656, "step": 4880 }, { "epoch": 1.66, "learning_rate": 2.477653279617273e-07, "logits/chosen": 1.3394114971160889, "logits/rejected": 2.08496356010437, "logps/chosen": -431.34332275390625, "logps/rejected": -859.7326049804688, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 2.0375823974609375, "rewards/margins": 14.49980354309082, "rewards/rejected": -12.462221145629883, "step": 4890 }, { "epoch": 1.67, "learning_rate": 2.471358428805237e-07, "logits/chosen": 1.1097261905670166, "logits/rejected": 2.604459285736084, "logps/chosen": -395.5567321777344, "logps/rejected": -643.832763671875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 2.0858750343322754, "rewards/margins": 13.381146430969238, "rewards/rejected": -11.295271873474121, "step": 4900 }, { "epoch": 1.67, "eval_logits/chosen": 0.5797955989837646, "eval_logits/rejected": 2.6860883235931396, "eval_logps/chosen": -369.99676513671875, "eval_logps/rejected": -621.4749145507812, "eval_loss": 0.007327604573220015, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8993133306503296, "eval_rewards/margins": 13.783766746520996, "eval_rewards/rejected": -11.884452819824219, "eval_runtime": 267.4053, "eval_samples_per_second": 35.527, "eval_steps_per_second": 1.111, "step": 4900 }, { "epoch": 1.67, "learning_rate": 2.4650635779932013e-07, "logits/chosen": 1.3302220106124878, "logits/rejected": 2.934269666671753, "logps/chosen": -337.15960693359375, "logps/rejected": -503.8389587402344, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.697217345237732, "rewards/margins": 14.28808879852295, "rewards/rejected": -12.59087085723877, "step": 4910 }, { "epoch": 1.67, "learning_rate": 2.4587687271811656e-07, "logits/chosen": 1.211080551147461, "logits/rejected": 2.21518874168396, "logps/chosen": -342.48504638671875, "logps/rejected": -734.8573608398438, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 2.1308863162994385, "rewards/margins": 13.767776489257812, "rewards/rejected": -11.63688850402832, "step": 4920 }, { "epoch": 1.68, "learning_rate": 2.45247387636913e-07, "logits/chosen": 0.7533133625984192, "logits/rejected": 2.5293025970458984, "logps/chosen": -349.9305114746094, "logps/rejected": -584.0184936523438, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.0276365280151367, "rewards/margins": 13.492365837097168, "rewards/rejected": -11.464729309082031, "step": 4930 }, { "epoch": 1.68, "learning_rate": 2.446179025557094e-07, "logits/chosen": 1.0548173189163208, "logits/rejected": 2.1759402751922607, "logps/chosen": -493.15142822265625, "logps/rejected": -702.56591796875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 1.9155597686767578, "rewards/margins": 13.1171236038208, "rewards/rejected": -11.201563835144043, "step": 4940 }, { "epoch": 1.68, "learning_rate": 2.439884174745059e-07, "logits/chosen": 0.9858635663986206, "logits/rejected": 2.9378695487976074, "logps/chosen": -381.10833740234375, "logps/rejected": -520.8736572265625, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 1.3787133693695068, "rewards/margins": 13.84961986541748, "rewards/rejected": -12.470907211303711, "step": 4950 }, { "epoch": 1.69, "learning_rate": 2.4335893239330225e-07, "logits/chosen": 1.2095704078674316, "logits/rejected": 2.5496010780334473, "logps/chosen": -418.56024169921875, "logps/rejected": -585.3426513671875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.7939611673355103, "rewards/margins": 12.407074928283691, "rewards/rejected": -10.613114356994629, "step": 4960 }, { "epoch": 1.69, "learning_rate": 2.4272944731209867e-07, "logits/chosen": 0.9482321739196777, "logits/rejected": 2.6138076782226562, "logps/chosen": -396.30633544921875, "logps/rejected": -573.6236572265625, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 2.186265230178833, "rewards/margins": 15.410557746887207, "rewards/rejected": -13.224291801452637, "step": 4970 }, { "epoch": 1.69, "learning_rate": 2.4209996223089514e-07, "logits/chosen": 1.3051884174346924, "logits/rejected": 2.48286771774292, "logps/chosen": -393.8180847167969, "logps/rejected": -603.3321533203125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.6906706094741821, "rewards/margins": 13.082013130187988, "rewards/rejected": -11.39134407043457, "step": 4980 }, { "epoch": 1.7, "learning_rate": 2.4147047714969157e-07, "logits/chosen": 0.8893558382987976, "logits/rejected": 2.5944838523864746, "logps/chosen": -335.33447265625, "logps/rejected": -661.49755859375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.2299575805664062, "rewards/margins": 12.682855606079102, "rewards/rejected": -10.452896118164062, "step": 4990 }, { "epoch": 1.7, "learning_rate": 2.4084099206848794e-07, "logits/chosen": 1.099838137626648, "logits/rejected": 2.7704567909240723, "logps/chosen": -321.14569091796875, "logps/rejected": -554.14697265625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 2.2605462074279785, "rewards/margins": 13.753137588500977, "rewards/rejected": -11.492591857910156, "step": 5000 }, { "epoch": 1.7, "eval_logits/chosen": 0.5411103367805481, "eval_logits/rejected": 2.6435320377349854, "eval_logps/chosen": -370.23480224609375, "eval_logps/rejected": -625.662353515625, "eval_loss": 0.007171071134507656, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.8755117654800415, "eval_rewards/margins": 14.178715705871582, "eval_rewards/rejected": -12.303203582763672, "eval_runtime": 268.4746, "eval_samples_per_second": 35.385, "eval_steps_per_second": 1.106, "step": 5000 }, { "epoch": 1.7, "learning_rate": 2.402115069872844e-07, "logits/chosen": 1.7221934795379639, "logits/rejected": 2.8443284034729004, "logps/chosen": -355.40570068359375, "logps/rejected": -497.22705078125, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 1.781616449356079, "rewards/margins": 15.23070240020752, "rewards/rejected": -13.44908618927002, "step": 5010 }, { "epoch": 1.71, "learning_rate": 2.3958202190608084e-07, "logits/chosen": 0.4256567358970642, "logits/rejected": 2.852240800857544, "logps/chosen": -388.9455261230469, "logps/rejected": -534.2367553710938, "loss": 0.01, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.100231170654297, "rewards/margins": 13.96013355255127, "rewards/rejected": -11.859903335571289, "step": 5020 }, { "epoch": 1.71, "learning_rate": 2.3895253682487726e-07, "logits/chosen": 1.6765273809432983, "logits/rejected": 2.6882596015930176, "logps/chosen": -330.9751281738281, "logps/rejected": -621.0322265625, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 2.1029813289642334, "rewards/margins": 12.23882007598877, "rewards/rejected": -10.13583755493164, "step": 5030 }, { "epoch": 1.71, "learning_rate": 2.3832305174367368e-07, "logits/chosen": 1.4308533668518066, "logits/rejected": 2.5526671409606934, "logps/chosen": -383.99676513671875, "logps/rejected": -571.560791015625, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 1.778713583946228, "rewards/margins": 14.228243827819824, "rewards/rejected": -12.449530601501465, "step": 5040 }, { "epoch": 1.72, "learning_rate": 2.3769356666247008e-07, "logits/chosen": 0.7223809361457825, "logits/rejected": 2.534702777862549, "logps/chosen": -369.4510803222656, "logps/rejected": -601.6087646484375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 1.5519216060638428, "rewards/margins": 13.02763557434082, "rewards/rejected": -11.475714683532715, "step": 5050 }, { "epoch": 1.72, "learning_rate": 2.370640815812665e-07, "logits/chosen": 1.0404523611068726, "logits/rejected": 2.5804059505462646, "logps/chosen": -396.8551330566406, "logps/rejected": -581.9253540039062, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 2.147939682006836, "rewards/margins": 14.98511028289795, "rewards/rejected": -12.837170600891113, "step": 5060 }, { "epoch": 1.72, "learning_rate": 2.3643459650006295e-07, "logits/chosen": 0.8765469789505005, "logits/rejected": 3.199450969696045, "logps/chosen": -391.029296875, "logps/rejected": -462.06317138671875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 2.0410709381103516, "rewards/margins": 13.092700004577637, "rewards/rejected": -11.051628112792969, "step": 5070 }, { "epoch": 1.73, "learning_rate": 2.3580511141885937e-07, "logits/chosen": 1.0518453121185303, "logits/rejected": 2.7533373832702637, "logps/chosen": -371.9391174316406, "logps/rejected": -500.344482421875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 2.1165008544921875, "rewards/margins": 14.174306869506836, "rewards/rejected": -12.057806015014648, "step": 5080 }, { "epoch": 1.73, "learning_rate": 2.3517562633765577e-07, "logits/chosen": 1.240299940109253, "logits/rejected": 2.9455935955047607, "logps/chosen": -300.67706298828125, "logps/rejected": -484.98553466796875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.8077754974365234, "rewards/margins": 13.667498588562012, "rewards/rejected": -11.859723091125488, "step": 5090 }, { "epoch": 1.73, "learning_rate": 2.3454614125645222e-07, "logits/chosen": 0.7239997982978821, "logits/rejected": 2.22920823097229, "logps/chosen": -339.1505432128906, "logps/rejected": -665.8603515625, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 1.847394347190857, "rewards/margins": 13.439477920532227, "rewards/rejected": -11.592084884643555, "step": 5100 }, { "epoch": 1.73, "eval_logits/chosen": 0.5392878651618958, "eval_logits/rejected": 2.6526615619659424, "eval_logps/chosen": -369.70660400390625, "eval_logps/rejected": -621.6979370117188, "eval_loss": 0.0076078129932284355, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 1.9283288717269897, "eval_rewards/margins": 13.835088729858398, "eval_rewards/rejected": -11.906759262084961, "eval_runtime": 267.4123, "eval_samples_per_second": 35.526, "eval_steps_per_second": 1.111, "step": 5100 }, { "epoch": 1.74, "learning_rate": 2.3391665617524864e-07, "logits/chosen": 0.5402101278305054, "logits/rejected": 2.0428051948547363, "logps/chosen": -446.457763671875, "logps/rejected": -828.9158325195312, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 2.1444571018218994, "rewards/margins": 14.292704582214355, "rewards/rejected": -12.148245811462402, "step": 5110 }, { "epoch": 1.74, "learning_rate": 2.3328717109404506e-07, "logits/chosen": 1.1088473796844482, "logits/rejected": 3.142504930496216, "logps/chosen": -327.00152587890625, "logps/rejected": -501.23016357421875, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 2.154792070388794, "rewards/margins": 14.1039457321167, "rewards/rejected": -11.9491548538208, "step": 5120 }, { "epoch": 1.74, "learning_rate": 2.3265768601284149e-07, "logits/chosen": 1.1828131675720215, "logits/rejected": 2.618335247039795, "logps/chosen": -461.40521240234375, "logps/rejected": -630.9039916992188, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.291642904281616, "rewards/margins": 13.117799758911133, "rewards/rejected": -10.826155662536621, "step": 5130 }, { "epoch": 1.75, "learning_rate": 2.320282009316379e-07, "logits/chosen": 0.49540406465530396, "logits/rejected": 2.015763282775879, "logps/chosen": -457.4979553222656, "logps/rejected": -764.1224975585938, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3074951171875, "rewards/margins": 13.635294914245605, "rewards/rejected": -11.327799797058105, "step": 5140 }, { "epoch": 1.75, "learning_rate": 2.3139871585043433e-07, "logits/chosen": 1.3483718633651733, "logits/rejected": 2.9165961742401123, "logps/chosen": -384.2827453613281, "logps/rejected": -517.1640625, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 2.0184836387634277, "rewards/margins": 14.222018241882324, "rewards/rejected": -12.203533172607422, "step": 5150 }, { "epoch": 1.75, "learning_rate": 2.3076923076923078e-07, "logits/chosen": 0.9851890802383423, "logits/rejected": 2.5453128814697266, "logps/chosen": -472.58721923828125, "logps/rejected": -554.181884765625, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 2.1495094299316406, "rewards/margins": 14.743972778320312, "rewards/rejected": -12.594462394714355, "step": 5160 }, { "epoch": 1.76, "learning_rate": 2.3013974568802718e-07, "logits/chosen": 1.0264804363250732, "logits/rejected": 2.838040590286255, "logps/chosen": -378.90606689453125, "logps/rejected": -520.4061279296875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 2.653881788253784, "rewards/margins": 14.299891471862793, "rewards/rejected": -11.64600944519043, "step": 5170 }, { "epoch": 1.76, "learning_rate": 2.295102606068236e-07, "logits/chosen": 0.6610434055328369, "logits/rejected": 2.055659055709839, "logps/chosen": -336.66229248046875, "logps/rejected": -809.7601318359375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.5673797130584717, "rewards/margins": 14.333930969238281, "rewards/rejected": -12.766549110412598, "step": 5180 }, { "epoch": 1.76, "learning_rate": 2.2888077552562005e-07, "logits/chosen": 0.7072926163673401, "logits/rejected": 2.731778621673584, "logps/chosen": -338.4165954589844, "logps/rejected": -631.5391845703125, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.0703439712524414, "rewards/margins": 13.521017074584961, "rewards/rejected": -11.450674057006836, "step": 5190 }, { "epoch": 1.77, "learning_rate": 2.2825129044441647e-07, "logits/chosen": 0.7150042653083801, "logits/rejected": 3.074216842651367, "logps/chosen": -312.5461120605469, "logps/rejected": -556.0842895507812, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9952480792999268, "rewards/margins": 13.600128173828125, "rewards/rejected": -11.604883193969727, "step": 5200 }, { "epoch": 1.77, "eval_logits/chosen": 0.5763381123542786, "eval_logits/rejected": 2.6790342330932617, "eval_logps/chosen": -369.119873046875, "eval_logps/rejected": -621.7357177734375, "eval_loss": 0.007353052031248808, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 1.9870065450668335, "eval_rewards/margins": 13.897536277770996, "eval_rewards/rejected": -11.910529136657715, "eval_runtime": 267.7726, "eval_samples_per_second": 35.478, "eval_steps_per_second": 1.109, "step": 5200 }, { "epoch": 1.77, "learning_rate": 2.2762180536321287e-07, "logits/chosen": 1.0697767734527588, "logits/rejected": 2.543888807296753, "logps/chosen": -324.13116455078125, "logps/rejected": -668.995361328125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 1.9455013275146484, "rewards/margins": 12.813148498535156, "rewards/rejected": -10.867646217346191, "step": 5210 }, { "epoch": 1.77, "learning_rate": 2.2699232028200932e-07, "logits/chosen": 1.0008208751678467, "logits/rejected": 2.202432155609131, "logps/chosen": -388.22332763671875, "logps/rejected": -730.78564453125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 2.0636024475097656, "rewards/margins": 12.839284896850586, "rewards/rejected": -10.77568531036377, "step": 5220 }, { "epoch": 1.78, "learning_rate": 2.2636283520080574e-07, "logits/chosen": 1.6768519878387451, "logits/rejected": 2.8521502017974854, "logps/chosen": -432.42132568359375, "logps/rejected": -569.5505981445312, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 1.9164817333221436, "rewards/margins": 13.948224067687988, "rewards/rejected": -12.031743049621582, "step": 5230 }, { "epoch": 1.78, "learning_rate": 2.2573335011960216e-07, "logits/chosen": 0.578073263168335, "logits/rejected": 2.474010467529297, "logps/chosen": -319.8635559082031, "logps/rejected": -753.2205200195312, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 2.47343111038208, "rewards/margins": 15.181828498840332, "rewards/rejected": -12.708395957946777, "step": 5240 }, { "epoch": 1.78, "learning_rate": 2.2510386503839856e-07, "logits/chosen": 1.2054836750030518, "logits/rejected": 2.4689555168151855, "logps/chosen": -330.71197509765625, "logps/rejected": -639.7576293945312, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.0893828868865967, "rewards/margins": 14.316856384277344, "rewards/rejected": -12.227472305297852, "step": 5250 }, { "epoch": 1.79, "learning_rate": 2.24474379957195e-07, "logits/chosen": 0.8659073710441589, "logits/rejected": 2.069164514541626, "logps/chosen": -472.73992919921875, "logps/rejected": -751.3345336914062, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 1.6958650350570679, "rewards/margins": 14.968708992004395, "rewards/rejected": -13.272845268249512, "step": 5260 }, { "epoch": 1.79, "learning_rate": 2.2384489487599143e-07, "logits/chosen": 0.7720758318901062, "logits/rejected": 2.473846912384033, "logps/chosen": -453.7737731933594, "logps/rejected": -785.3414306640625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.9351928234100342, "rewards/margins": 15.376632690429688, "rewards/rejected": -13.441439628601074, "step": 5270 }, { "epoch": 1.79, "learning_rate": 2.2321540979478783e-07, "logits/chosen": 0.5396759510040283, "logits/rejected": 2.619581699371338, "logps/chosen": -372.71575927734375, "logps/rejected": -683.4382934570312, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 1.3975188732147217, "rewards/margins": 15.780477523803711, "rewards/rejected": -14.382959365844727, "step": 5280 }, { "epoch": 1.8, "learning_rate": 2.2258592471358428e-07, "logits/chosen": 1.0776684284210205, "logits/rejected": 2.364051342010498, "logps/chosen": -315.5631103515625, "logps/rejected": -682.8671875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 1.5904780626296997, "rewards/margins": 13.19941520690918, "rewards/rejected": -11.608936309814453, "step": 5290 }, { "epoch": 1.8, "learning_rate": 2.219564396323807e-07, "logits/chosen": 1.0182130336761475, "logits/rejected": 2.3595683574676514, "logps/chosen": -320.3334045410156, "logps/rejected": -739.2911376953125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 1.6678212881088257, "rewards/margins": 14.77850341796875, "rewards/rejected": -13.110682487487793, "step": 5300 }, { "epoch": 1.8, "eval_logits/chosen": 0.5392746329307556, "eval_logits/rejected": 2.6264493465423584, "eval_logps/chosen": -370.9958801269531, "eval_logps/rejected": -627.23095703125, "eval_loss": 0.006811817176640034, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 1.799401879310608, "eval_rewards/margins": 14.259466171264648, "eval_rewards/rejected": -12.460063934326172, "eval_runtime": 268.9812, "eval_samples_per_second": 35.318, "eval_steps_per_second": 1.104, "step": 5300 }, { "epoch": 1.8, "learning_rate": 2.2132695455117712e-07, "logits/chosen": 1.2998424768447876, "logits/rejected": 3.0192737579345703, "logps/chosen": -341.30340576171875, "logps/rejected": -509.3768005371094, "loss": 0.005, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.990985631942749, "rewards/margins": 14.92119026184082, "rewards/rejected": -12.930203437805176, "step": 5310 }, { "epoch": 1.81, "learning_rate": 2.2069746946997355e-07, "logits/chosen": 0.29328662157058716, "logits/rejected": 3.271939516067505, "logps/chosen": -281.95684814453125, "logps/rejected": -424.2947692871094, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 1.4099323749542236, "rewards/margins": 13.121679306030273, "rewards/rejected": -11.711746215820312, "step": 5320 }, { "epoch": 1.81, "learning_rate": 2.2006798438876997e-07, "logits/chosen": 1.244208574295044, "logits/rejected": 2.4942610263824463, "logps/chosen": -451.87091064453125, "logps/rejected": -706.4019165039062, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.5382120609283447, "rewards/margins": 12.513381004333496, "rewards/rejected": -10.97516918182373, "step": 5330 }, { "epoch": 1.82, "learning_rate": 2.194384993075664e-07, "logits/chosen": 0.7272036075592041, "logits/rejected": 2.511995792388916, "logps/chosen": -298.5057678222656, "logps/rejected": -636.6715087890625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 2.117550849914551, "rewards/margins": 14.677660942077637, "rewards/rejected": -12.560112953186035, "step": 5340 }, { "epoch": 1.82, "learning_rate": 2.1880901422636284e-07, "logits/chosen": 1.2972975969314575, "logits/rejected": 2.9249444007873535, "logps/chosen": -325.5199279785156, "logps/rejected": -565.7622680664062, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.2052669525146484, "rewards/margins": 14.870798110961914, "rewards/rejected": -12.665529251098633, "step": 5350 }, { "epoch": 1.82, "learning_rate": 2.1817952914515924e-07, "logits/chosen": 0.94163978099823, "logits/rejected": 2.520163059234619, "logps/chosen": -321.620361328125, "logps/rejected": -606.1392822265625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.9230222702026367, "rewards/margins": 13.822883605957031, "rewards/rejected": -11.899862289428711, "step": 5360 }, { "epoch": 1.83, "learning_rate": 2.1755004406395566e-07, "logits/chosen": 0.6572908163070679, "logits/rejected": 2.8326666355133057, "logps/chosen": -399.23150634765625, "logps/rejected": -500.59326171875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 2.4918336868286133, "rewards/margins": 17.32343292236328, "rewards/rejected": -14.8316011428833, "step": 5370 }, { "epoch": 1.83, "learning_rate": 2.169205589827521e-07, "logits/chosen": 0.7684445381164551, "logits/rejected": 2.7367186546325684, "logps/chosen": -334.71453857421875, "logps/rejected": -572.8629150390625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 1.8657491207122803, "rewards/margins": 16.919837951660156, "rewards/rejected": -15.054089546203613, "step": 5380 }, { "epoch": 1.83, "learning_rate": 2.1629107390154853e-07, "logits/chosen": 0.946051299571991, "logits/rejected": 2.517101287841797, "logps/chosen": -322.4632568359375, "logps/rejected": -652.5506591796875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 1.803297996520996, "rewards/margins": 14.255006790161133, "rewards/rejected": -12.451708793640137, "step": 5390 }, { "epoch": 1.84, "learning_rate": 2.1566158882034493e-07, "logits/chosen": 0.7960838675498962, "logits/rejected": 2.9010050296783447, "logps/chosen": -310.89794921875, "logps/rejected": -486.76348876953125, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1863255500793457, "rewards/margins": 14.510612487792969, "rewards/rejected": -12.324285507202148, "step": 5400 }, { "epoch": 1.84, "eval_logits/chosen": 0.5465123057365417, "eval_logits/rejected": 2.6408891677856445, "eval_logps/chosen": -368.5406799316406, "eval_logps/rejected": -624.6871337890625, "eval_loss": 0.006404118612408638, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 2.04492449760437, "eval_rewards/margins": 14.250606536865234, "eval_rewards/rejected": -12.205682754516602, "eval_runtime": 267.8158, "eval_samples_per_second": 35.472, "eval_steps_per_second": 1.109, "step": 5400 }, { "epoch": 1.84, "learning_rate": 2.1503210373914138e-07, "logits/chosen": 1.2992589473724365, "logits/rejected": 2.140784978866577, "logps/chosen": -386.3299255371094, "logps/rejected": -728.1055908203125, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.298143148422241, "rewards/margins": 13.360751152038574, "rewards/rejected": -11.062607765197754, "step": 5410 }, { "epoch": 1.84, "learning_rate": 2.144026186579378e-07, "logits/chosen": 0.944604218006134, "logits/rejected": 2.4234421253204346, "logps/chosen": -299.80853271484375, "logps/rejected": -729.2354736328125, "loss": 0.0085, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.149277448654175, "rewards/margins": 15.382696151733398, "rewards/rejected": -13.233418464660645, "step": 5420 }, { "epoch": 1.85, "learning_rate": 2.1377313357673422e-07, "logits/chosen": 1.1390790939331055, "logits/rejected": 2.4424569606781006, "logps/chosen": -372.9580383300781, "logps/rejected": -661.5415649414062, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 1.522296667098999, "rewards/margins": 13.513280868530273, "rewards/rejected": -11.990983963012695, "step": 5430 }, { "epoch": 1.85, "learning_rate": 2.1314364849553065e-07, "logits/chosen": 0.9248983263969421, "logits/rejected": 2.6572163105010986, "logps/chosen": -358.212646484375, "logps/rejected": -549.5049438476562, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 1.6326959133148193, "rewards/margins": 14.595819473266602, "rewards/rejected": -12.96312427520752, "step": 5440 }, { "epoch": 1.85, "learning_rate": 2.1251416341432707e-07, "logits/chosen": 0.878515899181366, "logits/rejected": 2.608353614807129, "logps/chosen": -400.9839782714844, "logps/rejected": -705.0433349609375, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 2.141390800476074, "rewards/margins": 14.705162048339844, "rewards/rejected": -12.56377124786377, "step": 5450 }, { "epoch": 1.86, "learning_rate": 2.118846783331235e-07, "logits/chosen": 0.9696270227432251, "logits/rejected": 2.727121114730835, "logps/chosen": -410.68963623046875, "logps/rejected": -601.6516723632812, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 1.737865686416626, "rewards/margins": 14.875020027160645, "rewards/rejected": -13.137155532836914, "step": 5460 }, { "epoch": 1.86, "learning_rate": 2.1125519325191994e-07, "logits/chosen": 1.1417778730392456, "logits/rejected": 2.4671216011047363, "logps/chosen": -338.8183288574219, "logps/rejected": -724.9601440429688, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.960103988647461, "rewards/margins": 15.837699890136719, "rewards/rejected": -13.877595901489258, "step": 5470 }, { "epoch": 1.86, "learning_rate": 2.1062570817071634e-07, "logits/chosen": 1.0455242395401, "logits/rejected": 2.3731436729431152, "logps/chosen": -434.44024658203125, "logps/rejected": -633.2236938476562, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.8375475406646729, "rewards/margins": 12.737271308898926, "rewards/rejected": -10.899724960327148, "step": 5480 }, { "epoch": 1.87, "learning_rate": 2.0999622308951276e-07, "logits/chosen": 1.118044137954712, "logits/rejected": 2.3109071254730225, "logps/chosen": -375.8702087402344, "logps/rejected": -674.3856811523438, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.1463160514831543, "rewards/margins": 15.210006713867188, "rewards/rejected": -13.063692092895508, "step": 5490 }, { "epoch": 1.87, "learning_rate": 2.093667380083092e-07, "logits/chosen": 0.9572780728340149, "logits/rejected": 2.776047945022583, "logps/chosen": -329.673583984375, "logps/rejected": -557.1834716796875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.6945956945419312, "rewards/margins": 14.333562850952148, "rewards/rejected": -12.638967514038086, "step": 5500 }, { "epoch": 1.87, "eval_logits/chosen": 0.5432767271995544, "eval_logits/rejected": 2.6332035064697266, "eval_logps/chosen": -369.0491027832031, "eval_logps/rejected": -627.0294799804688, "eval_loss": 0.006227751262485981, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.9940775632858276, "eval_rewards/margins": 14.433989524841309, "eval_rewards/rejected": -12.439913749694824, "eval_runtime": 268.031, "eval_samples_per_second": 35.444, "eval_steps_per_second": 1.108, "step": 5500 }, { "epoch": 1.87, "learning_rate": 2.087372529271056e-07, "logits/chosen": 0.9330072402954102, "logits/rejected": 2.2522757053375244, "logps/chosen": -447.468994140625, "logps/rejected": -697.0299072265625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3345236778259277, "rewards/margins": 15.280471801757812, "rewards/rejected": -12.945945739746094, "step": 5510 }, { "epoch": 1.88, "learning_rate": 2.0810776784590203e-07, "logits/chosen": 1.5043458938598633, "logits/rejected": 3.0579121112823486, "logps/chosen": -326.2335205078125, "logps/rejected": -600.5350341796875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.7955917119979858, "rewards/margins": 15.176149368286133, "rewards/rejected": -13.380559921264648, "step": 5520 }, { "epoch": 1.88, "learning_rate": 2.0747828276469848e-07, "logits/chosen": 0.7315915822982788, "logits/rejected": 1.9385446310043335, "logps/chosen": -429.71746826171875, "logps/rejected": -748.8201904296875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 2.13820219039917, "rewards/margins": 13.658441543579102, "rewards/rejected": -11.520238876342773, "step": 5530 }, { "epoch": 1.88, "learning_rate": 2.068487976834949e-07, "logits/chosen": 1.217410683631897, "logits/rejected": 2.797996997833252, "logps/chosen": -355.1609802246094, "logps/rejected": -616.5965576171875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 2.3493545055389404, "rewards/margins": 15.734540939331055, "rewards/rejected": -13.385187149047852, "step": 5540 }, { "epoch": 1.89, "learning_rate": 2.062193126022913e-07, "logits/chosen": 0.6537594795227051, "logits/rejected": 2.7036337852478027, "logps/chosen": -308.1028747558594, "logps/rejected": -716.1016235351562, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.7219940423965454, "rewards/margins": 14.988980293273926, "rewards/rejected": -13.266984939575195, "step": 5550 }, { "epoch": 1.89, "learning_rate": 2.0558982752108775e-07, "logits/chosen": 1.0059354305267334, "logits/rejected": 2.405097484588623, "logps/chosen": -354.9513854980469, "logps/rejected": -728.5120849609375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.8416436910629272, "rewards/margins": 15.623313903808594, "rewards/rejected": -13.781671524047852, "step": 5560 }, { "epoch": 1.89, "learning_rate": 2.0496034243988417e-07, "logits/chosen": 0.7288120985031128, "logits/rejected": 2.4463515281677246, "logps/chosen": -386.6265869140625, "logps/rejected": -694.5880737304688, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 1.276694893836975, "rewards/margins": 13.237451553344727, "rewards/rejected": -11.960756301879883, "step": 5570 }, { "epoch": 1.9, "learning_rate": 2.043308573586806e-07, "logits/chosen": 0.9864422082901001, "logits/rejected": 2.416912794113159, "logps/chosen": -442.6741638183594, "logps/rejected": -745.8394775390625, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 1.6303446292877197, "rewards/margins": 13.753680229187012, "rewards/rejected": -12.123335838317871, "step": 5580 }, { "epoch": 1.9, "learning_rate": 2.0370137227747701e-07, "logits/chosen": 1.0159143209457397, "logits/rejected": 2.404919147491455, "logps/chosen": -403.14892578125, "logps/rejected": -700.0591430664062, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.6275684833526611, "rewards/margins": 14.735166549682617, "rewards/rejected": -13.107600212097168, "step": 5590 }, { "epoch": 1.9, "learning_rate": 2.0307188719627344e-07, "logits/chosen": 0.8741198778152466, "logits/rejected": 2.7800064086914062, "logps/chosen": -452.5263671875, "logps/rejected": -546.5322265625, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 1.4914939403533936, "rewards/margins": 13.270769119262695, "rewards/rejected": -11.779274940490723, "step": 5600 }, { "epoch": 1.9, "eval_logits/chosen": 0.5377217531204224, "eval_logits/rejected": 2.6299867630004883, "eval_logps/chosen": -369.87109375, "eval_logps/rejected": -626.6300048828125, "eval_loss": 0.006109884940087795, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.9118820428848267, "eval_rewards/margins": 14.311848640441895, "eval_rewards/rejected": -12.399968147277832, "eval_runtime": 268.6218, "eval_samples_per_second": 35.366, "eval_steps_per_second": 1.106, "step": 5600 }, { "epoch": 1.91, "learning_rate": 2.0244240211506986e-07, "logits/chosen": 1.6756540536880493, "logits/rejected": 2.825979709625244, "logps/chosen": -355.1505126953125, "logps/rejected": -458.744140625, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.7418196201324463, "rewards/margins": 14.322778701782227, "rewards/rejected": -12.580958366394043, "step": 5610 }, { "epoch": 1.91, "learning_rate": 2.018129170338663e-07, "logits/chosen": 1.3055442571640015, "logits/rejected": 2.628654956817627, "logps/chosen": -510.27685546875, "logps/rejected": -572.4608764648438, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 2.037275791168213, "rewards/margins": 15.664937019348145, "rewards/rejected": -13.627660751342773, "step": 5620 }, { "epoch": 1.91, "learning_rate": 2.011834319526627e-07, "logits/chosen": 0.9477977752685547, "logits/rejected": 2.4301483631134033, "logps/chosen": -390.4095458984375, "logps/rejected": -582.1488647460938, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 2.3422436714172363, "rewards/margins": 14.721003532409668, "rewards/rejected": -12.378759384155273, "step": 5630 }, { "epoch": 1.92, "learning_rate": 2.0055394687145913e-07, "logits/chosen": 0.8035489916801453, "logits/rejected": 1.7046220302581787, "logps/chosen": -377.4579162597656, "logps/rejected": -827.44580078125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.7721195220947266, "rewards/margins": 13.13145637512207, "rewards/rejected": -11.359336853027344, "step": 5640 }, { "epoch": 1.92, "learning_rate": 1.9992446179025558e-07, "logits/chosen": 0.450077623128891, "logits/rejected": 2.0916006565093994, "logps/chosen": -318.74053955078125, "logps/rejected": -677.7642822265625, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 1.5363391637802124, "rewards/margins": 13.124608993530273, "rewards/rejected": -11.588269233703613, "step": 5650 }, { "epoch": 1.92, "learning_rate": 1.99294976709052e-07, "logits/chosen": 1.2795765399932861, "logits/rejected": 2.624028444290161, "logps/chosen": -337.2548828125, "logps/rejected": -615.2337036132812, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 1.5114516019821167, "rewards/margins": 14.363494873046875, "rewards/rejected": -12.852045059204102, "step": 5660 }, { "epoch": 1.93, "learning_rate": 1.986654916278484e-07, "logits/chosen": 1.387978434562683, "logits/rejected": 2.4463891983032227, "logps/chosen": -324.6473693847656, "logps/rejected": -653.7659301757812, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.8007307052612305, "rewards/margins": 13.293085098266602, "rewards/rejected": -11.492353439331055, "step": 5670 }, { "epoch": 1.93, "learning_rate": 1.9803600654664484e-07, "logits/chosen": 0.9867205619812012, "logits/rejected": 1.7832437753677368, "logps/chosen": -331.4938049316406, "logps/rejected": -868.81591796875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 2.198737382888794, "rewards/margins": 15.122095108032227, "rewards/rejected": -12.923357009887695, "step": 5680 }, { "epoch": 1.93, "learning_rate": 1.9740652146544127e-07, "logits/chosen": 0.9721837043762207, "logits/rejected": 2.441796064376831, "logps/chosen": -407.02264404296875, "logps/rejected": -610.9381103515625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 2.146698474884033, "rewards/margins": 13.958742141723633, "rewards/rejected": -11.812042236328125, "step": 5690 }, { "epoch": 1.94, "learning_rate": 1.9677703638423766e-07, "logits/chosen": 1.1788890361785889, "logits/rejected": 2.8577880859375, "logps/chosen": -334.38848876953125, "logps/rejected": -539.4073486328125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 2.0800750255584717, "rewards/margins": 15.230023384094238, "rewards/rejected": -13.149948120117188, "step": 5700 }, { "epoch": 1.94, "eval_logits/chosen": 0.5288003087043762, "eval_logits/rejected": 2.624809980392456, "eval_logps/chosen": -368.4456787109375, "eval_logps/rejected": -624.3119506835938, "eval_loss": 0.006175518035888672, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 2.0544216632843018, "eval_rewards/margins": 14.222590446472168, "eval_rewards/rejected": -12.168168067932129, "eval_runtime": 266.7992, "eval_samples_per_second": 35.607, "eval_steps_per_second": 1.113, "step": 5700 }, { "epoch": 1.94, "learning_rate": 1.961475513030341e-07, "logits/chosen": 1.1513116359710693, "logits/rejected": 2.2623372077941895, "logps/chosen": -439.71942138671875, "logps/rejected": -656.05224609375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.9392688274383545, "rewards/margins": 13.81745433807373, "rewards/rejected": -11.878186225891113, "step": 5710 }, { "epoch": 1.94, "learning_rate": 1.9551806622183054e-07, "logits/chosen": 0.9612258672714233, "logits/rejected": 2.3724660873413086, "logps/chosen": -312.3752136230469, "logps/rejected": -670.6126708984375, "loss": 0.0081, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9849334955215454, "rewards/margins": 14.504766464233398, "rewards/rejected": -12.5198335647583, "step": 5720 }, { "epoch": 1.95, "learning_rate": 1.9488858114062696e-07, "logits/chosen": 1.6316070556640625, "logits/rejected": 2.7541658878326416, "logps/chosen": -328.0393981933594, "logps/rejected": -498.2681579589844, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 2.378713846206665, "rewards/margins": 15.276588439941406, "rewards/rejected": -12.89787483215332, "step": 5730 }, { "epoch": 1.95, "learning_rate": 1.9425909605942338e-07, "logits/chosen": 0.5457426905632019, "logits/rejected": 2.7625725269317627, "logps/chosen": -515.2962646484375, "logps/rejected": -626.9390869140625, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 1.7357877492904663, "rewards/margins": 13.215678215026855, "rewards/rejected": -11.479890823364258, "step": 5740 }, { "epoch": 1.95, "learning_rate": 1.936296109782198e-07, "logits/chosen": 0.8939191102981567, "logits/rejected": 1.777658462524414, "logps/chosen": -351.0246276855469, "logps/rejected": -832.22705078125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 2.2989163398742676, "rewards/margins": 16.514678955078125, "rewards/rejected": -14.2157621383667, "step": 5750 }, { "epoch": 1.96, "learning_rate": 1.9300012589701623e-07, "logits/chosen": 0.9846128225326538, "logits/rejected": 1.9890168905258179, "logps/chosen": -417.22454833984375, "logps/rejected": -747.7069702148438, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.978386640548706, "rewards/margins": 13.060282707214355, "rewards/rejected": -11.08189582824707, "step": 5760 }, { "epoch": 1.96, "learning_rate": 1.9237064081581268e-07, "logits/chosen": 0.5852854251861572, "logits/rejected": 2.476715087890625, "logps/chosen": -331.2423095703125, "logps/rejected": -701.3380126953125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 2.2032055854797363, "rewards/margins": 15.220135688781738, "rewards/rejected": -13.016927719116211, "step": 5770 }, { "epoch": 1.96, "learning_rate": 1.9174115573460907e-07, "logits/chosen": 0.7734403610229492, "logits/rejected": 2.3022501468658447, "logps/chosen": -400.611328125, "logps/rejected": -658.062744140625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.9118320941925049, "rewards/margins": 15.38043212890625, "rewards/rejected": -13.468599319458008, "step": 5780 }, { "epoch": 1.97, "learning_rate": 1.911116706534055e-07, "logits/chosen": 1.4819624423980713, "logits/rejected": 2.5294859409332275, "logps/chosen": -466.876708984375, "logps/rejected": -613.1146240234375, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.820359230041504, "rewards/margins": 13.872444152832031, "rewards/rejected": -12.052085876464844, "step": 5790 }, { "epoch": 1.97, "learning_rate": 1.9048218557220194e-07, "logits/chosen": 0.6445799469947815, "logits/rejected": 3.186220407485962, "logps/chosen": -382.71282958984375, "logps/rejected": -496.46221923828125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 1.8192764520645142, "rewards/margins": 14.626768112182617, "rewards/rejected": -12.807493209838867, "step": 5800 }, { "epoch": 1.97, "eval_logits/chosen": 0.5421663522720337, "eval_logits/rejected": 2.624814510345459, "eval_logps/chosen": -368.0468444824219, "eval_logps/rejected": -625.33251953125, "eval_loss": 0.006073611788451672, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 2.0943007469177246, "eval_rewards/margins": 14.364514350891113, "eval_rewards/rejected": -12.270215034484863, "eval_runtime": 267.5626, "eval_samples_per_second": 35.506, "eval_steps_per_second": 1.11, "step": 5800 }, { "epoch": 1.97, "learning_rate": 1.8985270049099837e-07, "logits/chosen": 1.1447639465332031, "logits/rejected": 2.342881441116333, "logps/chosen": -317.5584411621094, "logps/rejected": -704.2658081054688, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 1.8849990367889404, "rewards/margins": 13.785212516784668, "rewards/rejected": -11.900214195251465, "step": 5810 }, { "epoch": 1.98, "learning_rate": 1.8922321540979476e-07, "logits/chosen": 1.2115243673324585, "logits/rejected": 2.874825954437256, "logps/chosen": -470.36224365234375, "logps/rejected": -491.6048278808594, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.872240662574768, "rewards/margins": 15.113815307617188, "rewards/rejected": -13.241575241088867, "step": 5820 }, { "epoch": 1.98, "learning_rate": 1.885937303285912e-07, "logits/chosen": 0.2891542315483093, "logits/rejected": 2.481403350830078, "logps/chosen": -302.67083740234375, "logps/rejected": -679.9458618164062, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0729470252990723, "rewards/margins": 15.368230819702148, "rewards/rejected": -13.29528522491455, "step": 5830 }, { "epoch": 1.99, "learning_rate": 1.8796424524738764e-07, "logits/chosen": 0.47113484144210815, "logits/rejected": 2.459672451019287, "logps/chosen": -395.92315673828125, "logps/rejected": -567.0223388671875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.37471866607666, "rewards/margins": 14.61865234375, "rewards/rejected": -12.243932723999023, "step": 5840 }, { "epoch": 1.99, "learning_rate": 1.8733476016618406e-07, "logits/chosen": 0.5824601054191589, "logits/rejected": 2.3544952869415283, "logps/chosen": -366.3118591308594, "logps/rejected": -600.1126708984375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 2.0960936546325684, "rewards/margins": 14.047538757324219, "rewards/rejected": -11.951444625854492, "step": 5850 }, { "epoch": 1.99, "learning_rate": 1.8670527508498048e-07, "logits/chosen": 0.888663649559021, "logits/rejected": 2.1462912559509277, "logps/chosen": -315.76434326171875, "logps/rejected": -801.0203857421875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.9007564783096313, "rewards/margins": 15.119722366333008, "rewards/rejected": -13.218966484069824, "step": 5860 }, { "epoch": 2.0, "learning_rate": 1.860757900037769e-07, "logits/chosen": 0.42253509163856506, "logits/rejected": 2.834648609161377, "logps/chosen": -320.17999267578125, "logps/rejected": -539.3760375976562, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 2.2486395835876465, "rewards/margins": 15.82959270477295, "rewards/rejected": -13.580953598022461, "step": 5870 }, { "epoch": 2.0, "learning_rate": 1.8544630492257333e-07, "logits/chosen": 0.5946919322013855, "logits/rejected": 2.0946545600891113, "logps/chosen": -306.47149658203125, "logps/rejected": -675.6048583984375, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.677959680557251, "rewards/margins": 14.716924667358398, "rewards/rejected": -13.038965225219727, "step": 5880 }, { "epoch": 2.0, "learning_rate": 1.8481681984136978e-07, "logits/chosen": 1.366878867149353, "logits/rejected": 2.7167229652404785, "logps/chosen": -336.2856140136719, "logps/rejected": -511.2284240722656, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.7026660442352295, "rewards/margins": 14.071836471557617, "rewards/rejected": -12.369170188903809, "step": 5890 }, { "epoch": 2.01, "learning_rate": 1.8418733476016617e-07, "logits/chosen": 1.1604326963424683, "logits/rejected": 2.527015209197998, "logps/chosen": -380.0445251464844, "logps/rejected": -559.8151245117188, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 2.2937469482421875, "rewards/margins": 13.332115173339844, "rewards/rejected": -11.038368225097656, "step": 5900 }, { "epoch": 2.01, "eval_logits/chosen": 0.518649697303772, "eval_logits/rejected": 2.571194648742676, "eval_logps/chosen": -369.7945556640625, "eval_logps/rejected": -631.9784545898438, "eval_loss": 0.005714269354939461, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.919531226158142, "eval_rewards/margins": 14.854342460632324, "eval_rewards/rejected": -12.934809684753418, "eval_runtime": 268.1608, "eval_samples_per_second": 35.427, "eval_steps_per_second": 1.108, "step": 5900 }, { "epoch": 2.01, "learning_rate": 1.835578496789626e-07, "logits/chosen": 0.981165885925293, "logits/rejected": 2.4159669876098633, "logps/chosen": -390.6913757324219, "logps/rejected": -670.6868286132812, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 2.1235125064849854, "rewards/margins": 14.490254402160645, "rewards/rejected": -12.366741180419922, "step": 5910 }, { "epoch": 2.01, "learning_rate": 1.8292836459775904e-07, "logits/chosen": 1.076453447341919, "logits/rejected": 2.0716986656188965, "logps/chosen": -440.6607360839844, "logps/rejected": -772.1173706054688, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.22540020942688, "rewards/margins": 14.935269355773926, "rewards/rejected": -12.709869384765625, "step": 5920 }, { "epoch": 2.02, "learning_rate": 1.8229887951655544e-07, "logits/chosen": 1.0230543613433838, "logits/rejected": 2.3830807209014893, "logps/chosen": -298.8233947753906, "logps/rejected": -630.2376098632812, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.1085221767425537, "rewards/margins": 14.758903503417969, "rewards/rejected": -12.650381088256836, "step": 5930 }, { "epoch": 2.02, "learning_rate": 1.8166939443535186e-07, "logits/chosen": 1.2830191850662231, "logits/rejected": 2.8751425743103027, "logps/chosen": -410.10662841796875, "logps/rejected": -473.4539489746094, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.3019919395446777, "rewards/margins": 15.205785751342773, "rewards/rejected": -12.903793334960938, "step": 5940 }, { "epoch": 2.02, "learning_rate": 1.8103990935414829e-07, "logits/chosen": 1.0351749658584595, "logits/rejected": 2.504169464111328, "logps/chosen": -318.38153076171875, "logps/rejected": -625.9144287109375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 1.910672903060913, "rewards/margins": 13.210138320922852, "rewards/rejected": -11.299463272094727, "step": 5950 }, { "epoch": 2.03, "learning_rate": 1.8041042427294474e-07, "logits/chosen": 1.23222815990448, "logits/rejected": 2.7579433917999268, "logps/chosen": -383.14959716796875, "logps/rejected": -414.42578125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.964050531387329, "rewards/margins": 16.147647857666016, "rewards/rejected": -14.183601379394531, "step": 5960 }, { "epoch": 2.03, "learning_rate": 1.7978093919174113e-07, "logits/chosen": 0.6595891714096069, "logits/rejected": 2.0642571449279785, "logps/chosen": -358.0821838378906, "logps/rejected": -768.7884521484375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.2330410480499268, "rewards/margins": 14.898645401000977, "rewards/rejected": -13.665603637695312, "step": 5970 }, { "epoch": 2.03, "learning_rate": 1.7915145411053755e-07, "logits/chosen": 0.7389932870864868, "logits/rejected": 1.9643020629882812, "logps/chosen": -365.7792663574219, "logps/rejected": -685.234619140625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.6057651042938232, "rewards/margins": 13.977490425109863, "rewards/rejected": -12.371726036071777, "step": 5980 }, { "epoch": 2.04, "learning_rate": 1.78521969029334e-07, "logits/chosen": 1.0801981687545776, "logits/rejected": 2.0700907707214355, "logps/chosen": -374.2262268066406, "logps/rejected": -765.7568969726562, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.5652799606323242, "rewards/margins": 14.630142211914062, "rewards/rejected": -13.064863204956055, "step": 5990 }, { "epoch": 2.04, "learning_rate": 1.7789248394813043e-07, "logits/chosen": 1.2110086679458618, "logits/rejected": 2.5913567543029785, "logps/chosen": -442.3192443847656, "logps/rejected": -512.6320190429688, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.126986265182495, "rewards/margins": 16.897563934326172, "rewards/rejected": -14.770574569702148, "step": 6000 }, { "epoch": 2.04, "eval_logits/chosen": 0.49600929021835327, "eval_logits/rejected": 2.540468215942383, "eval_logps/chosen": -370.60565185546875, "eval_logps/rejected": -636.5339965820312, "eval_loss": 0.005733425263315439, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.838423728942871, "eval_rewards/margins": 15.228790283203125, "eval_rewards/rejected": -13.390366554260254, "eval_runtime": 267.8718, "eval_samples_per_second": 35.465, "eval_steps_per_second": 1.109, "step": 6000 }, { "epoch": 2.04, "learning_rate": 1.7726299886692682e-07, "logits/chosen": 0.9540095329284668, "logits/rejected": 2.4917612075805664, "logps/chosen": -344.8664245605469, "logps/rejected": -551.3218994140625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.751217246055603, "rewards/margins": 13.663978576660156, "rewards/rejected": -11.912760734558105, "step": 6010 }, { "epoch": 2.05, "learning_rate": 1.7663351378572327e-07, "logits/chosen": 0.5778986215591431, "logits/rejected": 2.036611795425415, "logps/chosen": -324.55914306640625, "logps/rejected": -816.7264404296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.4912192821502686, "rewards/margins": 14.945416450500488, "rewards/rejected": -13.454197883605957, "step": 6020 }, { "epoch": 2.05, "learning_rate": 1.760040287045197e-07, "logits/chosen": 0.8682858347892761, "logits/rejected": 2.310798168182373, "logps/chosen": -511.52667236328125, "logps/rejected": -588.7957153320312, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.5532984733581543, "rewards/margins": 14.495959281921387, "rewards/rejected": -12.942662239074707, "step": 6030 }, { "epoch": 2.05, "learning_rate": 1.7537454362331612e-07, "logits/chosen": 1.4959813356399536, "logits/rejected": 2.511289119720459, "logps/chosen": -370.686767578125, "logps/rejected": -587.7115478515625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.48617684841156, "rewards/margins": 15.93817138671875, "rewards/rejected": -14.451992988586426, "step": 6040 }, { "epoch": 2.06, "learning_rate": 1.7474505854211254e-07, "logits/chosen": 1.0743986368179321, "logits/rejected": 2.4483957290649414, "logps/chosen": -504.873779296875, "logps/rejected": -595.0511474609375, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 1.765019178390503, "rewards/margins": 15.832463264465332, "rewards/rejected": -14.06744384765625, "step": 6050 }, { "epoch": 2.06, "learning_rate": 1.7411557346090896e-07, "logits/chosen": 1.304214596748352, "logits/rejected": 2.794304132461548, "logps/chosen": -357.6822204589844, "logps/rejected": -523.1092529296875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.2659730911254883, "rewards/margins": 15.043255805969238, "rewards/rejected": -13.777284622192383, "step": 6060 }, { "epoch": 2.06, "learning_rate": 1.7348608837970539e-07, "logits/chosen": 0.8975250124931335, "logits/rejected": 2.321444511413574, "logps/chosen": -377.947998046875, "logps/rejected": -687.5652465820312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.609063744544983, "rewards/margins": 16.110820770263672, "rewards/rejected": -14.501757621765137, "step": 6070 }, { "epoch": 2.07, "learning_rate": 1.7285660329850184e-07, "logits/chosen": 0.9074400663375854, "logits/rejected": 2.431576728820801, "logps/chosen": -313.75970458984375, "logps/rejected": -636.7111206054688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.8432601690292358, "rewards/margins": 17.6544246673584, "rewards/rejected": -15.811162948608398, "step": 6080 }, { "epoch": 2.07, "learning_rate": 1.7222711821729823e-07, "logits/chosen": 1.2238930463790894, "logits/rejected": 2.038734197616577, "logps/chosen": -304.6903381347656, "logps/rejected": -741.0390625, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5827157497406006, "rewards/margins": 14.606298446655273, "rewards/rejected": -13.023582458496094, "step": 6090 }, { "epoch": 2.07, "learning_rate": 1.7159763313609465e-07, "logits/chosen": 1.0854783058166504, "logits/rejected": 2.659773111343384, "logps/chosen": -392.56781005859375, "logps/rejected": -585.4364013671875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.6525754928588867, "rewards/margins": 17.150650024414062, "rewards/rejected": -15.498072624206543, "step": 6100 }, { "epoch": 2.07, "eval_logits/chosen": 0.44153305888175964, "eval_logits/rejected": 2.471759557723999, "eval_logps/chosen": -372.8395080566406, "eval_logps/rejected": -645.4886474609375, "eval_loss": 0.005611285101622343, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.6150366067886353, "eval_rewards/margins": 15.900870323181152, "eval_rewards/rejected": -14.285835266113281, "eval_runtime": 267.9573, "eval_samples_per_second": 35.453, "eval_steps_per_second": 1.108, "step": 6100 }, { "epoch": 2.08, "learning_rate": 1.709681480548911e-07, "logits/chosen": 0.7842377424240112, "logits/rejected": 2.7997422218322754, "logps/chosen": -299.3103942871094, "logps/rejected": -551.2344970703125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.5577961206436157, "rewards/margins": 16.915258407592773, "rewards/rejected": -15.357464790344238, "step": 6110 }, { "epoch": 2.08, "learning_rate": 1.7033866297368753e-07, "logits/chosen": 0.46014171838760376, "logits/rejected": 2.373453378677368, "logps/chosen": -398.37518310546875, "logps/rejected": -673.5498046875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.542041301727295, "rewards/margins": 17.008686065673828, "rewards/rejected": -14.466644287109375, "step": 6120 }, { "epoch": 2.08, "learning_rate": 1.6970917789248392e-07, "logits/chosen": 1.0954312086105347, "logits/rejected": 2.1974737644195557, "logps/chosen": -378.20831298828125, "logps/rejected": -785.3609008789062, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.8533457517623901, "rewards/margins": 16.01312255859375, "rewards/rejected": -15.15977668762207, "step": 6130 }, { "epoch": 2.09, "learning_rate": 1.6907969281128037e-07, "logits/chosen": 1.350953459739685, "logits/rejected": 3.0693161487579346, "logps/chosen": -473.63397216796875, "logps/rejected": -589.5533447265625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.7659844756126404, "rewards/margins": 14.892280578613281, "rewards/rejected": -14.126296997070312, "step": 6140 }, { "epoch": 2.09, "learning_rate": 1.684502077300768e-07, "logits/chosen": 0.7171489000320435, "logits/rejected": 2.2262418270111084, "logps/chosen": -406.3450012207031, "logps/rejected": -637.3649291992188, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.3386101722717285, "rewards/margins": 15.8351411819458, "rewards/rejected": -14.49653148651123, "step": 6150 }, { "epoch": 2.09, "learning_rate": 1.678207226488732e-07, "logits/chosen": 0.7702849507331848, "logits/rejected": 2.560065746307373, "logps/chosen": -375.14044189453125, "logps/rejected": -543.9547119140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.9042383432388306, "rewards/margins": 14.497563362121582, "rewards/rejected": -12.5933256149292, "step": 6160 }, { "epoch": 2.1, "learning_rate": 1.6719123756766964e-07, "logits/chosen": 0.8514900207519531, "logits/rejected": 1.924505591392517, "logps/chosen": -399.55474853515625, "logps/rejected": -754.6709594726562, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.312821626663208, "rewards/margins": 17.12882423400879, "rewards/rejected": -14.816003799438477, "step": 6170 }, { "epoch": 2.1, "learning_rate": 1.6656175248646606e-07, "logits/chosen": 1.1173584461212158, "logits/rejected": 2.6232028007507324, "logps/chosen": -346.4198913574219, "logps/rejected": -619.3607177734375, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.4097492694854736, "rewards/margins": 16.980304718017578, "rewards/rejected": -15.570554733276367, "step": 6180 }, { "epoch": 2.1, "learning_rate": 1.6593226740526249e-07, "logits/chosen": 0.5098138451576233, "logits/rejected": 3.2658188343048096, "logps/chosen": -288.40509033203125, "logps/rejected": -499.8128356933594, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.9584379196166992, "rewards/margins": 16.69192886352539, "rewards/rejected": -14.733491897583008, "step": 6190 }, { "epoch": 2.11, "learning_rate": 1.653027823240589e-07, "logits/chosen": 0.5227378010749817, "logits/rejected": 2.2451419830322266, "logps/chosen": -312.66033935546875, "logps/rejected": -674.2329711914062, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3434746265411377, "rewards/margins": 15.002885818481445, "rewards/rejected": -13.65941047668457, "step": 6200 }, { "epoch": 2.11, "eval_logits/chosen": 0.45763009786605835, "eval_logits/rejected": 2.4921276569366455, "eval_logps/chosen": -370.72222900390625, "eval_logps/rejected": -642.0590209960938, "eval_loss": 0.00525397714227438, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8267639875411987, "eval_rewards/margins": 15.769631385803223, "eval_rewards/rejected": -13.942869186401367, "eval_runtime": 266.8449, "eval_samples_per_second": 35.601, "eval_steps_per_second": 1.113, "step": 6200 }, { "epoch": 2.11, "learning_rate": 1.6467329724285533e-07, "logits/chosen": 0.7600902915000916, "logits/rejected": 2.226156234741211, "logps/chosen": -328.57281494140625, "logps/rejected": -729.6641235351562, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 1.5251786708831787, "rewards/margins": 14.667379379272461, "rewards/rejected": -13.14220142364502, "step": 6210 }, { "epoch": 2.11, "learning_rate": 1.6404381216165175e-07, "logits/chosen": 0.9222061038017273, "logits/rejected": 2.259605884552002, "logps/chosen": -317.9562072753906, "logps/rejected": -591.8802490234375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.16776704788208, "rewards/margins": 17.08612060546875, "rewards/rejected": -14.918353080749512, "step": 6220 }, { "epoch": 2.12, "learning_rate": 1.634143270804482e-07, "logits/chosen": 0.06134549900889397, "logits/rejected": 2.560732126235962, "logps/chosen": -352.6381530761719, "logps/rejected": -583.1201171875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.7368847131729126, "rewards/margins": 15.019735336303711, "rewards/rejected": -13.282852172851562, "step": 6230 }, { "epoch": 2.12, "learning_rate": 1.627848419992446e-07, "logits/chosen": 1.1012592315673828, "logits/rejected": 1.9502675533294678, "logps/chosen": -429.45751953125, "logps/rejected": -795.875244140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.9872167110443115, "rewards/margins": 16.9797420501709, "rewards/rejected": -14.992526054382324, "step": 6240 }, { "epoch": 2.12, "learning_rate": 1.6215535691804102e-07, "logits/chosen": 1.477541208267212, "logits/rejected": 2.3593482971191406, "logps/chosen": -375.87860107421875, "logps/rejected": -531.9874267578125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.8726123571395874, "rewards/margins": 14.83423137664795, "rewards/rejected": -12.96161937713623, "step": 6250 }, { "epoch": 2.13, "learning_rate": 1.6152587183683747e-07, "logits/chosen": 0.6256676316261292, "logits/rejected": 2.4760072231292725, "logps/chosen": -315.6788024902344, "logps/rejected": -704.2340698242188, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 1.721914291381836, "rewards/margins": 16.100740432739258, "rewards/rejected": -14.378824234008789, "step": 6260 }, { "epoch": 2.13, "learning_rate": 1.608963867556339e-07, "logits/chosen": 0.6289627552032471, "logits/rejected": 3.0689284801483154, "logps/chosen": -312.2763366699219, "logps/rejected": -525.0361938476562, "loss": 0.0071, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7409350872039795, "rewards/margins": 14.935763359069824, "rewards/rejected": -13.194829940795898, "step": 6270 }, { "epoch": 2.13, "learning_rate": 1.602669016744303e-07, "logits/chosen": 1.0302293300628662, "logits/rejected": 1.7600761651992798, "logps/chosen": -456.14898681640625, "logps/rejected": -906.0274658203125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.196922540664673, "rewards/margins": 15.269792556762695, "rewards/rejected": -13.072870254516602, "step": 6280 }, { "epoch": 2.14, "learning_rate": 1.5963741659322674e-07, "logits/chosen": 1.2055107355117798, "logits/rejected": 2.174511432647705, "logps/chosen": -343.38134765625, "logps/rejected": -685.1343994140625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.0983285903930664, "rewards/margins": 15.914003372192383, "rewards/rejected": -13.815675735473633, "step": 6290 }, { "epoch": 2.14, "learning_rate": 1.5900793151202316e-07, "logits/chosen": 0.9002892374992371, "logits/rejected": 2.5622847080230713, "logps/chosen": -310.75067138671875, "logps/rejected": -680.5726318359375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.9516767263412476, "rewards/margins": 14.449353218078613, "rewards/rejected": -12.497675895690918, "step": 6300 }, { "epoch": 2.14, "eval_logits/chosen": 0.470478355884552, "eval_logits/rejected": 2.507901906967163, "eval_logps/chosen": -369.54644775390625, "eval_logps/rejected": -640.7470092773438, "eval_loss": 0.005234770942479372, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.9443472623825073, "eval_rewards/margins": 15.756011962890625, "eval_rewards/rejected": -13.811664581298828, "eval_runtime": 268.1768, "eval_samples_per_second": 35.424, "eval_steps_per_second": 1.107, "step": 6300 }, { "epoch": 2.14, "learning_rate": 1.5837844643081959e-07, "logits/chosen": 0.47189411520957947, "logits/rejected": 2.19954776763916, "logps/chosen": -294.6665954589844, "logps/rejected": -810.7994995117188, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.0598158836364746, "rewards/margins": 16.0883846282959, "rewards/rejected": -14.028569221496582, "step": 6310 }, { "epoch": 2.15, "learning_rate": 1.57748961349616e-07, "logits/chosen": 0.6236362457275391, "logits/rejected": 2.104513168334961, "logps/chosen": -388.9352111816406, "logps/rejected": -850.1434326171875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.145151376724243, "rewards/margins": 16.31357192993164, "rewards/rejected": -14.168418884277344, "step": 6320 }, { "epoch": 2.15, "learning_rate": 1.5711947626841243e-07, "logits/chosen": 1.140409231185913, "logits/rejected": 2.624236822128296, "logps/chosen": -472.2935485839844, "logps/rejected": -509.56182861328125, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7926658391952515, "rewards/margins": 14.899357795715332, "rewards/rejected": -13.106691360473633, "step": 6330 }, { "epoch": 2.15, "learning_rate": 1.5648999118720885e-07, "logits/chosen": 1.207027554512024, "logits/rejected": 2.677910566329956, "logps/chosen": -399.31866455078125, "logps/rejected": -542.0875244140625, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.7966368198394775, "rewards/margins": 17.7134952545166, "rewards/rejected": -14.916857719421387, "step": 6340 }, { "epoch": 2.16, "learning_rate": 1.558605061060053e-07, "logits/chosen": 1.527717113494873, "logits/rejected": 2.574253559112549, "logps/chosen": -340.0840148925781, "logps/rejected": -657.302978515625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.499192953109741, "rewards/margins": 17.460071563720703, "rewards/rejected": -14.9608793258667, "step": 6350 }, { "epoch": 2.16, "learning_rate": 1.552310210248017e-07, "logits/chosen": 1.2479125261306763, "logits/rejected": 2.030150890350342, "logps/chosen": -445.0252990722656, "logps/rejected": -781.627197265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.9407612085342407, "rewards/margins": 16.6387996673584, "rewards/rejected": -14.698038101196289, "step": 6360 }, { "epoch": 2.17, "learning_rate": 1.5460153594359812e-07, "logits/chosen": 1.2388721704483032, "logits/rejected": 2.1668925285339355, "logps/chosen": -449.60107421875, "logps/rejected": -652.534423828125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.8397868871688843, "rewards/margins": 16.469173431396484, "rewards/rejected": -14.629384994506836, "step": 6370 }, { "epoch": 2.17, "learning_rate": 1.5397205086239457e-07, "logits/chosen": 0.750697135925293, "logits/rejected": 2.147480010986328, "logps/chosen": -381.23333740234375, "logps/rejected": -724.5010986328125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 2.091524124145508, "rewards/margins": 16.812833786010742, "rewards/rejected": -14.72131061553955, "step": 6380 }, { "epoch": 2.17, "learning_rate": 1.5334256578119097e-07, "logits/chosen": 1.033517599105835, "logits/rejected": 2.331897735595703, "logps/chosen": -393.97259521484375, "logps/rejected": -627.6727905273438, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 2.801288604736328, "rewards/margins": 15.796916007995605, "rewards/rejected": -12.995626449584961, "step": 6390 }, { "epoch": 2.18, "learning_rate": 1.527130806999874e-07, "logits/chosen": 1.0533195734024048, "logits/rejected": 2.7393133640289307, "logps/chosen": -460.53857421875, "logps/rejected": -512.9381103515625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.899174690246582, "rewards/margins": 15.319025039672852, "rewards/rejected": -13.419851303100586, "step": 6400 }, { "epoch": 2.18, "eval_logits/chosen": 0.4822961986064911, "eval_logits/rejected": 2.5138680934906006, "eval_logps/chosen": -368.5343017578125, "eval_logps/rejected": -640.0852661132812, "eval_loss": 0.005253070965409279, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 2.045560121536255, "eval_rewards/margins": 15.791056632995605, "eval_rewards/rejected": -13.745494842529297, "eval_runtime": 268.0763, "eval_samples_per_second": 35.438, "eval_steps_per_second": 1.108, "step": 6400 }, { "epoch": 2.18, "learning_rate": 1.5208359561878384e-07, "logits/chosen": 1.100904107093811, "logits/rejected": 2.4689745903015137, "logps/chosen": -373.98675537109375, "logps/rejected": -577.3536987304688, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.8111152648925781, "rewards/margins": 15.343942642211914, "rewards/rejected": -13.53282642364502, "step": 6410 }, { "epoch": 2.18, "learning_rate": 1.5145411053758026e-07, "logits/chosen": 1.3738195896148682, "logits/rejected": 2.6699304580688477, "logps/chosen": -375.9863586425781, "logps/rejected": -624.0731201171875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.6937096118927002, "rewards/margins": 17.86076545715332, "rewards/rejected": -16.167057037353516, "step": 6420 }, { "epoch": 2.19, "learning_rate": 1.5082462545637666e-07, "logits/chosen": 0.3751014173030853, "logits/rejected": 2.8960976600646973, "logps/chosen": -321.5099792480469, "logps/rejected": -479.81268310546875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.7421153783798218, "rewards/margins": 16.647037506103516, "rewards/rejected": -14.904919624328613, "step": 6430 }, { "epoch": 2.19, "learning_rate": 1.501951403751731e-07, "logits/chosen": 1.0028069019317627, "logits/rejected": 2.3100218772888184, "logps/chosen": -312.1414794921875, "logps/rejected": -645.2931518554688, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.828717589378357, "rewards/margins": 17.236507415771484, "rewards/rejected": -15.407788276672363, "step": 6440 }, { "epoch": 2.19, "learning_rate": 1.4956565529396953e-07, "logits/chosen": 1.4677902460098267, "logits/rejected": 2.6699881553649902, "logps/chosen": -346.54742431640625, "logps/rejected": -606.3338623046875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.090097665786743, "rewards/margins": 18.220773696899414, "rewards/rejected": -16.13067626953125, "step": 6450 }, { "epoch": 2.2, "learning_rate": 1.4893617021276595e-07, "logits/chosen": 1.2526359558105469, "logits/rejected": 2.1189627647399902, "logps/chosen": -328.7201232910156, "logps/rejected": -692.3485107421875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.1243045330047607, "rewards/margins": 17.358163833618164, "rewards/rejected": -15.233858108520508, "step": 6460 }, { "epoch": 2.2, "learning_rate": 1.4830668513156238e-07, "logits/chosen": 0.1407167762517929, "logits/rejected": 3.113206148147583, "logps/chosen": -287.0426330566406, "logps/rejected": -430.408935546875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.6383129358291626, "rewards/margins": 17.28408432006836, "rewards/rejected": -15.645770072937012, "step": 6470 }, { "epoch": 2.2, "learning_rate": 1.476772000503588e-07, "logits/chosen": 0.8808428645133972, "logits/rejected": 2.396165132522583, "logps/chosen": -320.5928649902344, "logps/rejected": -543.3441772460938, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.2573113441467285, "rewards/margins": 16.72433853149414, "rewards/rejected": -14.467025756835938, "step": 6480 }, { "epoch": 2.21, "learning_rate": 1.4704771496915522e-07, "logits/chosen": 1.4290237426757812, "logits/rejected": 2.7337679862976074, "logps/chosen": -343.0905456542969, "logps/rejected": -558.6626586914062, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.977564811706543, "rewards/margins": 18.43790054321289, "rewards/rejected": -16.4603328704834, "step": 6490 }, { "epoch": 2.21, "learning_rate": 1.4641822988795167e-07, "logits/chosen": 1.210451364517212, "logits/rejected": 2.4949653148651123, "logps/chosen": -313.0892639160156, "logps/rejected": -633.3006591796875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.7542457580566406, "rewards/margins": 14.538752555847168, "rewards/rejected": -12.784505844116211, "step": 6500 }, { "epoch": 2.21, "eval_logits/chosen": 0.48234444856643677, "eval_logits/rejected": 2.513512372970581, "eval_logps/chosen": -368.9617919921875, "eval_logps/rejected": -639.1259765625, "eval_loss": 0.005004484672099352, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 2.002810478210449, "eval_rewards/margins": 15.652373313903809, "eval_rewards/rejected": -13.649561882019043, "eval_runtime": 268.1176, "eval_samples_per_second": 35.432, "eval_steps_per_second": 1.108, "step": 6500 }, { "epoch": 2.21, "learning_rate": 1.4578874480674807e-07, "logits/chosen": 0.5491858124732971, "logits/rejected": 2.644583225250244, "logps/chosen": -417.05865478515625, "logps/rejected": -580.2625732421875, "loss": 0.0052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1090285778045654, "rewards/margins": 15.84637451171875, "rewards/rejected": -13.737344741821289, "step": 6510 }, { "epoch": 2.22, "learning_rate": 1.451592597255445e-07, "logits/chosen": 1.1235544681549072, "logits/rejected": 2.5771431922912598, "logps/chosen": -411.65399169921875, "logps/rejected": -593.0504760742188, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.7683136463165283, "rewards/margins": 15.734085083007812, "rewards/rejected": -13.96576976776123, "step": 6520 }, { "epoch": 2.22, "learning_rate": 1.4452977464434094e-07, "logits/chosen": 0.9458175897598267, "logits/rejected": 2.292898654937744, "logps/chosen": -356.336669921875, "logps/rejected": -608.5050048828125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.6671857833862305, "rewards/margins": 15.176393508911133, "rewards/rejected": -13.509206771850586, "step": 6530 }, { "epoch": 2.22, "learning_rate": 1.4390028956313736e-07, "logits/chosen": 0.9157311320304871, "logits/rejected": 2.6453652381896973, "logps/chosen": -327.6547546386719, "logps/rejected": -647.6570434570312, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.3297417163848877, "rewards/margins": 17.928281784057617, "rewards/rejected": -15.598539352416992, "step": 6540 }, { "epoch": 2.23, "learning_rate": 1.4327080448193376e-07, "logits/chosen": 0.9368341565132141, "logits/rejected": 2.830165386199951, "logps/chosen": -491.01385498046875, "logps/rejected": -593.498291015625, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.416783094406128, "rewards/margins": 17.794885635375977, "rewards/rejected": -16.378103256225586, "step": 6550 }, { "epoch": 2.23, "learning_rate": 1.426413194007302e-07, "logits/chosen": 0.7394753098487854, "logits/rejected": 2.6355130672454834, "logps/chosen": -335.5205078125, "logps/rejected": -617.947265625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.2752904891967773, "rewards/margins": 17.116397857666016, "rewards/rejected": -14.841107368469238, "step": 6560 }, { "epoch": 2.23, "learning_rate": 1.4201183431952663e-07, "logits/chosen": 0.645321249961853, "logits/rejected": 2.001403331756592, "logps/chosen": -383.0551452636719, "logps/rejected": -834.9183349609375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 2.033926010131836, "rewards/margins": 16.158061981201172, "rewards/rejected": -14.12413501739502, "step": 6570 }, { "epoch": 2.24, "learning_rate": 1.4138234923832303e-07, "logits/chosen": 1.1049634218215942, "logits/rejected": 2.1733741760253906, "logps/chosen": -383.75164794921875, "logps/rejected": -690.3191528320312, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 2.4952940940856934, "rewards/margins": 17.292388916015625, "rewards/rejected": -14.797096252441406, "step": 6580 }, { "epoch": 2.24, "learning_rate": 1.4075286415711948e-07, "logits/chosen": 0.8609091639518738, "logits/rejected": 2.836751937866211, "logps/chosen": -399.3436279296875, "logps/rejected": -577.36474609375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.5047794580459595, "rewards/margins": 16.921968460083008, "rewards/rejected": -15.417187690734863, "step": 6590 }, { "epoch": 2.24, "learning_rate": 1.401233790759159e-07, "logits/chosen": 0.3242916464805603, "logits/rejected": 2.3490653038024902, "logps/chosen": -412.41607666015625, "logps/rejected": -621.5950927734375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.4823118448257446, "rewards/margins": 14.215707778930664, "rewards/rejected": -12.733394622802734, "step": 6600 }, { "epoch": 2.24, "eval_logits/chosen": 0.4459000825881958, "eval_logits/rejected": 2.4828238487243652, "eval_logps/chosen": -370.1336669921875, "eval_logps/rejected": -640.5563354492188, "eval_loss": 0.004967730492353439, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.8856240510940552, "eval_rewards/margins": 15.678227424621582, "eval_rewards/rejected": -13.792603492736816, "eval_runtime": 268.4797, "eval_samples_per_second": 35.384, "eval_steps_per_second": 1.106, "step": 6600 }, { "epoch": 2.25, "learning_rate": 1.3949389399471232e-07, "logits/chosen": 0.537019670009613, "logits/rejected": 2.592452049255371, "logps/chosen": -317.28826904296875, "logps/rejected": -648.7763671875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.9283174276351929, "rewards/margins": 16.98746681213379, "rewards/rejected": -15.059147834777832, "step": 6610 }, { "epoch": 2.25, "learning_rate": 1.3886440891350874e-07, "logits/chosen": 1.3698558807373047, "logits/rejected": 3.0117099285125732, "logps/chosen": -396.42108154296875, "logps/rejected": -505.86810302734375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.7210032939910889, "rewards/margins": 17.65418815612793, "rewards/rejected": -15.933184623718262, "step": 6620 }, { "epoch": 2.25, "learning_rate": 1.3823492383230517e-07, "logits/chosen": 0.719806432723999, "logits/rejected": 2.455599546432495, "logps/chosen": -329.3707275390625, "logps/rejected": -708.4951782226562, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 2.1283833980560303, "rewards/margins": 17.73508071899414, "rewards/rejected": -15.606698989868164, "step": 6630 }, { "epoch": 2.26, "learning_rate": 1.376054387511016e-07, "logits/chosen": 1.4004559516906738, "logits/rejected": 2.5099542140960693, "logps/chosen": -344.36944580078125, "logps/rejected": -640.68017578125, "loss": 0.0031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6813615560531616, "rewards/margins": 16.262908935546875, "rewards/rejected": -14.58154582977295, "step": 6640 }, { "epoch": 2.26, "learning_rate": 1.36975953669898e-07, "logits/chosen": 1.0476455688476562, "logits/rejected": 2.747251033782959, "logps/chosen": -369.5597229003906, "logps/rejected": -523.4691162109375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.7639362812042236, "rewards/margins": 15.755114555358887, "rewards/rejected": -13.991180419921875, "step": 6650 }, { "epoch": 2.26, "learning_rate": 1.3634646858869444e-07, "logits/chosen": 1.08604097366333, "logits/rejected": 2.80775785446167, "logps/chosen": -343.06317138671875, "logps/rejected": -491.69091796875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.5290716886520386, "rewards/margins": 15.872842788696289, "rewards/rejected": -14.343768119812012, "step": 6660 }, { "epoch": 2.27, "learning_rate": 1.3571698350749086e-07, "logits/chosen": 0.8772619366645813, "logits/rejected": 2.6861140727996826, "logps/chosen": -364.5007629394531, "logps/rejected": -608.0274658203125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.6243185997009277, "rewards/margins": 15.49360179901123, "rewards/rejected": -12.869282722473145, "step": 6670 }, { "epoch": 2.27, "learning_rate": 1.3508749842628728e-07, "logits/chosen": 0.7491368055343628, "logits/rejected": 2.350585460662842, "logps/chosen": -357.9722900390625, "logps/rejected": -625.5552978515625, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.264483690261841, "rewards/margins": 14.255470275878906, "rewards/rejected": -11.990984916687012, "step": 6680 }, { "epoch": 2.27, "learning_rate": 1.3445801334508373e-07, "logits/chosen": 1.0448821783065796, "logits/rejected": 2.7055163383483887, "logps/chosen": -420.7792053222656, "logps/rejected": -438.79913330078125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.5239304304122925, "rewards/margins": 17.5579776763916, "rewards/rejected": -16.034046173095703, "step": 6690 }, { "epoch": 2.28, "learning_rate": 1.3382852826388013e-07, "logits/chosen": 1.200180172920227, "logits/rejected": 2.6250221729278564, "logps/chosen": -337.2734069824219, "logps/rejected": -551.2679443359375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.7806466817855835, "rewards/margins": 16.01031494140625, "rewards/rejected": -14.229669570922852, "step": 6700 }, { "epoch": 2.28, "eval_logits/chosen": 0.4470595121383667, "eval_logits/rejected": 2.469754457473755, "eval_logps/chosen": -369.5677795410156, "eval_logps/rejected": -643.3903198242188, "eval_loss": 0.00491339759901166, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.9422091245651245, "eval_rewards/margins": 16.018211364746094, "eval_rewards/rejected": -14.07600212097168, "eval_runtime": 268.6508, "eval_samples_per_second": 35.362, "eval_steps_per_second": 1.106, "step": 6700 }, { "epoch": 2.28, "learning_rate": 1.3319904318267655e-07, "logits/chosen": 1.0116021633148193, "logits/rejected": 2.23539662361145, "logps/chosen": -368.14630126953125, "logps/rejected": -677.6047973632812, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.9957377910614014, "rewards/margins": 15.763692855834961, "rewards/rejected": -13.76795482635498, "step": 6710 }, { "epoch": 2.28, "learning_rate": 1.32569558101473e-07, "logits/chosen": 1.3611485958099365, "logits/rejected": 2.6158175468444824, "logps/chosen": -343.1318359375, "logps/rejected": -639.2994384765625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.975050926208496, "rewards/margins": 16.714651107788086, "rewards/rejected": -14.739601135253906, "step": 6720 }, { "epoch": 2.29, "learning_rate": 1.3194007302026942e-07, "logits/chosen": 1.00726318359375, "logits/rejected": 2.7914206981658936, "logps/chosen": -364.0087585449219, "logps/rejected": -535.8576049804688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.7007040977478027, "rewards/margins": 17.216846466064453, "rewards/rejected": -14.516143798828125, "step": 6730 }, { "epoch": 2.29, "learning_rate": 1.3131058793906582e-07, "logits/chosen": 1.64545476436615, "logits/rejected": 2.4137086868286133, "logps/chosen": -433.92205810546875, "logps/rejected": -574.2499389648438, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.9041860103607178, "rewards/margins": 14.816378593444824, "rewards/rejected": -12.912193298339844, "step": 6740 }, { "epoch": 2.29, "learning_rate": 1.3068110285786227e-07, "logits/chosen": 1.1354012489318848, "logits/rejected": 2.3269338607788086, "logps/chosen": -381.748046875, "logps/rejected": -552.319580078125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.7223217487335205, "rewards/margins": 15.645159721374512, "rewards/rejected": -13.922839164733887, "step": 6750 }, { "epoch": 2.3, "learning_rate": 1.300516177766587e-07, "logits/chosen": 0.7011697292327881, "logits/rejected": 2.4489026069641113, "logps/chosen": -365.9461669921875, "logps/rejected": -589.5831298828125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.280221939086914, "rewards/margins": 16.866567611694336, "rewards/rejected": -14.586347579956055, "step": 6760 }, { "epoch": 2.3, "learning_rate": 1.294221326954551e-07, "logits/chosen": 0.6479487419128418, "logits/rejected": 2.078508138656616, "logps/chosen": -388.99041748046875, "logps/rejected": -782.4954833984375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.5865205526351929, "rewards/margins": 15.948519706726074, "rewards/rejected": -14.361997604370117, "step": 6770 }, { "epoch": 2.3, "learning_rate": 1.2879264761425154e-07, "logits/chosen": 1.262549638748169, "logits/rejected": 2.834839344024658, "logps/chosen": -341.15655517578125, "logps/rejected": -506.4090270996094, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 2.049922466278076, "rewards/margins": 17.553844451904297, "rewards/rejected": -15.503921508789062, "step": 6780 }, { "epoch": 2.31, "learning_rate": 1.2816316253304796e-07, "logits/chosen": 0.232079416513443, "logits/rejected": 2.5545456409454346, "logps/chosen": -283.21435546875, "logps/rejected": -561.6546630859375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.9523422718048096, "rewards/margins": 15.473034858703613, "rewards/rejected": -13.520692825317383, "step": 6790 }, { "epoch": 2.31, "learning_rate": 1.2753367745184438e-07, "logits/chosen": 1.0746533870697021, "logits/rejected": 2.7360405921936035, "logps/chosen": -378.8026123046875, "logps/rejected": -479.0779724121094, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9980930089950562, "rewards/margins": 16.039087295532227, "rewards/rejected": -14.040995597839355, "step": 6800 }, { "epoch": 2.31, "eval_logits/chosen": 0.4561528265476227, "eval_logits/rejected": 2.4646434783935547, "eval_logps/chosen": -370.35699462890625, "eval_logps/rejected": -647.2789916992188, "eval_loss": 0.0048464760184288025, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.863288402557373, "eval_rewards/margins": 16.328153610229492, "eval_rewards/rejected": -14.464864730834961, "eval_runtime": 268.3678, "eval_samples_per_second": 35.399, "eval_steps_per_second": 1.107, "step": 6800 }, { "epoch": 2.31, "learning_rate": 1.2690419237064083e-07, "logits/chosen": 0.6407972574234009, "logits/rejected": 2.2658538818359375, "logps/chosen": -491.4794006347656, "logps/rejected": -666.0134887695312, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.881895661354065, "rewards/margins": 15.709803581237793, "rewards/rejected": -13.827908515930176, "step": 6810 }, { "epoch": 2.32, "learning_rate": 1.2627470728943723e-07, "logits/chosen": 0.7811010479927063, "logits/rejected": 2.0729830265045166, "logps/chosen": -472.3502502441406, "logps/rejected": -726.8089599609375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 2.061863899230957, "rewards/margins": 17.083171844482422, "rewards/rejected": -15.021306991577148, "step": 6820 }, { "epoch": 2.32, "learning_rate": 1.2564522220823365e-07, "logits/chosen": 0.7845171689987183, "logits/rejected": 2.2855148315429688, "logps/chosen": -386.7796325683594, "logps/rejected": -688.0648803710938, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.226625919342041, "rewards/margins": 16.80321502685547, "rewards/rejected": -14.57658863067627, "step": 6830 }, { "epoch": 2.32, "learning_rate": 1.250157371270301e-07, "logits/chosen": 0.9920900464057922, "logits/rejected": 2.7270851135253906, "logps/chosen": -387.8033142089844, "logps/rejected": -535.3179931640625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0011491775512695, "rewards/margins": 17.804561614990234, "rewards/rejected": -15.803411483764648, "step": 6840 }, { "epoch": 2.33, "learning_rate": 1.243862520458265e-07, "logits/chosen": 0.44733184576034546, "logits/rejected": 2.3163790702819824, "logps/chosen": -367.1181335449219, "logps/rejected": -533.33203125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.8264821767807007, "rewards/margins": 17.479494094848633, "rewards/rejected": -15.6530122756958, "step": 6850 }, { "epoch": 2.33, "learning_rate": 1.2375676696462294e-07, "logits/chosen": 1.0215935707092285, "logits/rejected": 2.1925833225250244, "logps/chosen": -402.8360595703125, "logps/rejected": -763.5743408203125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.4659521579742432, "rewards/margins": 15.616668701171875, "rewards/rejected": -14.150716781616211, "step": 6860 }, { "epoch": 2.34, "learning_rate": 1.2312728188341934e-07, "logits/chosen": 1.325761079788208, "logits/rejected": 2.252635955810547, "logps/chosen": -348.18145751953125, "logps/rejected": -673.8773193359375, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 2.1558871269226074, "rewards/margins": 18.09050941467285, "rewards/rejected": -15.93462085723877, "step": 6870 }, { "epoch": 2.34, "learning_rate": 1.224977968022158e-07, "logits/chosen": 0.948559582233429, "logits/rejected": 2.580723762512207, "logps/chosen": -438.7425231933594, "logps/rejected": -598.1914672851562, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.4692356586456299, "rewards/margins": 15.825752258300781, "rewards/rejected": -14.356515884399414, "step": 6880 }, { "epoch": 2.34, "learning_rate": 1.218683117210122e-07, "logits/chosen": 0.8626810908317566, "logits/rejected": 2.2737300395965576, "logps/chosen": -320.7707214355469, "logps/rejected": -713.5247192382812, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 2.643897771835327, "rewards/margins": 17.64309310913086, "rewards/rejected": -14.999194145202637, "step": 6890 }, { "epoch": 2.35, "learning_rate": 1.2123882663980863e-07, "logits/chosen": 1.4227879047393799, "logits/rejected": 2.165592670440674, "logps/chosen": -405.59130859375, "logps/rejected": -746.8988647460938, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 2.1885743141174316, "rewards/margins": 14.486371994018555, "rewards/rejected": -12.297799110412598, "step": 6900 }, { "epoch": 2.35, "eval_logits/chosen": 0.4292427599430084, "eval_logits/rejected": 2.427537202835083, "eval_logps/chosen": -370.9051208496094, "eval_logps/rejected": -651.1427001953125, "eval_loss": 0.004910214804112911, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.8084813356399536, "eval_rewards/margins": 16.659719467163086, "eval_rewards/rejected": -14.851237297058105, "eval_runtime": 267.9814, "eval_samples_per_second": 35.45, "eval_steps_per_second": 1.108, "step": 6900 }, { "epoch": 2.35, "learning_rate": 1.2060934155860506e-07, "logits/chosen": 1.6376726627349854, "logits/rejected": 2.818448543548584, "logps/chosen": -344.49267578125, "logps/rejected": -584.34130859375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.967023253440857, "rewards/margins": 17.524118423461914, "rewards/rejected": -15.557093620300293, "step": 6910 }, { "epoch": 2.35, "learning_rate": 1.1997985647740148e-07, "logits/chosen": 0.698118269443512, "logits/rejected": 2.1906166076660156, "logps/chosen": -346.07928466796875, "logps/rejected": -632.543212890625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.3526246547698975, "rewards/margins": 16.90323829650879, "rewards/rejected": -14.55061149597168, "step": 6920 }, { "epoch": 2.36, "learning_rate": 1.193503713961979e-07, "logits/chosen": 0.7867122888565063, "logits/rejected": 1.766857385635376, "logps/chosen": -498.2786560058594, "logps/rejected": -871.0455322265625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.9483602046966553, "rewards/margins": 16.372447967529297, "rewards/rejected": -14.42408561706543, "step": 6930 }, { "epoch": 2.36, "learning_rate": 1.1872088631499433e-07, "logits/chosen": 0.10261068493127823, "logits/rejected": 2.4081060886383057, "logps/chosen": -343.291748046875, "logps/rejected": -690.6204833984375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.3851890563964844, "rewards/margins": 16.051509857177734, "rewards/rejected": -14.666322708129883, "step": 6940 }, { "epoch": 2.36, "learning_rate": 1.1809140123379076e-07, "logits/chosen": 0.6996985673904419, "logits/rejected": 1.9651216268539429, "logps/chosen": -345.29193115234375, "logps/rejected": -695.8432006835938, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.2995944023132324, "rewards/margins": 15.940165519714355, "rewards/rejected": -13.640571594238281, "step": 6950 }, { "epoch": 2.37, "learning_rate": 1.1746191615258717e-07, "logits/chosen": 1.2867244482040405, "logits/rejected": 2.4483482837677, "logps/chosen": -334.0912170410156, "logps/rejected": -690.3890991210938, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.22255802154541, "rewards/margins": 19.06466293334961, "rewards/rejected": -16.842105865478516, "step": 6960 }, { "epoch": 2.37, "learning_rate": 1.1683243107138361e-07, "logits/chosen": 1.11300528049469, "logits/rejected": 2.5207343101501465, "logps/chosen": -405.8424377441406, "logps/rejected": -613.1907958984375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 2.195126533508301, "rewards/margins": 18.852861404418945, "rewards/rejected": -16.65773582458496, "step": 6970 }, { "epoch": 2.37, "learning_rate": 1.1620294599018003e-07, "logits/chosen": 1.2959927320480347, "logits/rejected": 2.5358402729034424, "logps/chosen": -329.21905517578125, "logps/rejected": -549.5939331054688, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 1.7862987518310547, "rewards/margins": 15.796490669250488, "rewards/rejected": -14.01019287109375, "step": 6980 }, { "epoch": 2.38, "learning_rate": 1.1557346090897645e-07, "logits/chosen": 0.9604352712631226, "logits/rejected": 2.0390219688415527, "logps/chosen": -319.5994873046875, "logps/rejected": -810.9552001953125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9633973836898804, "rewards/margins": 17.46154022216797, "rewards/rejected": -15.498143196105957, "step": 6990 }, { "epoch": 2.38, "learning_rate": 1.1494397582777288e-07, "logits/chosen": 0.10136137902736664, "logits/rejected": 2.167189836502075, "logps/chosen": -305.06890869140625, "logps/rejected": -681.5853881835938, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.6132564544677734, "rewards/margins": 15.553873062133789, "rewards/rejected": -13.9406156539917, "step": 7000 }, { "epoch": 2.38, "eval_logits/chosen": 0.44245606660842896, "eval_logits/rejected": 2.438732624053955, "eval_logps/chosen": -369.9842224121094, "eval_logps/rejected": -648.9703369140625, "eval_loss": 0.004800071474164724, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.9005694389343262, "eval_rewards/margins": 16.534570693969727, "eval_rewards/rejected": -14.634000778198242, "eval_runtime": 267.9901, "eval_samples_per_second": 35.449, "eval_steps_per_second": 1.108, "step": 7000 }, { "epoch": 2.38, "learning_rate": 1.1431449074656931e-07, "logits/chosen": 1.2209926843643188, "logits/rejected": 2.4831814765930176, "logps/chosen": -508.8880920410156, "logps/rejected": -517.89794921875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.8793928623199463, "rewards/margins": 17.297401428222656, "rewards/rejected": -15.418006896972656, "step": 7010 }, { "epoch": 2.39, "learning_rate": 1.1368500566536572e-07, "logits/chosen": 0.8192776441574097, "logits/rejected": 1.7947208881378174, "logps/chosen": -453.9606018066406, "logps/rejected": -689.9783325195312, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 2.200059175491333, "rewards/margins": 15.344413757324219, "rewards/rejected": -13.144353866577148, "step": 7020 }, { "epoch": 2.39, "learning_rate": 1.1305552058416214e-07, "logits/chosen": 0.6392263174057007, "logits/rejected": 2.1331827640533447, "logps/chosen": -336.352294921875, "logps/rejected": -658.5803833007812, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.2526023387908936, "rewards/margins": 18.084606170654297, "rewards/rejected": -15.832002639770508, "step": 7030 }, { "epoch": 2.39, "learning_rate": 1.1242603550295858e-07, "logits/chosen": 0.29846692085266113, "logits/rejected": 2.1883413791656494, "logps/chosen": -304.7272033691406, "logps/rejected": -715.2191772460938, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.2608134746551514, "rewards/margins": 18.64798927307129, "rewards/rejected": -16.387174606323242, "step": 7040 }, { "epoch": 2.4, "learning_rate": 1.1179655042175499e-07, "logits/chosen": 1.5515304803848267, "logits/rejected": 2.8250606060028076, "logps/chosen": -342.275634765625, "logps/rejected": -492.2900390625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.4785754680633545, "rewards/margins": 15.609323501586914, "rewards/rejected": -14.13074779510498, "step": 7050 }, { "epoch": 2.4, "learning_rate": 1.1116706534055143e-07, "logits/chosen": 0.6783769726753235, "logits/rejected": 2.544703722000122, "logps/chosen": -383.35443115234375, "logps/rejected": -615.2085571289062, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 2.3915653228759766, "rewards/margins": 19.474990844726562, "rewards/rejected": -17.083423614501953, "step": 7060 }, { "epoch": 2.4, "learning_rate": 1.1053758025934785e-07, "logits/chosen": 0.6153481006622314, "logits/rejected": 1.9820592403411865, "logps/chosen": -319.31768798828125, "logps/rejected": -786.171875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 1.718113899230957, "rewards/margins": 15.51591682434082, "rewards/rejected": -13.797802925109863, "step": 7070 }, { "epoch": 2.41, "learning_rate": 1.0990809517814427e-07, "logits/chosen": 1.1642529964447021, "logits/rejected": 2.6903514862060547, "logps/chosen": -388.19854736328125, "logps/rejected": -527.0130004882812, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.783219337463379, "rewards/margins": 16.403539657592773, "rewards/rejected": -14.620318412780762, "step": 7080 }, { "epoch": 2.41, "learning_rate": 1.092786100969407e-07, "logits/chosen": 1.280145287513733, "logits/rejected": 2.317441463470459, "logps/chosen": -338.6867370605469, "logps/rejected": -638.913330078125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.0096359252929688, "rewards/margins": 15.663032531738281, "rewards/rejected": -13.653398513793945, "step": 7090 }, { "epoch": 2.41, "learning_rate": 1.0864912501573713e-07, "logits/chosen": 0.9446396827697754, "logits/rejected": 2.444380283355713, "logps/chosen": -313.5970458984375, "logps/rejected": -557.501953125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.4033139944076538, "rewards/margins": 12.766210556030273, "rewards/rejected": -11.362897872924805, "step": 7100 }, { "epoch": 2.41, "eval_logits/chosen": 0.42960870265960693, "eval_logits/rejected": 2.4152705669403076, "eval_logps/chosen": -370.7745666503906, "eval_logps/rejected": -653.0066528320312, "eval_loss": 0.00468032481148839, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8215343952178955, "eval_rewards/margins": 16.85917091369629, "eval_rewards/rejected": -15.037636756896973, "eval_runtime": 267.6607, "eval_samples_per_second": 35.493, "eval_steps_per_second": 1.11, "step": 7100 }, { "epoch": 2.42, "learning_rate": 1.0801963993453354e-07, "logits/chosen": 0.5551157593727112, "logits/rejected": 2.690716028213501, "logps/chosen": -337.18963623046875, "logps/rejected": -474.79754638671875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.5211597681045532, "rewards/margins": 16.70118522644043, "rewards/rejected": -15.180026054382324, "step": 7110 }, { "epoch": 2.42, "learning_rate": 1.0739015485332998e-07, "logits/chosen": 0.7350689768791199, "logits/rejected": 2.314960479736328, "logps/chosen": -413.591796875, "logps/rejected": -702.0711059570312, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 1.877844214439392, "rewards/margins": 15.394256591796875, "rewards/rejected": -13.516412734985352, "step": 7120 }, { "epoch": 2.42, "learning_rate": 1.067606697721264e-07, "logits/chosen": 1.090998888015747, "logits/rejected": 2.3243186473846436, "logps/chosen": -394.4823913574219, "logps/rejected": -571.0720825195312, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.2767682075500488, "rewards/margins": 15.756521224975586, "rewards/rejected": -14.479754447937012, "step": 7130 }, { "epoch": 2.43, "learning_rate": 1.0613118469092282e-07, "logits/chosen": 0.48929041624069214, "logits/rejected": 1.5767260789871216, "logps/chosen": -382.69842529296875, "logps/rejected": -966.6769409179688, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.252427577972412, "rewards/margins": 15.286422729492188, "rewards/rejected": -13.033994674682617, "step": 7140 }, { "epoch": 2.43, "learning_rate": 1.0550169960971924e-07, "logits/chosen": 0.6126815676689148, "logits/rejected": 2.0698182582855225, "logps/chosen": -432.7958068847656, "logps/rejected": -761.7425537109375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.890511155128479, "rewards/margins": 16.206470489501953, "rewards/rejected": -14.315958023071289, "step": 7150 }, { "epoch": 2.43, "learning_rate": 1.0487221452851568e-07, "logits/chosen": 0.43376216292381287, "logits/rejected": 2.1499080657958984, "logps/chosen": -360.62725830078125, "logps/rejected": -782.2716674804688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.4807064533233643, "rewards/margins": 14.774923324584961, "rewards/rejected": -13.294217109680176, "step": 7160 }, { "epoch": 2.44, "learning_rate": 1.0424272944731209e-07, "logits/chosen": 1.1980317831039429, "logits/rejected": 2.1737492084503174, "logps/chosen": -398.87933349609375, "logps/rejected": -765.1239013671875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.9391682147979736, "rewards/margins": 17.53460121154785, "rewards/rejected": -15.595433235168457, "step": 7170 }, { "epoch": 2.44, "learning_rate": 1.0361324436610853e-07, "logits/chosen": 1.1051430702209473, "logits/rejected": 2.751847743988037, "logps/chosen": -397.548095703125, "logps/rejected": -588.2581787109375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.978877067565918, "rewards/margins": 19.947650909423828, "rewards/rejected": -17.968774795532227, "step": 7180 }, { "epoch": 2.44, "learning_rate": 1.0298375928490494e-07, "logits/chosen": 1.0062482357025146, "logits/rejected": 2.347341299057007, "logps/chosen": -315.4201965332031, "logps/rejected": -673.93896484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7003400325775146, "rewards/margins": 18.57695770263672, "rewards/rejected": -16.876617431640625, "step": 7190 }, { "epoch": 2.45, "learning_rate": 1.0235427420370137e-07, "logits/chosen": 1.1636916399002075, "logits/rejected": 2.218010425567627, "logps/chosen": -342.746826171875, "logps/rejected": -663.3795166015625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.88577139377594, "rewards/margins": 18.186866760253906, "rewards/rejected": -16.301095962524414, "step": 7200 }, { "epoch": 2.45, "eval_logits/chosen": 0.42482060194015503, "eval_logits/rejected": 2.4152801036834717, "eval_logps/chosen": -370.7949523925781, "eval_logps/rejected": -652.7421875, "eval_loss": 0.004596411250531673, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8194925785064697, "eval_rewards/margins": 16.83067512512207, "eval_rewards/rejected": -15.01118278503418, "eval_runtime": 268.6365, "eval_samples_per_second": 35.364, "eval_steps_per_second": 1.106, "step": 7200 }, { "epoch": 2.45, "learning_rate": 1.017247891224978e-07, "logits/chosen": 0.6930907964706421, "logits/rejected": 3.015477418899536, "logps/chosen": -323.17840576171875, "logps/rejected": -471.4164123535156, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.021176338195801, "rewards/margins": 17.877004623413086, "rewards/rejected": -15.855827331542969, "step": 7210 }, { "epoch": 2.45, "learning_rate": 1.0109530404129422e-07, "logits/chosen": 1.0820677280426025, "logits/rejected": 2.5256545543670654, "logps/chosen": -402.204833984375, "logps/rejected": -561.5673217773438, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.3005762100219727, "rewards/margins": 18.47879409790039, "rewards/rejected": -17.178218841552734, "step": 7220 }, { "epoch": 2.46, "learning_rate": 1.0046581896009064e-07, "logits/chosen": 1.2416799068450928, "logits/rejected": 2.559627056121826, "logps/chosen": -369.97821044921875, "logps/rejected": -608.08251953125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.5119627714157104, "rewards/margins": 18.328359603881836, "rewards/rejected": -16.81639862060547, "step": 7230 }, { "epoch": 2.46, "learning_rate": 9.983633387888708e-08, "logits/chosen": 0.5990116000175476, "logits/rejected": 1.6854044198989868, "logps/chosen": -371.9095764160156, "logps/rejected": -837.9762573242188, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.8408832550048828, "rewards/margins": 17.786670684814453, "rewards/rejected": -15.945785522460938, "step": 7240 }, { "epoch": 2.46, "learning_rate": 9.920684879768348e-08, "logits/chosen": 1.354547381401062, "logits/rejected": 2.4100263118743896, "logps/chosen": -451.39874267578125, "logps/rejected": -672.2733154296875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.4303014278411865, "rewards/margins": 15.943695068359375, "rewards/rejected": -13.513392448425293, "step": 7250 }, { "epoch": 2.47, "learning_rate": 9.857736371647991e-08, "logits/chosen": 0.8520407676696777, "logits/rejected": 2.337068557739258, "logps/chosen": -460.9368591308594, "logps/rejected": -715.7650146484375, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.073951482772827, "rewards/margins": 16.552248001098633, "rewards/rejected": -14.478296279907227, "step": 7260 }, { "epoch": 2.47, "learning_rate": 9.794787863527634e-08, "logits/chosen": 1.0755064487457275, "logits/rejected": 2.564039707183838, "logps/chosen": -459.77691650390625, "logps/rejected": -473.631591796875, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 2.273218870162964, "rewards/margins": 18.604473114013672, "rewards/rejected": -16.331254959106445, "step": 7270 }, { "epoch": 2.47, "learning_rate": 9.731839355407275e-08, "logits/chosen": 0.769875705242157, "logits/rejected": 2.427485942840576, "logps/chosen": -370.8426208496094, "logps/rejected": -486.0166015625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.9036086797714233, "rewards/margins": 14.996641159057617, "rewards/rejected": -13.09303092956543, "step": 7280 }, { "epoch": 2.48, "learning_rate": 9.668890847286919e-08, "logits/chosen": 1.0000903606414795, "logits/rejected": 2.689542770385742, "logps/chosen": -446.7685546875, "logps/rejected": -504.4189453125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.9139716625213623, "rewards/margins": 16.287158966064453, "rewards/rejected": -14.373187065124512, "step": 7290 }, { "epoch": 2.48, "learning_rate": 9.605942339166561e-08, "logits/chosen": 0.08227036148309708, "logits/rejected": 1.5266690254211426, "logps/chosen": -370.2149658203125, "logps/rejected": -883.3233642578125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 2.4059269428253174, "rewards/margins": 16.193233489990234, "rewards/rejected": -13.787304878234863, "step": 7300 }, { "epoch": 2.48, "eval_logits/chosen": 0.4234156608581543, "eval_logits/rejected": 2.4336464405059814, "eval_logps/chosen": -370.06939697265625, "eval_logps/rejected": -646.7868041992188, "eval_loss": 0.004505614284425974, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8920491933822632, "eval_rewards/margins": 16.307695388793945, "eval_rewards/rejected": -14.415645599365234, "eval_runtime": 267.5767, "eval_samples_per_second": 35.504, "eval_steps_per_second": 1.11, "step": 7300 }, { "epoch": 2.48, "learning_rate": 9.542993831046203e-08, "logits/chosen": 1.335982084274292, "logits/rejected": 2.717874050140381, "logps/chosen": -380.83026123046875, "logps/rejected": -566.5546264648438, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.8207553625106812, "rewards/margins": 15.22517204284668, "rewards/rejected": -13.40441608428955, "step": 7310 }, { "epoch": 2.49, "learning_rate": 9.480045322925846e-08, "logits/chosen": 0.7195658683776855, "logits/rejected": 2.657860279083252, "logps/chosen": -369.21490478515625, "logps/rejected": -649.6099853515625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.9588321447372437, "rewards/margins": 16.215194702148438, "rewards/rejected": -14.256364822387695, "step": 7320 }, { "epoch": 2.49, "learning_rate": 9.41709681480549e-08, "logits/chosen": 1.2745463848114014, "logits/rejected": 2.756723165512085, "logps/chosen": -319.5630187988281, "logps/rejected": -561.6343383789062, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.1544394493103027, "rewards/margins": 16.32255744934082, "rewards/rejected": -14.168116569519043, "step": 7330 }, { "epoch": 2.49, "learning_rate": 9.35414830668513e-08, "logits/chosen": 0.10680651664733887, "logits/rejected": 2.273437023162842, "logps/chosen": -281.9779968261719, "logps/rejected": -660.8433837890625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.5733397006988525, "rewards/margins": 15.828512191772461, "rewards/rejected": -14.25517463684082, "step": 7340 }, { "epoch": 2.5, "learning_rate": 9.291199798564774e-08, "logits/chosen": 0.892861545085907, "logits/rejected": 2.3723952770233154, "logps/chosen": -346.60345458984375, "logps/rejected": -658.56689453125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 2.2672369480133057, "rewards/margins": 16.97647476196289, "rewards/rejected": -14.70923900604248, "step": 7350 }, { "epoch": 2.5, "learning_rate": 9.228251290444416e-08, "logits/chosen": 0.6463754773139954, "logits/rejected": 1.9796451330184937, "logps/chosen": -372.3808288574219, "logps/rejected": -662.8326416015625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.2259275913238525, "rewards/margins": 15.783831596374512, "rewards/rejected": -13.557904243469238, "step": 7360 }, { "epoch": 2.51, "learning_rate": 9.165302782324058e-08, "logits/chosen": 1.0133006572723389, "logits/rejected": 2.180471897125244, "logps/chosen": -337.8099060058594, "logps/rejected": -624.0775146484375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.715206503868103, "rewards/margins": 16.924972534179688, "rewards/rejected": -15.209765434265137, "step": 7370 }, { "epoch": 2.51, "learning_rate": 9.102354274203701e-08, "logits/chosen": 0.553676962852478, "logits/rejected": 2.322350263595581, "logps/chosen": -360.35675048828125, "logps/rejected": -678.6683349609375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.9695672988891602, "rewards/margins": 17.130569458007812, "rewards/rejected": -15.161005020141602, "step": 7380 }, { "epoch": 2.51, "learning_rate": 9.039405766083344e-08, "logits/chosen": 1.1079872846603394, "logits/rejected": 1.9017117023468018, "logps/chosen": -328.49505615234375, "logps/rejected": -786.6644287109375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.4413007497787476, "rewards/margins": 14.29881763458252, "rewards/rejected": -12.857516288757324, "step": 7390 }, { "epoch": 2.52, "learning_rate": 8.976457257962985e-08, "logits/chosen": 1.005599021911621, "logits/rejected": 2.1879096031188965, "logps/chosen": -335.03521728515625, "logps/rejected": -697.0205688476562, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.6808414459228516, "rewards/margins": 15.54613208770752, "rewards/rejected": -13.865290641784668, "step": 7400 }, { "epoch": 2.52, "eval_logits/chosen": 0.41171300411224365, "eval_logits/rejected": 2.4101154804229736, "eval_logps/chosen": -371.1637878417969, "eval_logps/rejected": -649.152587890625, "eval_loss": 0.004430423025041819, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.7826143503189087, "eval_rewards/margins": 16.434837341308594, "eval_rewards/rejected": -14.652223587036133, "eval_runtime": 268.1809, "eval_samples_per_second": 35.424, "eval_steps_per_second": 1.107, "step": 7400 }, { "epoch": 2.52, "learning_rate": 8.913508749842629e-08, "logits/chosen": 0.9914455413818359, "logits/rejected": 2.459635019302368, "logps/chosen": -328.3199768066406, "logps/rejected": -625.0198974609375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.8556253910064697, "rewards/margins": 16.197139739990234, "rewards/rejected": -14.341516494750977, "step": 7410 }, { "epoch": 2.52, "learning_rate": 8.850560241722271e-08, "logits/chosen": 1.4172303676605225, "logits/rejected": 2.113933563232422, "logps/chosen": -391.9002990722656, "logps/rejected": -634.8341064453125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.3552289009094238, "rewards/margins": 14.91827392578125, "rewards/rejected": -13.563047409057617, "step": 7420 }, { "epoch": 2.53, "learning_rate": 8.787611733601913e-08, "logits/chosen": 1.3201543092727661, "logits/rejected": 2.8121328353881836, "logps/chosen": -495.87701416015625, "logps/rejected": -456.218017578125, "loss": 0.0042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4872987270355225, "rewards/margins": 16.895530700683594, "rewards/rejected": -15.408231735229492, "step": 7430 }, { "epoch": 2.53, "learning_rate": 8.724663225481556e-08, "logits/chosen": 0.6333423852920532, "logits/rejected": 2.0092949867248535, "logps/chosen": -378.5120849609375, "logps/rejected": -762.9464111328125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.560328960418701, "rewards/margins": 19.02144432067871, "rewards/rejected": -16.461116790771484, "step": 7440 }, { "epoch": 2.53, "learning_rate": 8.6617147173612e-08, "logits/chosen": 1.240923285484314, "logits/rejected": 2.028729200363159, "logps/chosen": -340.8464660644531, "logps/rejected": -809.6685180664062, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.5228168964385986, "rewards/margins": 17.846059799194336, "rewards/rejected": -16.3232421875, "step": 7450 }, { "epoch": 2.54, "learning_rate": 8.59876620924084e-08, "logits/chosen": 0.6715523600578308, "logits/rejected": 2.028759717941284, "logps/chosen": -512.22705078125, "logps/rejected": -689.6047973632812, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.257208824157715, "rewards/margins": 17.234987258911133, "rewards/rejected": -14.977777481079102, "step": 7460 }, { "epoch": 2.54, "learning_rate": 8.535817701120483e-08, "logits/chosen": 0.9357270002365112, "logits/rejected": 1.8091167211532593, "logps/chosen": -338.4635925292969, "logps/rejected": -813.4832763671875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.0143938064575195, "rewards/margins": 15.582258224487305, "rewards/rejected": -13.567866325378418, "step": 7470 }, { "epoch": 2.54, "learning_rate": 8.472869193000126e-08, "logits/chosen": 0.7832736968994141, "logits/rejected": 2.180711269378662, "logps/chosen": -516.1185913085938, "logps/rejected": -668.5208740234375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.3803958892822266, "rewards/margins": 17.23041534423828, "rewards/rejected": -15.850018501281738, "step": 7480 }, { "epoch": 2.55, "learning_rate": 8.409920684879767e-08, "logits/chosen": 1.112322449684143, "logits/rejected": 2.341900110244751, "logps/chosen": -383.68701171875, "logps/rejected": -630.36279296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.8344999551773071, "rewards/margins": 16.628482818603516, "rewards/rejected": -14.793981552124023, "step": 7490 }, { "epoch": 2.55, "learning_rate": 8.346972176759411e-08, "logits/chosen": 0.7441210746765137, "logits/rejected": 1.9885940551757812, "logps/chosen": -481.265869140625, "logps/rejected": -694.01416015625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.1450316905975342, "rewards/margins": 17.184598922729492, "rewards/rejected": -16.039567947387695, "step": 7500 }, { "epoch": 2.55, "eval_logits/chosen": 0.40691351890563965, "eval_logits/rejected": 2.4040136337280273, "eval_logps/chosen": -370.7875061035156, "eval_logps/rejected": -649.6731567382812, "eval_loss": 0.004426932893693447, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8202381134033203, "eval_rewards/margins": 16.524518966674805, "eval_rewards/rejected": -14.704280853271484, "eval_runtime": 270.7903, "eval_samples_per_second": 35.083, "eval_steps_per_second": 1.097, "step": 7500 }, { "epoch": 2.55, "learning_rate": 8.284023668639053e-08, "logits/chosen": 0.9541034698486328, "logits/rejected": 2.3612122535705566, "logps/chosen": -464.33599853515625, "logps/rejected": -559.6075439453125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.3160302639007568, "rewards/margins": 14.19543743133545, "rewards/rejected": -12.879406929016113, "step": 7510 }, { "epoch": 2.56, "learning_rate": 8.221075160518695e-08, "logits/chosen": 0.6179080009460449, "logits/rejected": 2.113412380218506, "logps/chosen": -405.10809326171875, "logps/rejected": -635.8704833984375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 1.670983910560608, "rewards/margins": 15.743799209594727, "rewards/rejected": -14.072813034057617, "step": 7520 }, { "epoch": 2.56, "learning_rate": 8.158126652398338e-08, "logits/chosen": 0.9145607948303223, "logits/rejected": 2.355287790298462, "logps/chosen": -437.69085693359375, "logps/rejected": -519.5291748046875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 2.611969232559204, "rewards/margins": 17.811548233032227, "rewards/rejected": -15.19958209991455, "step": 7530 }, { "epoch": 2.56, "learning_rate": 8.09517814427798e-08, "logits/chosen": 0.7780656814575195, "logits/rejected": 2.4244775772094727, "logps/chosen": -307.7718200683594, "logps/rejected": -620.1722412109375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0296103954315186, "rewards/margins": 17.512142181396484, "rewards/rejected": -15.48253059387207, "step": 7540 }, { "epoch": 2.57, "learning_rate": 8.032229636157622e-08, "logits/chosen": 0.618104100227356, "logits/rejected": 1.8979343175888062, "logps/chosen": -301.73431396484375, "logps/rejected": -839.3603515625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.6576316356658936, "rewards/margins": 17.44778823852539, "rewards/rejected": -15.790155410766602, "step": 7550 }, { "epoch": 2.57, "learning_rate": 7.969281128037266e-08, "logits/chosen": 0.9188628196716309, "logits/rejected": 2.692812919616699, "logps/chosen": -330.3370361328125, "logps/rejected": -587.1077880859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.176570415496826, "rewards/margins": 16.620418548583984, "rewards/rejected": -14.44384765625, "step": 7560 }, { "epoch": 2.57, "learning_rate": 7.906332619916907e-08, "logits/chosen": 0.35936424136161804, "logits/rejected": 2.5122036933898926, "logps/chosen": -386.1283874511719, "logps/rejected": -658.3817138671875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.331057071685791, "rewards/margins": 15.813084602355957, "rewards/rejected": -14.482028007507324, "step": 7570 }, { "epoch": 2.58, "learning_rate": 7.84338411179655e-08, "logits/chosen": 1.2575255632400513, "logits/rejected": 2.4196267127990723, "logps/chosen": -432.21771240234375, "logps/rejected": -659.4260864257812, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.846247673034668, "rewards/margins": 18.127437591552734, "rewards/rejected": -16.281190872192383, "step": 7580 }, { "epoch": 2.58, "learning_rate": 7.780435603676193e-08, "logits/chosen": 0.49740689992904663, "logits/rejected": 2.428440809249878, "logps/chosen": -310.4542541503906, "logps/rejected": -639.364990234375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.8841445446014404, "rewards/margins": 18.3008975982666, "rewards/rejected": -16.416751861572266, "step": 7590 }, { "epoch": 2.58, "learning_rate": 7.717487095555835e-08, "logits/chosen": 1.0477213859558105, "logits/rejected": 2.2913241386413574, "logps/chosen": -339.94976806640625, "logps/rejected": -717.8651123046875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.7267446517944336, "rewards/margins": 17.785921096801758, "rewards/rejected": -16.05917739868164, "step": 7600 }, { "epoch": 2.58, "eval_logits/chosen": 0.40868476033210754, "eval_logits/rejected": 2.4019243717193604, "eval_logps/chosen": -370.2781677246094, "eval_logps/rejected": -650.1920776367188, "eval_loss": 0.004376308061182499, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8711698055267334, "eval_rewards/margins": 16.627342224121094, "eval_rewards/rejected": -14.756170272827148, "eval_runtime": 269.9357, "eval_samples_per_second": 35.194, "eval_steps_per_second": 1.1, "step": 7600 }, { "epoch": 2.59, "learning_rate": 7.654538587435477e-08, "logits/chosen": 0.8364641070365906, "logits/rejected": 2.4752678871154785, "logps/chosen": -344.04510498046875, "logps/rejected": -617.2227172851562, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.264724016189575, "rewards/margins": 17.439924240112305, "rewards/rejected": -15.175201416015625, "step": 7610 }, { "epoch": 2.59, "learning_rate": 7.591590079315121e-08, "logits/chosen": 0.7828987836837769, "logits/rejected": 2.141719341278076, "logps/chosen": -364.89471435546875, "logps/rejected": -642.4763793945312, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.7357105016708374, "rewards/margins": 15.866876602172852, "rewards/rejected": -14.1311674118042, "step": 7620 }, { "epoch": 2.59, "learning_rate": 7.528641571194762e-08, "logits/chosen": 0.6405404806137085, "logits/rejected": 1.9401140213012695, "logps/chosen": -305.2798156738281, "logps/rejected": -775.0665283203125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.271310329437256, "rewards/margins": 16.512285232543945, "rewards/rejected": -14.240976333618164, "step": 7630 }, { "epoch": 2.6, "learning_rate": 7.465693063074405e-08, "logits/chosen": 1.0794910192489624, "logits/rejected": 2.674074172973633, "logps/chosen": -394.71795654296875, "logps/rejected": -641.1694946289062, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.7397727966308594, "rewards/margins": 17.497838973999023, "rewards/rejected": -15.758066177368164, "step": 7640 }, { "epoch": 2.6, "learning_rate": 7.402744554954048e-08, "logits/chosen": 1.1082245111465454, "logits/rejected": 2.569019317626953, "logps/chosen": -394.66278076171875, "logps/rejected": -548.3233642578125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.965540885925293, "rewards/margins": 15.515264511108398, "rewards/rejected": -13.549725532531738, "step": 7650 }, { "epoch": 2.6, "learning_rate": 7.33979604683369e-08, "logits/chosen": 1.0259652137756348, "logits/rejected": 2.4335434436798096, "logps/chosen": -407.17694091796875, "logps/rejected": -562.71826171875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 2.1270415782928467, "rewards/margins": 15.186243057250977, "rewards/rejected": -13.05920124053955, "step": 7660 }, { "epoch": 2.61, "learning_rate": 7.276847538713332e-08, "logits/chosen": 0.868080735206604, "logits/rejected": 2.634049892425537, "logps/chosen": -417.909423828125, "logps/rejected": -581.9749145507812, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.856361746788025, "rewards/margins": 17.1064395904541, "rewards/rejected": -15.250079154968262, "step": 7670 }, { "epoch": 2.61, "learning_rate": 7.213899030592976e-08, "logits/chosen": 1.3671529293060303, "logits/rejected": 2.712268829345703, "logps/chosen": -409.2546081542969, "logps/rejected": -549.1846313476562, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.8779876232147217, "rewards/margins": 15.200727462768555, "rewards/rejected": -13.322738647460938, "step": 7680 }, { "epoch": 2.61, "learning_rate": 7.150950522472617e-08, "logits/chosen": 0.3646644949913025, "logits/rejected": 2.458853244781494, "logps/chosen": -398.7442932128906, "logps/rejected": -553.7511596679688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.1193220615386963, "rewards/margins": 18.095134735107422, "rewards/rejected": -15.975812911987305, "step": 7690 }, { "epoch": 2.62, "learning_rate": 7.088002014352259e-08, "logits/chosen": 1.1231930255889893, "logits/rejected": 2.2001609802246094, "logps/chosen": -459.5314025878906, "logps/rejected": -600.494384765625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.8814414739608765, "rewards/margins": 16.648889541625977, "rewards/rejected": -14.767448425292969, "step": 7700 }, { "epoch": 2.62, "eval_logits/chosen": 0.4114474952220917, "eval_logits/rejected": 2.3995602130889893, "eval_logps/chosen": -370.58355712890625, "eval_logps/rejected": -651.2406616210938, "eval_loss": 0.004253820516169071, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8406320810317993, "eval_rewards/margins": 16.70166778564453, "eval_rewards/rejected": -14.861037254333496, "eval_runtime": 271.0204, "eval_samples_per_second": 35.053, "eval_steps_per_second": 1.096, "step": 7700 }, { "epoch": 2.62, "learning_rate": 7.025053506231903e-08, "logits/chosen": 0.5934748649597168, "logits/rejected": 2.5598092079162598, "logps/chosen": -348.83203125, "logps/rejected": -445.65087890625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9641309976577759, "rewards/margins": 17.064300537109375, "rewards/rejected": -15.10016918182373, "step": 7710 }, { "epoch": 2.62, "learning_rate": 6.962104998111543e-08, "logits/chosen": 1.384304404258728, "logits/rejected": 2.5968146324157715, "logps/chosen": -346.9498596191406, "logps/rejected": -583.7931518554688, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 2.3904762268066406, "rewards/margins": 17.099462509155273, "rewards/rejected": -14.708989143371582, "step": 7720 }, { "epoch": 2.63, "learning_rate": 6.899156489991187e-08, "logits/chosen": 0.6948369145393372, "logits/rejected": 2.5097079277038574, "logps/chosen": -368.5804138183594, "logps/rejected": -614.776611328125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.0649654865264893, "rewards/margins": 18.139123916625977, "rewards/rejected": -16.074155807495117, "step": 7730 }, { "epoch": 2.63, "learning_rate": 6.83620798187083e-08, "logits/chosen": 0.633045494556427, "logits/rejected": 2.577214002609253, "logps/chosen": -442.4937438964844, "logps/rejected": -551.2289428710938, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.669154405593872, "rewards/margins": 16.923871994018555, "rewards/rejected": -15.254716873168945, "step": 7740 }, { "epoch": 2.63, "learning_rate": 6.773259473750472e-08, "logits/chosen": 0.40775442123413086, "logits/rejected": 2.2701003551483154, "logps/chosen": -307.6568908691406, "logps/rejected": -603.6043701171875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.4765632152557373, "rewards/margins": 16.266742706298828, "rewards/rejected": -13.790179252624512, "step": 7750 }, { "epoch": 2.64, "learning_rate": 6.710310965630114e-08, "logits/chosen": 1.0287370681762695, "logits/rejected": 1.9805313348770142, "logps/chosen": -553.20068359375, "logps/rejected": -668.2408447265625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 2.096820831298828, "rewards/margins": 16.469932556152344, "rewards/rejected": -14.373109817504883, "step": 7760 }, { "epoch": 2.64, "learning_rate": 6.647362457509758e-08, "logits/chosen": 0.6259018182754517, "logits/rejected": 2.5759027004241943, "logps/chosen": -360.9418029785156, "logps/rejected": -528.69384765625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.0843729972839355, "rewards/margins": 16.262943267822266, "rewards/rejected": -14.178568840026855, "step": 7770 }, { "epoch": 2.64, "learning_rate": 6.584413949389398e-08, "logits/chosen": 0.9427730441093445, "logits/rejected": 2.1246232986450195, "logps/chosen": -437.2471618652344, "logps/rejected": -693.9963989257812, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.5401811599731445, "rewards/margins": 15.073539733886719, "rewards/rejected": -13.533358573913574, "step": 7780 }, { "epoch": 2.65, "learning_rate": 6.521465441269042e-08, "logits/chosen": 0.3761066794395447, "logits/rejected": 2.0467357635498047, "logps/chosen": -427.61212158203125, "logps/rejected": -732.0155029296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.0713422298431396, "rewards/margins": 16.06171226501465, "rewards/rejected": -13.99036979675293, "step": 7790 }, { "epoch": 2.65, "learning_rate": 6.458516933148684e-08, "logits/chosen": 0.5202646851539612, "logits/rejected": 2.003371238708496, "logps/chosen": -361.00372314453125, "logps/rejected": -708.7975463867188, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.5457887649536133, "rewards/margins": 16.85226821899414, "rewards/rejected": -15.306478500366211, "step": 7800 }, { "epoch": 2.65, "eval_logits/chosen": 0.41474515199661255, "eval_logits/rejected": 2.3936104774475098, "eval_logps/chosen": -370.9483642578125, "eval_logps/rejected": -653.4503173828125, "eval_loss": 0.004259427078068256, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.8041552305221558, "eval_rewards/margins": 16.886152267456055, "eval_rewards/rejected": -15.08199691772461, "eval_runtime": 270.6896, "eval_samples_per_second": 35.096, "eval_steps_per_second": 1.097, "step": 7800 }, { "epoch": 2.65, "learning_rate": 6.395568425028327e-08, "logits/chosen": 0.537198543548584, "logits/rejected": 1.7283565998077393, "logps/chosen": -301.32806396484375, "logps/rejected": -793.250244140625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.0168819427490234, "rewards/margins": 16.878694534301758, "rewards/rejected": -14.861811637878418, "step": 7810 }, { "epoch": 2.66, "learning_rate": 6.332619916907969e-08, "logits/chosen": 0.9977883100509644, "logits/rejected": 2.5318446159362793, "logps/chosen": -313.09185791015625, "logps/rejected": -679.9685668945312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.932676076889038, "rewards/margins": 17.820940017700195, "rewards/rejected": -15.888264656066895, "step": 7820 }, { "epoch": 2.66, "learning_rate": 6.269671408787612e-08, "logits/chosen": 0.7894630432128906, "logits/rejected": 2.1625990867614746, "logps/chosen": -347.5708312988281, "logps/rejected": -755.1859130859375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.6550207138061523, "rewards/margins": 16.34111213684082, "rewards/rejected": -14.686091423034668, "step": 7830 }, { "epoch": 2.66, "learning_rate": 6.206722900667253e-08, "logits/chosen": 0.3686595857143402, "logits/rejected": 1.8407968282699585, "logps/chosen": -387.30096435546875, "logps/rejected": -841.7135009765625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.3304541110992432, "rewards/margins": 15.88392448425293, "rewards/rejected": -14.553471565246582, "step": 7840 }, { "epoch": 2.67, "learning_rate": 6.143774392546897e-08, "logits/chosen": 0.9401399493217468, "logits/rejected": 2.118645191192627, "logps/chosen": -467.0404357910156, "logps/rejected": -527.9627685546875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.6599762439727783, "rewards/margins": 16.71710968017578, "rewards/rejected": -14.057136535644531, "step": 7850 }, { "epoch": 2.67, "learning_rate": 6.080825884426539e-08, "logits/chosen": 1.330041527748108, "logits/rejected": 2.9794445037841797, "logps/chosen": -391.0253601074219, "logps/rejected": -560.0770874023438, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5674159526824951, "rewards/margins": 17.092731475830078, "rewards/rejected": -15.525317192077637, "step": 7860 }, { "epoch": 2.68, "learning_rate": 6.017877376306182e-08, "logits/chosen": 0.6090242266654968, "logits/rejected": 2.478968381881714, "logps/chosen": -394.5640563964844, "logps/rejected": -586.6937255859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.5743919610977173, "rewards/margins": 19.863183975219727, "rewards/rejected": -18.28879165649414, "step": 7870 }, { "epoch": 2.68, "learning_rate": 5.954928868185824e-08, "logits/chosen": 1.0315090417861938, "logits/rejected": 1.8467708826065063, "logps/chosen": -454.67071533203125, "logps/rejected": -845.1062622070312, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.9140666723251343, "rewards/margins": 15.519999504089355, "rewards/rejected": -13.605932235717773, "step": 7880 }, { "epoch": 2.68, "learning_rate": 5.891980360065466e-08, "logits/chosen": 1.0535707473754883, "logits/rejected": 2.7900028228759766, "logps/chosen": -421.4190368652344, "logps/rejected": -460.0867614746094, "loss": 0.0047, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1875920295715332, "rewards/margins": 16.28582763671875, "rewards/rejected": -15.098236083984375, "step": 7890 }, { "epoch": 2.69, "learning_rate": 5.8290318519451084e-08, "logits/chosen": 0.5873435139656067, "logits/rejected": 2.6181395053863525, "logps/chosen": -313.67864990234375, "logps/rejected": -563.8099365234375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.6179535388946533, "rewards/margins": 15.549154281616211, "rewards/rejected": -13.93120002746582, "step": 7900 }, { "epoch": 2.69, "eval_logits/chosen": 0.39933204650878906, "eval_logits/rejected": 2.3757200241088867, "eval_logps/chosen": -370.9472351074219, "eval_logps/rejected": -655.620361328125, "eval_loss": 0.004205311182886362, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8042665719985962, "eval_rewards/margins": 17.1032657623291, "eval_rewards/rejected": -15.298998832702637, "eval_runtime": 270.2927, "eval_samples_per_second": 35.147, "eval_steps_per_second": 1.099, "step": 7900 }, { "epoch": 2.69, "learning_rate": 5.7660833438247514e-08, "logits/chosen": 0.811974048614502, "logits/rejected": 2.5337677001953125, "logps/chosen": -392.13482666015625, "logps/rejected": -626.2708740234375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.0893046855926514, "rewards/margins": 17.98527717590332, "rewards/rejected": -15.895971298217773, "step": 7910 }, { "epoch": 2.69, "learning_rate": 5.7031348357043937e-08, "logits/chosen": 0.7521673440933228, "logits/rejected": 1.7344005107879639, "logps/chosen": -343.30523681640625, "logps/rejected": -839.4095458984375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.6643670797348022, "rewards/margins": 17.1582088470459, "rewards/rejected": -15.493840217590332, "step": 7920 }, { "epoch": 2.7, "learning_rate": 5.640186327584036e-08, "logits/chosen": 0.5547946095466614, "logits/rejected": 2.3363373279571533, "logps/chosen": -346.8823547363281, "logps/rejected": -543.2503662109375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.493767499923706, "rewards/margins": 17.63357925415039, "rewards/rejected": -16.139812469482422, "step": 7930 }, { "epoch": 2.7, "learning_rate": 5.577237819463679e-08, "logits/chosen": 1.02409827709198, "logits/rejected": 2.826772451400757, "logps/chosen": -388.55462646484375, "logps/rejected": -478.57049560546875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.4650802612304688, "rewards/margins": 19.572603225708008, "rewards/rejected": -18.107524871826172, "step": 7940 }, { "epoch": 2.7, "learning_rate": 5.514289311343321e-08, "logits/chosen": 1.1151471138000488, "logits/rejected": 2.1015961170196533, "logps/chosen": -437.498291015625, "logps/rejected": -720.4129028320312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.7610559463500977, "rewards/margins": 18.574838638305664, "rewards/rejected": -16.813783645629883, "step": 7950 }, { "epoch": 2.71, "learning_rate": 5.4513408032229634e-08, "logits/chosen": 0.18950991332530975, "logits/rejected": 1.766977071762085, "logps/chosen": -432.38104248046875, "logps/rejected": -803.8573608398438, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.5149455070495605, "rewards/margins": 17.260913848876953, "rewards/rejected": -15.74596881866455, "step": 7960 }, { "epoch": 2.71, "learning_rate": 5.388392295102606e-08, "logits/chosen": 1.041501760482788, "logits/rejected": 2.8908143043518066, "logps/chosen": -372.1784362792969, "logps/rejected": -482.1966247558594, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.4550552368164062, "rewards/margins": 16.327838897705078, "rewards/rejected": -13.872782707214355, "step": 7970 }, { "epoch": 2.71, "learning_rate": 5.3254437869822486e-08, "logits/chosen": 0.7389962673187256, "logits/rejected": 2.473978281021118, "logps/chosen": -325.01739501953125, "logps/rejected": -567.4298095703125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.4190046787261963, "rewards/margins": 18.693714141845703, "rewards/rejected": -16.27471160888672, "step": 7980 }, { "epoch": 2.72, "learning_rate": 5.262495278861891e-08, "logits/chosen": 0.8442951440811157, "logits/rejected": 2.530892848968506, "logps/chosen": -294.4164123535156, "logps/rejected": -458.73583984375, "loss": 0.0039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5132577419281006, "rewards/margins": 15.422558784484863, "rewards/rejected": -13.90929889678955, "step": 7990 }, { "epoch": 2.72, "learning_rate": 5.199546770741533e-08, "logits/chosen": 0.31688395142555237, "logits/rejected": 2.428586483001709, "logps/chosen": -317.95941162109375, "logps/rejected": -623.4954223632812, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.7186012268066406, "rewards/margins": 16.314571380615234, "rewards/rejected": -14.595973014831543, "step": 8000 }, { "epoch": 2.72, "eval_logits/chosen": 0.38528308272361755, "eval_logits/rejected": 2.363403558731079, "eval_logps/chosen": -370.7011413574219, "eval_logps/rejected": -655.7273559570312, "eval_loss": 0.004197005648165941, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8288747072219849, "eval_rewards/margins": 17.13857650756836, "eval_rewards/rejected": -15.309701919555664, "eval_runtime": 271.0924, "eval_samples_per_second": 35.043, "eval_steps_per_second": 1.096, "step": 8000 }, { "epoch": 2.72, "learning_rate": 5.136598262621176e-08, "logits/chosen": 0.016552647575736046, "logits/rejected": 2.558486223220825, "logps/chosen": -297.2393798828125, "logps/rejected": -554.3638305664062, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 2.095482349395752, "rewards/margins": 19.343822479248047, "rewards/rejected": -17.248340606689453, "step": 8010 }, { "epoch": 2.73, "learning_rate": 5.073649754500818e-08, "logits/chosen": 0.5093849897384644, "logits/rejected": 2.8169074058532715, "logps/chosen": -334.86981201171875, "logps/rejected": -522.8923950195312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.054229259490967, "rewards/margins": 17.593975067138672, "rewards/rejected": -15.53974437713623, "step": 8020 }, { "epoch": 2.73, "learning_rate": 5.01070124638046e-08, "logits/chosen": 1.1472276449203491, "logits/rejected": 2.4982829093933105, "logps/chosen": -412.49615478515625, "logps/rejected": -652.8995361328125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.8879344463348389, "rewards/margins": 18.21976661682129, "rewards/rejected": -16.331830978393555, "step": 8030 }, { "epoch": 2.73, "learning_rate": 4.947752738260103e-08, "logits/chosen": 1.0422632694244385, "logits/rejected": 2.024174213409424, "logps/chosen": -416.4352111816406, "logps/rejected": -743.892578125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 1.8411505222320557, "rewards/margins": 18.881122589111328, "rewards/rejected": -17.03997230529785, "step": 8040 }, { "epoch": 2.74, "learning_rate": 4.884804230139745e-08, "logits/chosen": 0.41295504570007324, "logits/rejected": 1.4202079772949219, "logps/chosen": -311.0360412597656, "logps/rejected": -888.4528198242188, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.1342692375183105, "rewards/margins": 17.17261505126953, "rewards/rejected": -15.038345336914062, "step": 8050 }, { "epoch": 2.74, "learning_rate": 4.8218557220193875e-08, "logits/chosen": 0.8595215678215027, "logits/rejected": 2.0355379581451416, "logps/chosen": -602.7960205078125, "logps/rejected": -667.9617309570312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.5357165336608887, "rewards/margins": 15.661369323730469, "rewards/rejected": -14.125653266906738, "step": 8060 }, { "epoch": 2.74, "learning_rate": 4.7589072138990305e-08, "logits/chosen": 0.7463659048080444, "logits/rejected": 2.516136646270752, "logps/chosen": -367.22406005859375, "logps/rejected": -535.6273193359375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.6163028478622437, "rewards/margins": 17.277376174926758, "rewards/rejected": -15.6610746383667, "step": 8070 }, { "epoch": 2.75, "learning_rate": 4.695958705778673e-08, "logits/chosen": 0.5243152379989624, "logits/rejected": 2.4038941860198975, "logps/chosen": -496.75860595703125, "logps/rejected": -419.94354248046875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.7436059713363647, "rewards/margins": 16.353389739990234, "rewards/rejected": -14.609784126281738, "step": 8080 }, { "epoch": 2.75, "learning_rate": 4.633010197658315e-08, "logits/chosen": 0.44321316480636597, "logits/rejected": 2.27062726020813, "logps/chosen": -394.5532531738281, "logps/rejected": -574.9754638671875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.7736930847167969, "rewards/margins": 15.879064559936523, "rewards/rejected": -14.105372428894043, "step": 8090 }, { "epoch": 2.75, "learning_rate": 4.570061689537958e-08, "logits/chosen": 1.1519994735717773, "logits/rejected": 2.0376980304718018, "logps/chosen": -527.7027587890625, "logps/rejected": -629.1424560546875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.9384416341781616, "rewards/margins": 17.458274841308594, "rewards/rejected": -15.519834518432617, "step": 8100 }, { "epoch": 2.75, "eval_logits/chosen": 0.3778843879699707, "eval_logits/rejected": 2.361894130706787, "eval_logps/chosen": -370.99468994140625, "eval_logps/rejected": -655.0099487304688, "eval_loss": 0.004115123767405748, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.7995245456695557, "eval_rewards/margins": 17.03748321533203, "eval_rewards/rejected": -15.237958908081055, "eval_runtime": 270.8923, "eval_samples_per_second": 35.069, "eval_steps_per_second": 1.096, "step": 8100 }, { "epoch": 2.76, "learning_rate": 4.5071131814176e-08, "logits/chosen": 0.33518487215042114, "logits/rejected": 1.9586979150772095, "logps/chosen": -440.5299377441406, "logps/rejected": -661.0430297851562, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.8130890130996704, "rewards/margins": 15.587263107299805, "rewards/rejected": -13.77417278289795, "step": 8110 }, { "epoch": 2.76, "learning_rate": 4.4441646732972425e-08, "logits/chosen": 1.0590837001800537, "logits/rejected": 2.1017818450927734, "logps/chosen": -324.1203918457031, "logps/rejected": -874.7774658203125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.9442323446273804, "rewards/margins": 18.850128173828125, "rewards/rejected": -16.905895233154297, "step": 8120 }, { "epoch": 2.76, "learning_rate": 4.3812161651768855e-08, "logits/chosen": 0.7944759726524353, "logits/rejected": 2.6383702754974365, "logps/chosen": -334.22174072265625, "logps/rejected": -609.9515991210938, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 2.351963520050049, "rewards/margins": 18.316631317138672, "rewards/rejected": -15.964668273925781, "step": 8130 }, { "epoch": 2.77, "learning_rate": 4.318267657056528e-08, "logits/chosen": 1.4045004844665527, "logits/rejected": 2.415086269378662, "logps/chosen": -409.2455139160156, "logps/rejected": -547.939453125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.6006851196289062, "rewards/margins": 14.521757125854492, "rewards/rejected": -12.921072006225586, "step": 8140 }, { "epoch": 2.77, "learning_rate": 4.25531914893617e-08, "logits/chosen": 0.8221235275268555, "logits/rejected": 2.161207675933838, "logps/chosen": -342.333251953125, "logps/rejected": -702.6590576171875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.9118928909301758, "rewards/margins": 16.797853469848633, "rewards/rejected": -14.885961532592773, "step": 8150 }, { "epoch": 2.77, "learning_rate": 4.192370640815812e-08, "logits/chosen": 1.462494134902954, "logits/rejected": 2.605686664581299, "logps/chosen": -357.91046142578125, "logps/rejected": -549.108642578125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.6379854679107666, "rewards/margins": 17.190990447998047, "rewards/rejected": -15.553003311157227, "step": 8160 }, { "epoch": 2.78, "learning_rate": 4.129422132695455e-08, "logits/chosen": 0.5041385889053345, "logits/rejected": 2.6867682933807373, "logps/chosen": -368.87445068359375, "logps/rejected": -542.053955078125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.8026316165924072, "rewards/margins": 15.136019706726074, "rewards/rejected": -13.333389282226562, "step": 8170 }, { "epoch": 2.78, "learning_rate": 4.0664736245750975e-08, "logits/chosen": 0.9905563592910767, "logits/rejected": 2.741023302078247, "logps/chosen": -330.87286376953125, "logps/rejected": -528.2855224609375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.6960986852645874, "rewards/margins": 17.097124099731445, "rewards/rejected": -15.401025772094727, "step": 8180 }, { "epoch": 2.78, "learning_rate": 4.00352511645474e-08, "logits/chosen": 1.4236377477645874, "logits/rejected": 2.461068630218506, "logps/chosen": -409.6502990722656, "logps/rejected": -636.2116088867188, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.5380514860153198, "rewards/margins": 17.385438919067383, "rewards/rejected": -15.847389221191406, "step": 8190 }, { "epoch": 2.79, "learning_rate": 3.940576608334383e-08, "logits/chosen": 0.491749107837677, "logits/rejected": 2.350903034210205, "logps/chosen": -398.1735534667969, "logps/rejected": -623.8147583007812, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.1699795722961426, "rewards/margins": 16.77878761291504, "rewards/rejected": -14.608807563781738, "step": 8200 }, { "epoch": 2.79, "eval_logits/chosen": 0.3826829195022583, "eval_logits/rejected": 2.3667984008789062, "eval_logps/chosen": -370.9769287109375, "eval_logps/rejected": -655.0703125, "eval_loss": 0.004031539428979158, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.8012951612472534, "eval_rewards/margins": 17.0452880859375, "eval_rewards/rejected": -15.243992805480957, "eval_runtime": 270.261, "eval_samples_per_second": 35.151, "eval_steps_per_second": 1.099, "step": 8200 }, { "epoch": 2.79, "learning_rate": 3.877628100214025e-08, "logits/chosen": 1.0121322870254517, "logits/rejected": 2.2878799438476562, "logps/chosen": -380.6305236816406, "logps/rejected": -651.130859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.8424866199493408, "rewards/margins": 17.168813705444336, "rewards/rejected": -15.326327323913574, "step": 8210 }, { "epoch": 2.79, "learning_rate": 3.814679592093667e-08, "logits/chosen": 0.3881423771381378, "logits/rejected": 1.7822697162628174, "logps/chosen": -427.33660888671875, "logps/rejected": -833.1677856445312, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.228376626968384, "rewards/margins": 19.179027557373047, "rewards/rejected": -16.95064926147461, "step": 8220 }, { "epoch": 2.8, "learning_rate": 3.75173108397331e-08, "logits/chosen": 0.7444356083869934, "logits/rejected": 1.9901525974273682, "logps/chosen": -360.2849426269531, "logps/rejected": -703.2940673828125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.849193811416626, "rewards/margins": 18.03685188293457, "rewards/rejected": -16.18765640258789, "step": 8230 }, { "epoch": 2.8, "learning_rate": 3.688782575852952e-08, "logits/chosen": 0.6894891858100891, "logits/rejected": 2.414947986602783, "logps/chosen": -296.0263671875, "logps/rejected": -487.558349609375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 2.251554012298584, "rewards/margins": 14.990411758422852, "rewards/rejected": -12.73885726928711, "step": 8240 }, { "epoch": 2.8, "learning_rate": 3.625834067732594e-08, "logits/chosen": 0.8461538553237915, "logits/rejected": 2.1160616874694824, "logps/chosen": -449.3296813964844, "logps/rejected": -682.245849609375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.6871671676635742, "rewards/margins": 17.47844123840332, "rewards/rejected": -15.79127311706543, "step": 8250 }, { "epoch": 2.81, "learning_rate": 3.562885559612237e-08, "logits/chosen": 0.7242621779441833, "logits/rejected": 2.284646511077881, "logps/chosen": -413.39923095703125, "logps/rejected": -709.4559326171875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.405566453933716, "rewards/margins": 16.431108474731445, "rewards/rejected": -14.025540351867676, "step": 8260 }, { "epoch": 2.81, "learning_rate": 3.499937051491879e-08, "logits/chosen": 0.5801094174385071, "logits/rejected": 2.4470572471618652, "logps/chosen": -316.2228698730469, "logps/rejected": -567.92626953125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.7270088195800781, "rewards/margins": 18.105016708374023, "rewards/rejected": -16.378009796142578, "step": 8270 }, { "epoch": 2.81, "learning_rate": 3.4369885433715216e-08, "logits/chosen": 0.23344922065734863, "logits/rejected": 2.7615671157836914, "logps/chosen": -292.5202331542969, "logps/rejected": -507.83056640625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.3381764888763428, "rewards/margins": 16.836694717407227, "rewards/rejected": -14.498517036437988, "step": 8280 }, { "epoch": 2.82, "learning_rate": 3.3740400352511645e-08, "logits/chosen": 0.16733908653259277, "logits/rejected": 1.8548532724380493, "logps/chosen": -291.5389709472656, "logps/rejected": -880.97705078125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.7905704975128174, "rewards/margins": 17.14309310913086, "rewards/rejected": -15.35252571105957, "step": 8290 }, { "epoch": 2.82, "learning_rate": 3.311091527130807e-08, "logits/chosen": 1.012211799621582, "logits/rejected": 2.654669761657715, "logps/chosen": -351.44061279296875, "logps/rejected": -489.4246520996094, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.4640682935714722, "rewards/margins": 16.484954833984375, "rewards/rejected": -15.020886421203613, "step": 8300 }, { "epoch": 2.82, "eval_logits/chosen": 0.38338133692741394, "eval_logits/rejected": 2.3660054206848145, "eval_logps/chosen": -370.94989013671875, "eval_logps/rejected": -654.731689453125, "eval_loss": 0.0040396335534751415, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.803999662399292, "eval_rewards/margins": 17.01413917541504, "eval_rewards/rejected": -15.210142135620117, "eval_runtime": 271.0037, "eval_samples_per_second": 35.055, "eval_steps_per_second": 1.096, "step": 8300 }, { "epoch": 2.82, "learning_rate": 3.248143019010449e-08, "logits/chosen": 0.5455238819122314, "logits/rejected": 2.2325034141540527, "logps/chosen": -296.95709228515625, "logps/rejected": -712.0809936523438, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.8315194845199585, "rewards/margins": 16.148942947387695, "rewards/rejected": -14.317422866821289, "step": 8310 }, { "epoch": 2.83, "learning_rate": 3.1851945108900914e-08, "logits/chosen": 0.4286075532436371, "logits/rejected": 2.1023879051208496, "logps/chosen": -386.9786376953125, "logps/rejected": -722.5575561523438, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.6828031539916992, "rewards/margins": 17.364538192749023, "rewards/rejected": -15.681735038757324, "step": 8320 }, { "epoch": 2.83, "learning_rate": 3.122246002769734e-08, "logits/chosen": 0.19908392429351807, "logits/rejected": 2.4876885414123535, "logps/chosen": -299.66400146484375, "logps/rejected": -569.06201171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.1454601287841797, "rewards/margins": 18.544597625732422, "rewards/rejected": -16.39913558959961, "step": 8330 }, { "epoch": 2.83, "learning_rate": 3.0592974946493766e-08, "logits/chosen": 1.0194957256317139, "logits/rejected": 2.586066484451294, "logps/chosen": -370.0921325683594, "logps/rejected": -541.5203857421875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.3220067024230957, "rewards/margins": 17.71945571899414, "rewards/rejected": -15.39744758605957, "step": 8340 }, { "epoch": 2.84, "learning_rate": 2.996348986529019e-08, "logits/chosen": 1.2256660461425781, "logits/rejected": 2.4342265129089355, "logps/chosen": -465.48046875, "logps/rejected": -524.5305786132812, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.7561149597167969, "rewards/margins": 16.329362869262695, "rewards/rejected": -14.573247909545898, "step": 8350 }, { "epoch": 2.84, "learning_rate": 2.9334004784086618e-08, "logits/chosen": 1.0791127681732178, "logits/rejected": 2.0485925674438477, "logps/chosen": -338.21142578125, "logps/rejected": -673.5530395507812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.2146098613739014, "rewards/margins": 17.91824722290039, "rewards/rejected": -15.703636169433594, "step": 8360 }, { "epoch": 2.85, "learning_rate": 2.870451970288304e-08, "logits/chosen": 0.8204256892204285, "logits/rejected": 2.081756353378296, "logps/chosen": -440.00341796875, "logps/rejected": -725.962890625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.794062614440918, "rewards/margins": 16.563032150268555, "rewards/rejected": -14.768969535827637, "step": 8370 }, { "epoch": 2.85, "learning_rate": 2.8075034621679467e-08, "logits/chosen": 0.86540687084198, "logits/rejected": 2.551528215408325, "logps/chosen": -502.77386474609375, "logps/rejected": -431.47930908203125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 2.017047643661499, "rewards/margins": 15.298959732055664, "rewards/rejected": -13.28191089630127, "step": 8380 }, { "epoch": 2.85, "learning_rate": 2.744554954047589e-08, "logits/chosen": 0.7401953935623169, "logits/rejected": 2.2178268432617188, "logps/chosen": -313.5390319824219, "logps/rejected": -635.0584716796875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.3623398542404175, "rewards/margins": 17.631855010986328, "rewards/rejected": -16.269515991210938, "step": 8390 }, { "epoch": 2.86, "learning_rate": 2.6816064459272312e-08, "logits/chosen": 1.069883942604065, "logits/rejected": 2.098914623260498, "logps/chosen": -505.76971435546875, "logps/rejected": -673.7338256835938, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.7513952255249023, "rewards/margins": 16.28409767150879, "rewards/rejected": -14.53270149230957, "step": 8400 }, { "epoch": 2.86, "eval_logits/chosen": 0.3679710328578949, "eval_logits/rejected": 2.349754571914673, "eval_logps/chosen": -371.5492858886719, "eval_logps/rejected": -655.7620849609375, "eval_loss": 0.004049401730298996, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.7440600395202637, "eval_rewards/margins": 17.057235717773438, "eval_rewards/rejected": -15.3131742477417, "eval_runtime": 270.8521, "eval_samples_per_second": 35.074, "eval_steps_per_second": 1.097, "step": 8400 }, { "epoch": 2.86, "learning_rate": 2.618657937806874e-08, "logits/chosen": 0.24509386718273163, "logits/rejected": 2.538390636444092, "logps/chosen": -384.0797424316406, "logps/rejected": -612.6265869140625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.4948558807373047, "rewards/margins": 16.53910255432129, "rewards/rejected": -15.044245719909668, "step": 8410 }, { "epoch": 2.86, "learning_rate": 2.555709429686516e-08, "logits/chosen": 0.545474112033844, "logits/rejected": 2.173983097076416, "logps/chosen": -460.26141357421875, "logps/rejected": -642.0891723632812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.5278213024139404, "rewards/margins": 17.008352279663086, "rewards/rejected": -14.480531692504883, "step": 8420 }, { "epoch": 2.87, "learning_rate": 2.4927609215661587e-08, "logits/chosen": 0.6959326863288879, "logits/rejected": 2.6381914615631104, "logps/chosen": -317.1097717285156, "logps/rejected": -466.00372314453125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.5947946310043335, "rewards/margins": 17.090538024902344, "rewards/rejected": -15.49574089050293, "step": 8430 }, { "epoch": 2.87, "learning_rate": 2.4298124134458013e-08, "logits/chosen": 0.34631219506263733, "logits/rejected": 2.274941921234131, "logps/chosen": -319.5626525878906, "logps/rejected": -699.2388305664062, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.376375675201416, "rewards/margins": 16.615434646606445, "rewards/rejected": -15.239057540893555, "step": 8440 }, { "epoch": 2.87, "learning_rate": 2.3668639053254436e-08, "logits/chosen": 1.167719841003418, "logits/rejected": 2.4808707237243652, "logps/chosen": -312.66412353515625, "logps/rejected": -496.24755859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.2384330034255981, "rewards/margins": 15.005212783813477, "rewards/rejected": -13.766778945922852, "step": 8450 }, { "epoch": 2.88, "learning_rate": 2.3039153972050862e-08, "logits/chosen": 0.39031848311424255, "logits/rejected": 1.9357246160507202, "logps/chosen": -392.21319580078125, "logps/rejected": -832.5721435546875, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8006553649902344, "rewards/margins": 18.051868438720703, "rewards/rejected": -16.251216888427734, "step": 8460 }, { "epoch": 2.88, "learning_rate": 2.2409668890847285e-08, "logits/chosen": 0.9093269109725952, "logits/rejected": 2.1801865100860596, "logps/chosen": -328.8011779785156, "logps/rejected": -662.0884399414062, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.8631995916366577, "rewards/margins": 18.073341369628906, "rewards/rejected": -16.210142135620117, "step": 8470 }, { "epoch": 2.88, "learning_rate": 2.178018380964371e-08, "logits/chosen": 0.610144317150116, "logits/rejected": 2.2706358432769775, "logps/chosen": -394.10028076171875, "logps/rejected": -649.7537841796875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.4115517139434814, "rewards/margins": 18.226654052734375, "rewards/rejected": -16.815099716186523, "step": 8480 }, { "epoch": 2.89, "learning_rate": 2.1150698728440137e-08, "logits/chosen": 0.09516476094722748, "logits/rejected": 2.2865424156188965, "logps/chosen": -359.79754638671875, "logps/rejected": -595.9285888671875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.8083724975585938, "rewards/margins": 17.95835304260254, "rewards/rejected": -16.149978637695312, "step": 8490 }, { "epoch": 2.89, "learning_rate": 2.052121364723656e-08, "logits/chosen": 0.7596250176429749, "logits/rejected": 2.644648551940918, "logps/chosen": -317.4564514160156, "logps/rejected": -603.7373046875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.4471447467803955, "rewards/margins": 15.757558822631836, "rewards/rejected": -14.31041431427002, "step": 8500 }, { "epoch": 2.89, "eval_logits/chosen": 0.3713549077510834, "eval_logits/rejected": 2.350857734680176, "eval_logps/chosen": -371.4393310546875, "eval_logps/rejected": -655.9080200195312, "eval_loss": 0.004041966050863266, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.7550534009933472, "eval_rewards/margins": 17.08282470703125, "eval_rewards/rejected": -15.327771186828613, "eval_runtime": 269.8288, "eval_samples_per_second": 35.208, "eval_steps_per_second": 1.101, "step": 8500 }, { "epoch": 2.89, "learning_rate": 1.9891728566032983e-08, "logits/chosen": 1.1884275674819946, "logits/rejected": 2.863093614578247, "logps/chosen": -388.61187744140625, "logps/rejected": -472.49200439453125, "loss": 0.0097, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.506639838218689, "rewards/margins": 17.00218391418457, "rewards/rejected": -15.495542526245117, "step": 8510 }, { "epoch": 2.9, "learning_rate": 1.926224348482941e-08, "logits/chosen": 0.4562016427516937, "logits/rejected": 2.073929786682129, "logps/chosen": -438.0223083496094, "logps/rejected": -584.0213623046875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.8205589056015015, "rewards/margins": 17.869468688964844, "rewards/rejected": -16.04891014099121, "step": 8520 }, { "epoch": 2.9, "learning_rate": 1.863275840362583e-08, "logits/chosen": 1.0397640466690063, "logits/rejected": 2.006401538848877, "logps/chosen": -317.5547790527344, "logps/rejected": -775.80322265625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.4246604442596436, "rewards/margins": 18.71263313293457, "rewards/rejected": -17.2879695892334, "step": 8530 }, { "epoch": 2.9, "learning_rate": 1.8003273322422258e-08, "logits/chosen": 1.233269453048706, "logits/rejected": 2.318169355392456, "logps/chosen": -346.78643798828125, "logps/rejected": -649.5104370117188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.6148817539215088, "rewards/margins": 16.13572883605957, "rewards/rejected": -14.520846366882324, "step": 8540 }, { "epoch": 2.91, "learning_rate": 1.737378824121868e-08, "logits/chosen": 1.0876624584197998, "logits/rejected": 2.3624844551086426, "logps/chosen": -310.1029052734375, "logps/rejected": -627.22802734375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.419732689857483, "rewards/margins": 16.585172653198242, "rewards/rejected": -15.165440559387207, "step": 8550 }, { "epoch": 2.91, "learning_rate": 1.6744303160015107e-08, "logits/chosen": 0.5178649425506592, "logits/rejected": 2.642382860183716, "logps/chosen": -383.968505859375, "logps/rejected": -616.5057983398438, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.3818258047103882, "rewards/margins": 15.679946899414062, "rewards/rejected": -14.298121452331543, "step": 8560 }, { "epoch": 2.91, "learning_rate": 1.6114818078811533e-08, "logits/chosen": 0.7347376942634583, "logits/rejected": 1.8428224325180054, "logps/chosen": -402.1912841796875, "logps/rejected": -741.1703491210938, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.5274848937988281, "rewards/margins": 15.618868827819824, "rewards/rejected": -14.09138298034668, "step": 8570 }, { "epoch": 2.92, "learning_rate": 1.5485332997607955e-08, "logits/chosen": 0.6257011890411377, "logits/rejected": 2.5201776027679443, "logps/chosen": -305.34344482421875, "logps/rejected": -632.3414306640625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.6069772243499756, "rewards/margins": 18.388032913208008, "rewards/rejected": -16.781055450439453, "step": 8580 }, { "epoch": 2.92, "learning_rate": 1.485584791640438e-08, "logits/chosen": 0.6669371128082275, "logits/rejected": 2.31857967376709, "logps/chosen": -358.6900634765625, "logps/rejected": -674.6116943359375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.514675498008728, "rewards/margins": 17.45252227783203, "rewards/rejected": -15.937848091125488, "step": 8590 }, { "epoch": 2.92, "learning_rate": 1.4226362835200804e-08, "logits/chosen": 0.5116016864776611, "logits/rejected": 2.267575740814209, "logps/chosen": -319.94268798828125, "logps/rejected": -688.0324096679688, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.873579978942871, "rewards/margins": 16.022626876831055, "rewards/rejected": -14.1490478515625, "step": 8600 }, { "epoch": 2.92, "eval_logits/chosen": 0.3700520396232605, "eval_logits/rejected": 2.351830005645752, "eval_logps/chosen": -371.48968505859375, "eval_logps/rejected": -655.9204711914062, "eval_loss": 0.004031510092318058, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.750023603439331, "eval_rewards/margins": 17.079038619995117, "eval_rewards/rejected": -15.329015731811523, "eval_runtime": 269.9511, "eval_samples_per_second": 35.192, "eval_steps_per_second": 1.1, "step": 8600 }, { "epoch": 2.93, "learning_rate": 1.3596877753997229e-08, "logits/chosen": 0.7066696882247925, "logits/rejected": 2.703484058380127, "logps/chosen": -447.2012634277344, "logps/rejected": -508.0310974121094, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.8840053081512451, "rewards/margins": 18.700672149658203, "rewards/rejected": -16.816667556762695, "step": 8610 }, { "epoch": 2.93, "learning_rate": 1.2967392672793655e-08, "logits/chosen": 0.5371155738830566, "logits/rejected": 1.6587845087051392, "logps/chosen": -340.7304992675781, "logps/rejected": -915.28173828125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.9751720428466797, "rewards/margins": 17.403154373168945, "rewards/rejected": -15.42798137664795, "step": 8620 }, { "epoch": 2.93, "learning_rate": 1.233790759159008e-08, "logits/chosen": 1.0137431621551514, "logits/rejected": 2.462019443511963, "logps/chosen": -357.55096435546875, "logps/rejected": -623.3336791992188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.7592010498046875, "rewards/margins": 16.47138023376465, "rewards/rejected": -14.712181091308594, "step": 8630 }, { "epoch": 2.94, "learning_rate": 1.1708422510386504e-08, "logits/chosen": 0.3507612347602844, "logits/rejected": 2.4512226581573486, "logps/chosen": -425.46441650390625, "logps/rejected": -606.0345458984375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.872849464416504, "rewards/margins": 17.761226654052734, "rewards/rejected": -15.88837718963623, "step": 8640 }, { "epoch": 2.94, "learning_rate": 1.1078937429182926e-08, "logits/chosen": 0.7619954943656921, "logits/rejected": 2.1373696327209473, "logps/chosen": -490.814697265625, "logps/rejected": -750.1331787109375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.4002864360809326, "rewards/margins": 15.68773365020752, "rewards/rejected": -14.287447929382324, "step": 8650 }, { "epoch": 2.94, "learning_rate": 1.0449452347979353e-08, "logits/chosen": 1.328615665435791, "logits/rejected": 2.1859679222106934, "logps/chosen": -433.30377197265625, "logps/rejected": -648.45947265625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 2.117361068725586, "rewards/margins": 17.00444221496582, "rewards/rejected": -14.887080192565918, "step": 8660 }, { "epoch": 2.95, "learning_rate": 9.819967266775777e-09, "logits/chosen": 0.5871554017066956, "logits/rejected": 1.835679054260254, "logps/chosen": -314.6351013183594, "logps/rejected": -679.0066528320312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.2678003311157227, "rewards/margins": 16.66663932800293, "rewards/rejected": -15.398837089538574, "step": 8670 }, { "epoch": 2.95, "learning_rate": 9.190482185572201e-09, "logits/chosen": 0.7893953323364258, "logits/rejected": 1.7981361150741577, "logps/chosen": -427.78192138671875, "logps/rejected": -786.5863647460938, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.6110270023345947, "rewards/margins": 16.643098831176758, "rewards/rejected": -15.032072067260742, "step": 8680 }, { "epoch": 2.95, "learning_rate": 8.560997104368626e-09, "logits/chosen": 1.0635652542114258, "logits/rejected": 2.400646686553955, "logps/chosen": -365.33770751953125, "logps/rejected": -511.2415466308594, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.2030410766601562, "rewards/margins": 16.05733871459961, "rewards/rejected": -13.854296684265137, "step": 8690 }, { "epoch": 2.96, "learning_rate": 7.931512023165052e-09, "logits/chosen": 1.1826716661453247, "logits/rejected": 2.120217800140381, "logps/chosen": -392.803955078125, "logps/rejected": -609.3681640625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 1.2204407453536987, "rewards/margins": 15.291590690612793, "rewards/rejected": -14.071149826049805, "step": 8700 }, { "epoch": 2.96, "eval_logits/chosen": 0.3659575581550598, "eval_logits/rejected": 2.3477611541748047, "eval_logps/chosen": -371.695556640625, "eval_logps/rejected": -656.2755737304688, "eval_loss": 0.004046812187880278, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.7294360399246216, "eval_rewards/margins": 17.093955993652344, "eval_rewards/rejected": -15.364519119262695, "eval_runtime": 269.7042, "eval_samples_per_second": 35.224, "eval_steps_per_second": 1.101, "step": 8700 }, { "epoch": 2.96, "learning_rate": 7.3020269419614755e-09, "logits/chosen": 0.9278692007064819, "logits/rejected": 1.7400996685028076, "logps/chosen": -324.8660583496094, "logps/rejected": -832.1110229492188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.9335384368896484, "rewards/margins": 16.6754207611084, "rewards/rejected": -14.74188232421875, "step": 8710 }, { "epoch": 2.96, "learning_rate": 6.6725418607579e-09, "logits/chosen": 0.9633650779724121, "logits/rejected": 2.232921838760376, "logps/chosen": -408.4643249511719, "logps/rejected": -730.5870361328125, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.055891513824463, "rewards/margins": 19.569120407104492, "rewards/rejected": -17.513227462768555, "step": 8720 }, { "epoch": 2.97, "learning_rate": 6.043056779554324e-09, "logits/chosen": 0.22244560718536377, "logits/rejected": 1.6365598440170288, "logps/chosen": -327.47320556640625, "logps/rejected": -794.9049072265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.042677402496338, "rewards/margins": 14.997645378112793, "rewards/rejected": -12.954968452453613, "step": 8730 }, { "epoch": 2.97, "learning_rate": 5.41357169835075e-09, "logits/chosen": 1.1285948753356934, "logits/rejected": 2.2873001098632812, "logps/chosen": -411.16046142578125, "logps/rejected": -647.3367919921875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.571076512336731, "rewards/margins": 20.033849716186523, "rewards/rejected": -18.4627742767334, "step": 8740 }, { "epoch": 2.97, "learning_rate": 4.784086617147173e-09, "logits/chosen": 0.8112198114395142, "logits/rejected": 2.69305419921875, "logps/chosen": -504.293701171875, "logps/rejected": -551.0088500976562, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.9363229274749756, "rewards/margins": 15.2676362991333, "rewards/rejected": -13.331314086914062, "step": 8750 }, { "epoch": 2.98, "learning_rate": 4.1546015359435984e-09, "logits/chosen": 0.9000552296638489, "logits/rejected": 2.208092212677002, "logps/chosen": -448.109375, "logps/rejected": -644.7918090820312, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.6445732116699219, "rewards/margins": 17.73623275756836, "rewards/rejected": -16.091657638549805, "step": 8760 }, { "epoch": 2.98, "learning_rate": 3.5251164547400225e-09, "logits/chosen": 1.073515772819519, "logits/rejected": 2.285383462905884, "logps/chosen": -423.59234619140625, "logps/rejected": -647.4371948242188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7608124017715454, "rewards/margins": 16.25401496887207, "rewards/rejected": -14.493202209472656, "step": 8770 }, { "epoch": 2.98, "learning_rate": 2.895631373536447e-09, "logits/chosen": 1.2739975452423096, "logits/rejected": 2.3136279582977295, "logps/chosen": -397.5133361816406, "logps/rejected": -485.2103576660156, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.536682367324829, "rewards/margins": 16.215782165527344, "rewards/rejected": -14.679100036621094, "step": 8780 }, { "epoch": 2.99, "learning_rate": 2.2661462923328713e-09, "logits/chosen": 0.6144751310348511, "logits/rejected": 2.3888256549835205, "logps/chosen": -377.27337646484375, "logps/rejected": -669.4237060546875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.5046803951263428, "rewards/margins": 16.96630096435547, "rewards/rejected": -15.461621284484863, "step": 8790 }, { "epoch": 2.99, "learning_rate": 1.6366612111292962e-09, "logits/chosen": 1.018761396408081, "logits/rejected": 2.47017765045166, "logps/chosen": -323.1969299316406, "logps/rejected": -533.069091796875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.4490963220596313, "rewards/margins": 17.300067901611328, "rewards/rejected": -15.850972175598145, "step": 8800 }, { "epoch": 2.99, "eval_logits/chosen": 0.36471885442733765, "eval_logits/rejected": 2.3464467525482178, "eval_logps/chosen": -371.68450927734375, "eval_logps/rejected": -656.2389526367188, "eval_loss": 0.004039596766233444, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.730539083480835, "eval_rewards/margins": 17.091402053833008, "eval_rewards/rejected": -15.360862731933594, "eval_runtime": 269.8964, "eval_samples_per_second": 35.199, "eval_steps_per_second": 1.1, "step": 8800 }, { "epoch": 2.99, "learning_rate": 1.0071761299257208e-09, "logits/chosen": 0.8230178952217102, "logits/rejected": 2.178173780441284, "logps/chosen": -360.5575256347656, "logps/rejected": -679.1911010742188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.107916831970215, "rewards/margins": 18.536773681640625, "rewards/rejected": -16.428855895996094, "step": 8810 }, { "epoch": 3.0, "learning_rate": 3.7769104872214527e-10, "logits/chosen": 0.9183434247970581, "logits/rejected": 2.830625057220459, "logps/chosen": -334.22283935546875, "logps/rejected": -536.5045776367188, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.4037619829177856, "rewards/margins": 16.705692291259766, "rewards/rejected": -15.301928520202637, "step": 8820 }, { "epoch": 3.0, "step": 8826, "total_flos": 0.0, "train_loss": 0.03725933715852631, "train_runtime": 48940.1741, "train_samples_per_second": 11.542, "train_steps_per_second": 0.18 } ], "logging_steps": 10, "max_steps": 8826, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }