{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1388, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.5971223021582734e-09, "logits/chosen": -2.8839163780212402, "logits/rejected": -2.699483633041382, "logps/chosen": -106.361572265625, "logps/rejected": -50.8937873840332, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.597122302158273e-08, "logits/chosen": -2.9716877937316895, "logits/rejected": -2.8243343830108643, "logps/chosen": -148.80015563964844, "logps/rejected": -84.43142700195312, "loss": 0.6918, "rewards/accuracies": 0.5, "rewards/chosen": 0.006020313128829002, "rewards/margins": 0.0030713342130184174, "rewards/rejected": 0.0029489779844880104, "step": 10 }, { "epoch": 0.03, "learning_rate": 7.194244604316546e-08, "logits/chosen": -2.9206809997558594, "logits/rejected": -2.7788352966308594, "logps/chosen": -167.4009246826172, "logps/rejected": -95.04873657226562, "loss": 0.6525, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.06761552393436432, "rewards/margins": 0.0887872725725174, "rewards/rejected": -0.021171752363443375, "step": 20 }, { "epoch": 0.04, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -2.907208204269409, "logits/rejected": -2.7389509677886963, "logps/chosen": -128.09487915039062, "logps/rejected": -80.83646392822266, "loss": 0.5577, "rewards/accuracies": 0.875, "rewards/chosen": 0.2007008045911789, "rewards/margins": 0.2701273560523987, "rewards/rejected": -0.06942657381296158, "step": 30 }, { "epoch": 0.06, "learning_rate": 1.4388489208633092e-07, "logits/chosen": -2.9200387001037598, "logits/rejected": -2.8407883644104004, "logps/chosen": -148.62106323242188, "logps/rejected": -105.0569839477539, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 0.6417331099510193, "rewards/margins": 1.1058695316314697, "rewards/rejected": -0.46413642168045044, "step": 40 }, { "epoch": 0.07, "learning_rate": 1.7985611510791365e-07, "logits/chosen": -2.7872376441955566, "logits/rejected": -2.709198236465454, "logps/chosen": -146.15286254882812, "logps/rejected": -104.78489685058594, "loss": 0.2995, "rewards/accuracies": 0.875, "rewards/chosen": 0.24810883402824402, "rewards/margins": 1.5657349824905396, "rewards/rejected": -1.3176262378692627, "step": 50 }, { "epoch": 0.09, "learning_rate": 2.1582733812949638e-07, "logits/chosen": -2.8873581886291504, "logits/rejected": -2.7115185260772705, "logps/chosen": -146.1516571044922, "logps/rejected": -108.72274017333984, "loss": 0.1946, "rewards/accuracies": 1.0, "rewards/chosen": 0.5725937485694885, "rewards/margins": 2.431591033935547, "rewards/rejected": -1.8589973449707031, "step": 60 }, { "epoch": 0.1, "learning_rate": 2.517985611510791e-07, "logits/chosen": -2.838667392730713, "logits/rejected": -2.7343862056732178, "logps/chosen": -130.49063110351562, "logps/rejected": -113.7320327758789, "loss": 0.1533, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05752415582537651, "rewards/margins": 2.663848400115967, "rewards/rejected": -2.6063244342803955, "step": 70 }, { "epoch": 0.12, "learning_rate": 2.8776978417266184e-07, "logits/chosen": -2.8950698375701904, "logits/rejected": -2.691622495651245, "logps/chosen": -138.45028686523438, "logps/rejected": -100.14655303955078, "loss": 0.1717, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.42671164870262146, "rewards/margins": 2.7645459175109863, "rewards/rejected": -2.337834358215332, "step": 80 }, { "epoch": 0.13, "learning_rate": 3.2374100719424457e-07, "logits/chosen": -2.7722439765930176, "logits/rejected": -2.690833330154419, "logps/chosen": -135.5113067626953, "logps/rejected": -121.447021484375, "loss": 0.1075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4925897717475891, "rewards/margins": 4.084370136260986, "rewards/rejected": -3.591780185699463, "step": 90 }, { "epoch": 0.14, "learning_rate": 3.597122302158273e-07, "logits/chosen": -2.888807773590088, "logits/rejected": -2.7131495475769043, "logps/chosen": -164.85647583007812, "logps/rejected": -124.16983795166016, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 0.2238633632659912, "rewards/margins": 3.586012601852417, "rewards/rejected": -3.362149715423584, "step": 100 }, { "epoch": 0.16, "learning_rate": 3.9568345323741003e-07, "logits/chosen": -2.7212650775909424, "logits/rejected": -2.576204538345337, "logps/chosen": -136.82293701171875, "logps/rejected": -123.34732818603516, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": 0.43352875113487244, "rewards/margins": 4.669638633728027, "rewards/rejected": -4.236109733581543, "step": 110 }, { "epoch": 0.17, "learning_rate": 4.3165467625899276e-07, "logits/chosen": -2.7657124996185303, "logits/rejected": -2.6242692470550537, "logps/chosen": -165.17178344726562, "logps/rejected": -137.9097442626953, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": 0.4255678057670593, "rewards/margins": 4.908309459686279, "rewards/rejected": -4.482741355895996, "step": 120 }, { "epoch": 0.19, "learning_rate": 4.676258992805755e-07, "logits/chosen": -2.7995333671569824, "logits/rejected": -2.628948926925659, "logps/chosen": -143.13916015625, "logps/rejected": -135.01576232910156, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 0.4051126539707184, "rewards/margins": 6.454569339752197, "rewards/rejected": -6.0494561195373535, "step": 130 }, { "epoch": 0.2, "learning_rate": 4.99599679743795e-07, "logits/chosen": -2.856595516204834, "logits/rejected": -2.64650297164917, "logps/chosen": -182.11863708496094, "logps/rejected": -170.402587890625, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": -0.6085208058357239, "rewards/margins": 5.310309886932373, "rewards/rejected": -5.918830871582031, "step": 140 }, { "epoch": 0.22, "learning_rate": 4.955964771817453e-07, "logits/chosen": -2.720083475112915, "logits/rejected": -2.5524630546569824, "logps/chosen": -138.2317352294922, "logps/rejected": -122.82208251953125, "loss": 0.0887, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.38773685693740845, "rewards/margins": 3.9186530113220215, "rewards/rejected": -4.306389808654785, "step": 150 }, { "epoch": 0.23, "learning_rate": 4.915932746196957e-07, "logits/chosen": -2.6586403846740723, "logits/rejected": -2.5184950828552246, "logps/chosen": -146.91192626953125, "logps/rejected": -146.19537353515625, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": -0.4298267364501953, "rewards/margins": 5.707052707672119, "rewards/rejected": -6.136878490447998, "step": 160 }, { "epoch": 0.24, "learning_rate": 4.875900720576461e-07, "logits/chosen": -2.7607617378234863, "logits/rejected": -2.59885573387146, "logps/chosen": -161.85403442382812, "logps/rejected": -172.4330596923828, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 0.16717226803302765, "rewards/margins": 7.492938041687012, "rewards/rejected": -7.325766086578369, "step": 170 }, { "epoch": 0.26, "learning_rate": 4.835868694955965e-07, "logits/chosen": -2.6541225910186768, "logits/rejected": -2.5471792221069336, "logps/chosen": -148.55508422851562, "logps/rejected": -166.07257080078125, "loss": 0.0758, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3034805059432983, "rewards/margins": 6.427194118499756, "rewards/rejected": -7.730674743652344, "step": 180 }, { "epoch": 0.27, "learning_rate": 4.795836669335467e-07, "logits/chosen": -2.652890205383301, "logits/rejected": -2.4126124382019043, "logps/chosen": -134.67652893066406, "logps/rejected": -130.59608459472656, "loss": 0.0909, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3820854127407074, "rewards/margins": 5.234072685241699, "rewards/rejected": -5.616158485412598, "step": 190 }, { "epoch": 0.29, "learning_rate": 4.755804643714972e-07, "logits/chosen": -2.6649863719940186, "logits/rejected": -2.4534084796905518, "logps/chosen": -154.67408752441406, "logps/rejected": -138.3970947265625, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 0.47632989287376404, "rewards/margins": 5.756840229034424, "rewards/rejected": -5.2805094718933105, "step": 200 }, { "epoch": 0.3, "learning_rate": 4.715772618094475e-07, "logits/chosen": -2.7960267066955566, "logits/rejected": -2.5353095531463623, "logps/chosen": -171.26986694335938, "logps/rejected": -157.50350952148438, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": -0.2727116644382477, "rewards/margins": 5.267422676086426, "rewards/rejected": -5.540134429931641, "step": 210 }, { "epoch": 0.32, "learning_rate": 4.675740592473979e-07, "logits/chosen": -2.5903918743133545, "logits/rejected": -2.481639862060547, "logps/chosen": -164.92575073242188, "logps/rejected": -157.99099731445312, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -0.6272125244140625, "rewards/margins": 6.131289005279541, "rewards/rejected": -6.7585015296936035, "step": 220 }, { "epoch": 0.33, "learning_rate": 4.635708566853482e-07, "logits/chosen": -2.683683156967163, "logits/rejected": -2.465529441833496, "logps/chosen": -179.80831909179688, "logps/rejected": -153.77401733398438, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": -0.9624654054641724, "rewards/margins": 5.732499599456787, "rewards/rejected": -6.6949663162231445, "step": 230 }, { "epoch": 0.35, "learning_rate": 4.595676541232986e-07, "logits/chosen": -2.538198709487915, "logits/rejected": -2.472057580947876, "logps/chosen": -134.72840881347656, "logps/rejected": -163.64105224609375, "loss": 0.0385, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4930785596370697, "rewards/margins": 7.688417911529541, "rewards/rejected": -8.181497573852539, "step": 240 }, { "epoch": 0.36, "learning_rate": 4.5556445156124894e-07, "logits/chosen": -2.7781453132629395, "logits/rejected": -2.5276522636413574, "logps/chosen": -143.19754028320312, "logps/rejected": -150.41925048828125, "loss": 0.0375, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8589959144592285, "rewards/margins": 6.875401973724365, "rewards/rejected": -7.73439884185791, "step": 250 }, { "epoch": 0.37, "learning_rate": 4.515612489991993e-07, "logits/chosen": -2.6978352069854736, "logits/rejected": -2.4410510063171387, "logps/chosen": -163.37667846679688, "logps/rejected": -157.6444854736328, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -0.5269836783409119, "rewards/margins": 6.9025750160217285, "rewards/rejected": -7.429558753967285, "step": 260 }, { "epoch": 0.39, "learning_rate": 4.4755804643714965e-07, "logits/chosen": -2.6981711387634277, "logits/rejected": -2.4240591526031494, "logps/chosen": -145.78114318847656, "logps/rejected": -153.4759521484375, "loss": 0.0466, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.961786150932312, "rewards/margins": 6.817984580993652, "rewards/rejected": -7.779770851135254, "step": 270 }, { "epoch": 0.4, "learning_rate": 4.4355484387510004e-07, "logits/chosen": -2.6088438034057617, "logits/rejected": -2.49806809425354, "logps/chosen": -167.5294952392578, "logps/rejected": -196.126953125, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -1.763117790222168, "rewards/margins": 8.220617294311523, "rewards/rejected": -9.983735084533691, "step": 280 }, { "epoch": 0.42, "learning_rate": 4.3955164131305047e-07, "logits/chosen": -2.692411184310913, "logits/rejected": -2.426178455352783, "logps/chosen": -149.58558654785156, "logps/rejected": -154.73678588867188, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -0.13495102524757385, "rewards/margins": 8.114767074584961, "rewards/rejected": -8.249719619750977, "step": 290 }, { "epoch": 0.43, "learning_rate": 4.355484387510008e-07, "logits/chosen": -2.507620334625244, "logits/rejected": -2.3317294120788574, "logps/chosen": -165.7401123046875, "logps/rejected": -168.38839721679688, "loss": 0.0765, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.2425320148468018, "rewards/margins": 5.922076225280762, "rewards/rejected": -8.164608001708984, "step": 300 }, { "epoch": 0.45, "learning_rate": 4.315452361889512e-07, "logits/chosen": -2.6149442195892334, "logits/rejected": -2.3606739044189453, "logps/chosen": -135.47378540039062, "logps/rejected": -146.0089569091797, "loss": 0.0298, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3370214700698853, "rewards/margins": 6.675353050231934, "rewards/rejected": -8.012373924255371, "step": 310 }, { "epoch": 0.46, "learning_rate": 4.275420336269015e-07, "logits/chosen": -2.5880959033966064, "logits/rejected": -2.35605788230896, "logps/chosen": -182.91497802734375, "logps/rejected": -189.45333862304688, "loss": 0.0479, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.185288190841675, "rewards/margins": 7.346780300140381, "rewards/rejected": -9.532068252563477, "step": 320 }, { "epoch": 0.48, "learning_rate": 4.235388310648519e-07, "logits/chosen": -2.5866305828094482, "logits/rejected": -2.3037662506103516, "logps/chosen": -184.96115112304688, "logps/rejected": -184.9438934326172, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": -3.369056224822998, "rewards/margins": 7.428493499755859, "rewards/rejected": -10.797548294067383, "step": 330 }, { "epoch": 0.49, "learning_rate": 4.1953562850280223e-07, "logits/chosen": -2.519735336303711, "logits/rejected": -2.334322690963745, "logps/chosen": -173.40858459472656, "logps/rejected": -215.0013885498047, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": -2.5725579261779785, "rewards/margins": 9.760233879089355, "rewards/rejected": -12.332793235778809, "step": 340 }, { "epoch": 0.5, "learning_rate": 4.155324259407526e-07, "logits/chosen": -2.586958408355713, "logits/rejected": -2.3933067321777344, "logps/chosen": -180.2490234375, "logps/rejected": -214.9405059814453, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -2.519786834716797, "rewards/margins": 9.661985397338867, "rewards/rejected": -12.181772232055664, "step": 350 }, { "epoch": 0.52, "learning_rate": 4.1152922337870295e-07, "logits/chosen": -2.4591023921966553, "logits/rejected": -2.2511239051818848, "logps/chosen": -131.71694946289062, "logps/rejected": -165.09132385253906, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -0.7444407939910889, "rewards/margins": 9.10871410369873, "rewards/rejected": -9.853155136108398, "step": 360 }, { "epoch": 0.53, "learning_rate": 4.0752602081665333e-07, "logits/chosen": -2.651655912399292, "logits/rejected": -2.4027347564697266, "logps/chosen": -175.1227569580078, "logps/rejected": -186.62240600585938, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -1.8536550998687744, "rewards/margins": 8.069160461425781, "rewards/rejected": -9.922816276550293, "step": 370 }, { "epoch": 0.55, "learning_rate": 4.0352281825460366e-07, "logits/chosen": -2.515545606613159, "logits/rejected": -2.379769802093506, "logps/chosen": -159.44683837890625, "logps/rejected": -185.20753479003906, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -0.584536612033844, "rewards/margins": 8.809714317321777, "rewards/rejected": -9.394251823425293, "step": 380 }, { "epoch": 0.56, "learning_rate": 3.9951961569255404e-07, "logits/chosen": -2.880056142807007, "logits/rejected": -2.5667223930358887, "logps/chosen": -166.15879821777344, "logps/rejected": -169.60914611816406, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": -1.2807844877243042, "rewards/margins": 7.306063652038574, "rewards/rejected": -8.586848258972168, "step": 390 }, { "epoch": 0.58, "learning_rate": 3.9551641313050437e-07, "logits/chosen": -2.6910109519958496, "logits/rejected": -2.438204526901245, "logps/chosen": -167.52279663085938, "logps/rejected": -193.27291870117188, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": -1.8132003545761108, "rewards/margins": 8.618528366088867, "rewards/rejected": -10.431727409362793, "step": 400 }, { "epoch": 0.59, "learning_rate": 3.9151321056845476e-07, "logits/chosen": -2.5091681480407715, "logits/rejected": -2.2911810874938965, "logps/chosen": -140.93154907226562, "logps/rejected": -179.24794006347656, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -1.7046592235565186, "rewards/margins": 9.365106582641602, "rewards/rejected": -11.0697660446167, "step": 410 }, { "epoch": 0.61, "learning_rate": 3.875100080064051e-07, "logits/chosen": -2.5162720680236816, "logits/rejected": -2.318171501159668, "logps/chosen": -140.7128143310547, "logps/rejected": -169.2827911376953, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": -1.0878578424453735, "rewards/margins": 8.321041107177734, "rewards/rejected": -9.408899307250977, "step": 420 }, { "epoch": 0.62, "learning_rate": 3.8350680544435547e-07, "logits/chosen": -2.5058627128601074, "logits/rejected": -2.285526752471924, "logps/chosen": -158.39208984375, "logps/rejected": -191.4017791748047, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -1.077756643295288, "rewards/margins": 9.962626457214355, "rewards/rejected": -11.040384292602539, "step": 430 }, { "epoch": 0.63, "learning_rate": 3.795036028823058e-07, "logits/chosen": -2.5467917919158936, "logits/rejected": -2.293295383453369, "logps/chosen": -165.51400756835938, "logps/rejected": -176.27957153320312, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": -1.651084542274475, "rewards/margins": 8.067974090576172, "rewards/rejected": -9.719058990478516, "step": 440 }, { "epoch": 0.65, "learning_rate": 3.755004003202562e-07, "logits/chosen": -2.5901718139648438, "logits/rejected": -2.3814101219177246, "logps/chosen": -149.50888061523438, "logps/rejected": -192.05587768554688, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": -1.6450309753417969, "rewards/margins": 9.971355438232422, "rewards/rejected": -11.616386413574219, "step": 450 }, { "epoch": 0.66, "learning_rate": 3.714971977582065e-07, "logits/chosen": -2.676997661590576, "logits/rejected": -2.5134191513061523, "logps/chosen": -156.29513549804688, "logps/rejected": -190.77088928222656, "loss": 0.0357, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1767628192901611, "rewards/margins": 9.212038040161133, "rewards/rejected": -10.388800621032715, "step": 460 }, { "epoch": 0.68, "learning_rate": 3.674939951961569e-07, "logits/chosen": -2.75260853767395, "logits/rejected": -2.4446868896484375, "logps/chosen": -167.54098510742188, "logps/rejected": -198.13467407226562, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -0.5461426973342896, "rewards/margins": 10.665987968444824, "rewards/rejected": -11.212130546569824, "step": 470 }, { "epoch": 0.69, "learning_rate": 3.634907926341073e-07, "logits/chosen": -2.7042384147644043, "logits/rejected": -2.5021824836730957, "logps/chosen": -177.01632690429688, "logps/rejected": -194.0543670654297, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": -2.1976065635681152, "rewards/margins": 7.7528533935546875, "rewards/rejected": -9.950460433959961, "step": 480 }, { "epoch": 0.71, "learning_rate": 3.5948759007205767e-07, "logits/chosen": -2.609654188156128, "logits/rejected": -2.442094326019287, "logps/chosen": -186.2327423095703, "logps/rejected": -238.8030242919922, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -1.969175934791565, "rewards/margins": 12.21094036102295, "rewards/rejected": -14.18011474609375, "step": 490 }, { "epoch": 0.72, "learning_rate": 3.55484387510008e-07, "logits/chosen": -2.5849597454071045, "logits/rejected": -2.4151904582977295, "logps/chosen": -184.76492309570312, "logps/rejected": -203.13241577148438, "loss": 0.0268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5779895782470703, "rewards/margins": 8.900407791137695, "rewards/rejected": -11.478398323059082, "step": 500 }, { "epoch": 0.72, "eval_logits/chosen": -2.4085986614227295, "eval_logits/rejected": -2.233201026916504, "eval_logps/chosen": -157.914306640625, "eval_logps/rejected": -183.62203979492188, "eval_loss": 0.03143342584371567, "eval_rewards/accuracies": 0.9960317611694336, "eval_rewards/chosen": -0.9699568152427673, "eval_rewards/margins": 8.822220802307129, "eval_rewards/rejected": -9.7921781539917, "eval_runtime": 869.9338, "eval_samples_per_second": 2.299, "eval_steps_per_second": 0.072, "step": 500 }, { "epoch": 0.73, "learning_rate": 3.514811849479584e-07, "logits/chosen": -2.4951071739196777, "logits/rejected": -2.249694585800171, "logps/chosen": -164.900634765625, "logps/rejected": -189.8306427001953, "loss": 0.0301, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6467971801757812, "rewards/margins": 9.143733024597168, "rewards/rejected": -10.79053020477295, "step": 510 }, { "epoch": 0.75, "learning_rate": 3.474779823859087e-07, "logits/chosen": -2.575314521789551, "logits/rejected": -2.233131170272827, "logps/chosen": -182.89932250976562, "logps/rejected": -243.9858856201172, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -1.4116153717041016, "rewards/margins": 13.464820861816406, "rewards/rejected": -14.876436233520508, "step": 520 }, { "epoch": 0.76, "learning_rate": 3.434747798238591e-07, "logits/chosen": -2.4697818756103516, "logits/rejected": -2.356581926345825, "logps/chosen": -201.47634887695312, "logps/rejected": -548.7425537109375, "loss": 0.0351, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6284077167510986, "rewards/margins": 39.81824493408203, "rewards/rejected": -43.44664764404297, "step": 530 }, { "epoch": 0.78, "learning_rate": 3.394715772618094e-07, "logits/chosen": -2.418208599090576, "logits/rejected": -2.22477126121521, "logps/chosen": -165.20034790039062, "logps/rejected": -628.0509643554688, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -1.872553825378418, "rewards/margins": 52.413848876953125, "rewards/rejected": -54.286399841308594, "step": 540 }, { "epoch": 0.79, "learning_rate": 3.354683746997598e-07, "logits/chosen": -2.5192372798919678, "logits/rejected": -2.295382261276245, "logps/chosen": -176.81497192382812, "logps/rejected": -363.97174072265625, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -1.7440744638442993, "rewards/margins": 24.896778106689453, "rewards/rejected": -26.640857696533203, "step": 550 }, { "epoch": 0.81, "learning_rate": 3.3146517213771014e-07, "logits/chosen": -2.6569995880126953, "logits/rejected": -2.2872815132141113, "logps/chosen": -215.7018585205078, "logps/rejected": -305.87799072265625, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -2.8822388648986816, "rewards/margins": 16.257402420043945, "rewards/rejected": -19.1396427154541, "step": 560 }, { "epoch": 0.82, "learning_rate": 3.274619695756605e-07, "logits/chosen": -2.393214702606201, "logits/rejected": -2.0127763748168945, "logps/chosen": -185.98187255859375, "logps/rejected": -338.15576171875, "loss": 0.0316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6043496131896973, "rewards/margins": 21.948997497558594, "rewards/rejected": -25.553346633911133, "step": 570 }, { "epoch": 0.84, "learning_rate": 3.2345876701361085e-07, "logits/chosen": -2.4011263847351074, "logits/rejected": -2.0070765018463135, "logps/chosen": -196.2584228515625, "logps/rejected": -484.80926513671875, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -3.4227471351623535, "rewards/margins": 34.711463928222656, "rewards/rejected": -38.13420867919922, "step": 580 }, { "epoch": 0.85, "learning_rate": 3.1945556445156124e-07, "logits/chosen": -1.9645344018936157, "logits/rejected": -1.2225711345672607, "logps/chosen": -209.5574951171875, "logps/rejected": -543.1734008789062, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -3.6722359657287598, "rewards/margins": 40.712772369384766, "rewards/rejected": -44.385005950927734, "step": 590 }, { "epoch": 0.86, "learning_rate": 3.1545236188951157e-07, "logits/chosen": -1.978044867515564, "logits/rejected": -1.2324669361114502, "logps/chosen": -192.2949981689453, "logps/rejected": -480.520751953125, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -3.5515990257263184, "rewards/margins": 35.85133743286133, "rewards/rejected": -39.40293884277344, "step": 600 }, { "epoch": 0.88, "learning_rate": 3.1144915932746195e-07, "logits/chosen": -2.275550365447998, "logits/rejected": -1.6282291412353516, "logps/chosen": -196.13803100585938, "logps/rejected": -297.6045837402344, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -4.565072059631348, "rewards/margins": 17.173152923583984, "rewards/rejected": -21.738224029541016, "step": 610 }, { "epoch": 0.89, "learning_rate": 3.074459567654123e-07, "logits/chosen": -2.4316773414611816, "logits/rejected": -1.7663853168487549, "logps/chosen": -189.8267822265625, "logps/rejected": -321.9093017578125, "loss": 0.0363, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.32307505607605, "rewards/margins": 19.51993179321289, "rewards/rejected": -22.843008041381836, "step": 620 }, { "epoch": 0.91, "learning_rate": 3.0344275420336267e-07, "logits/chosen": -2.2213199138641357, "logits/rejected": -1.702415108680725, "logps/chosen": -189.29393005371094, "logps/rejected": -392.8915100097656, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -3.8561224937438965, "rewards/margins": 26.04262924194336, "rewards/rejected": -29.898754119873047, "step": 630 }, { "epoch": 0.92, "learning_rate": 2.99439551641313e-07, "logits/chosen": -2.0172629356384277, "logits/rejected": -1.2431235313415527, "logps/chosen": -208.375732421875, "logps/rejected": -378.9666748046875, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": -6.202768325805664, "rewards/margins": 22.777095794677734, "rewards/rejected": -28.9798641204834, "step": 640 }, { "epoch": 0.94, "learning_rate": 2.954363490792634e-07, "logits/chosen": -1.7751468420028687, "logits/rejected": -1.2209514379501343, "logps/chosen": -241.2014617919922, "logps/rejected": -514.4192504882812, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -7.191336154937744, "rewards/margins": 33.45779800415039, "rewards/rejected": -40.649131774902344, "step": 650 }, { "epoch": 0.95, "learning_rate": 2.914331465172137e-07, "logits/chosen": -1.6847556829452515, "logits/rejected": -1.00700843334198, "logps/chosen": -197.2582550048828, "logps/rejected": -443.4234313964844, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -5.894466400146484, "rewards/margins": 29.733402252197266, "rewards/rejected": -35.627864837646484, "step": 660 }, { "epoch": 0.97, "learning_rate": 2.8742994395516415e-07, "logits/chosen": -2.154357433319092, "logits/rejected": -1.1991710662841797, "logps/chosen": -189.2927703857422, "logps/rejected": -484.61785888671875, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -4.121435642242432, "rewards/margins": 36.48408508300781, "rewards/rejected": -40.60551834106445, "step": 670 }, { "epoch": 0.98, "learning_rate": 2.834267413931145e-07, "logits/chosen": -1.9125760793685913, "logits/rejected": -1.0993740558624268, "logps/chosen": -212.220947265625, "logps/rejected": -427.8121643066406, "loss": 0.0128, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.176814079284668, "rewards/margins": 28.130443572998047, "rewards/rejected": -33.30725860595703, "step": 680 }, { "epoch": 0.99, "learning_rate": 2.7942353883106486e-07, "logits/chosen": -2.2864699363708496, "logits/rejected": -1.399320363998413, "logps/chosen": -189.4803924560547, "logps/rejected": -217.796142578125, "loss": 0.0196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.550410032272339, "rewards/margins": 9.777883529663086, "rewards/rejected": -13.328292846679688, "step": 690 }, { "epoch": 1.01, "learning_rate": 2.754203362690152e-07, "logits/chosen": -2.773916244506836, "logits/rejected": -2.527047872543335, "logps/chosen": -183.30543518066406, "logps/rejected": -429.06365966796875, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": -2.4481396675109863, "rewards/margins": 30.67641258239746, "rewards/rejected": -33.124549865722656, "step": 700 }, { "epoch": 1.02, "learning_rate": 2.714171337069656e-07, "logits/chosen": -2.882967948913574, "logits/rejected": -2.72076153755188, "logps/chosen": -167.57542419433594, "logps/rejected": -250.15274047851562, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -1.485212802886963, "rewards/margins": 14.637022018432617, "rewards/rejected": -16.122234344482422, "step": 710 }, { "epoch": 1.04, "learning_rate": 2.674139311449159e-07, "logits/chosen": -2.7207860946655273, "logits/rejected": -2.5453267097473145, "logps/chosen": -151.00723266601562, "logps/rejected": -295.5516357421875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -1.0635440349578857, "rewards/margins": 20.84624671936035, "rewards/rejected": -21.9097900390625, "step": 720 }, { "epoch": 1.05, "learning_rate": 2.634107285828663e-07, "logits/chosen": -2.799225091934204, "logits/rejected": -2.6195671558380127, "logps/chosen": -207.414306640625, "logps/rejected": -471.4579162597656, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -5.837430477142334, "rewards/margins": 32.82604217529297, "rewards/rejected": -38.66347885131836, "step": 730 }, { "epoch": 1.07, "learning_rate": 2.594075260208166e-07, "logits/chosen": -2.8809666633605957, "logits/rejected": -2.6668760776519775, "logps/chosen": -189.98104858398438, "logps/rejected": -271.88641357421875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -2.0119662284851074, "rewards/margins": 15.516934394836426, "rewards/rejected": -17.528902053833008, "step": 740 }, { "epoch": 1.08, "learning_rate": 2.55404323458767e-07, "logits/chosen": -2.8235630989074707, "logits/rejected": -2.6322312355041504, "logps/chosen": -159.7130584716797, "logps/rejected": -270.10101318359375, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.5178642272949219, "rewards/margins": 17.20760154724121, "rewards/rejected": -18.725465774536133, "step": 750 }, { "epoch": 1.1, "learning_rate": 2.514011208967174e-07, "logits/chosen": -2.6800270080566406, "logits/rejected": -2.516126871109009, "logps/chosen": -163.38233947753906, "logps/rejected": -561.2886352539062, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -2.2010345458984375, "rewards/margins": 45.494956970214844, "rewards/rejected": -47.69599914550781, "step": 760 }, { "epoch": 1.11, "learning_rate": 2.473979183346677e-07, "logits/chosen": -2.7506349086761475, "logits/rejected": -2.5886902809143066, "logps/chosen": -141.7732696533203, "logps/rejected": -377.6195373535156, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -1.402200698852539, "rewards/margins": 28.883886337280273, "rewards/rejected": -30.286090850830078, "step": 770 }, { "epoch": 1.12, "learning_rate": 2.433947157726181e-07, "logits/chosen": -2.842419147491455, "logits/rejected": -2.6155142784118652, "logps/chosen": -180.7171630859375, "logps/rejected": -282.32220458984375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -2.1550252437591553, "rewards/margins": 17.016422271728516, "rewards/rejected": -19.17144775390625, "step": 780 }, { "epoch": 1.14, "learning_rate": 2.3939151321056843e-07, "logits/chosen": -2.924471378326416, "logits/rejected": -2.6854348182678223, "logps/chosen": -170.48342895507812, "logps/rejected": -230.1216583251953, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.0587759017944336, "rewards/margins": 12.352742195129395, "rewards/rejected": -14.411517143249512, "step": 790 }, { "epoch": 1.15, "learning_rate": 2.353883106485188e-07, "logits/chosen": -2.8059592247009277, "logits/rejected": -2.5042202472686768, "logps/chosen": -162.3031463623047, "logps/rejected": -295.27105712890625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.9291874170303345, "rewards/margins": 20.166751861572266, "rewards/rejected": -22.09593963623047, "step": 800 }, { "epoch": 1.17, "learning_rate": 2.3138510808646917e-07, "logits/chosen": -2.7831666469573975, "logits/rejected": -2.5290932655334473, "logps/chosen": -156.5865478515625, "logps/rejected": -512.7086791992188, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.6812311410903931, "rewards/margins": 43.404396057128906, "rewards/rejected": -44.085628509521484, "step": 810 }, { "epoch": 1.18, "learning_rate": 2.2738190552441953e-07, "logits/chosen": -2.8566126823425293, "logits/rejected": -2.582984447479248, "logps/chosen": -172.5872802734375, "logps/rejected": -197.41136169433594, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -1.777562141418457, "rewards/margins": 9.757658958435059, "rewards/rejected": -11.535221099853516, "step": 820 }, { "epoch": 1.2, "learning_rate": 2.2337870296236989e-07, "logits/chosen": -2.78861927986145, "logits/rejected": -2.546877384185791, "logps/chosen": -144.1377716064453, "logps/rejected": -255.5492706298828, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.23271174728870392, "rewards/margins": 17.866634368896484, "rewards/rejected": -18.09934425354004, "step": 830 }, { "epoch": 1.21, "learning_rate": 2.1937550040032024e-07, "logits/chosen": -2.7955825328826904, "logits/rejected": -2.6178054809570312, "logps/chosen": -174.67660522460938, "logps/rejected": -284.92230224609375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -2.2215218544006348, "rewards/margins": 17.042997360229492, "rewards/rejected": -19.264522552490234, "step": 840 }, { "epoch": 1.22, "learning_rate": 2.153722978382706e-07, "logits/chosen": -2.6289403438568115, "logits/rejected": -2.4111552238464355, "logps/chosen": -174.08555603027344, "logps/rejected": -477.3523864746094, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -2.3877720832824707, "rewards/margins": 36.872703552246094, "rewards/rejected": -39.260475158691406, "step": 850 }, { "epoch": 1.24, "learning_rate": 2.1136909527622096e-07, "logits/chosen": -2.8065085411071777, "logits/rejected": -2.5915045738220215, "logps/chosen": -155.339599609375, "logps/rejected": -247.944091796875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.14878419041633606, "rewards/margins": 16.316041946411133, "rewards/rejected": -16.464826583862305, "step": 860 }, { "epoch": 1.25, "learning_rate": 2.0736589271417131e-07, "logits/chosen": -2.9013023376464844, "logits/rejected": -2.6518332958221436, "logps/chosen": -196.78958129882812, "logps/rejected": -258.1006774902344, "loss": 0.0071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3248391151428223, "rewards/margins": 12.6865816116333, "rewards/rejected": -16.011423110961914, "step": 870 }, { "epoch": 1.27, "learning_rate": 2.0336269015212167e-07, "logits/chosen": -2.7131872177124023, "logits/rejected": -2.4915966987609863, "logps/chosen": -186.66929626464844, "logps/rejected": -523.0761108398438, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.9156155586242676, "rewards/margins": 41.05046844482422, "rewards/rejected": -43.96608352661133, "step": 880 }, { "epoch": 1.28, "learning_rate": 1.9935948759007203e-07, "logits/chosen": -2.802422285079956, "logits/rejected": -2.6357274055480957, "logps/chosen": -171.12216186523438, "logps/rejected": -279.912109375, "loss": 0.0198, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.927133560180664, "rewards/margins": 16.069978713989258, "rewards/rejected": -19.997112274169922, "step": 890 }, { "epoch": 1.3, "learning_rate": 1.953562850280224e-07, "logits/chosen": -2.743924140930176, "logits/rejected": -2.569491147994995, "logps/chosen": -167.91322326660156, "logps/rejected": -284.1127624511719, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.3164217472076416, "rewards/margins": 17.26920509338379, "rewards/rejected": -19.585628509521484, "step": 900 }, { "epoch": 1.31, "learning_rate": 1.9135308246597277e-07, "logits/chosen": -2.8673033714294434, "logits/rejected": -2.6498348712921143, "logps/chosen": -142.06503295898438, "logps/rejected": -239.8588409423828, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.7565892338752747, "rewards/margins": 15.723353385925293, "rewards/rejected": -16.47994613647461, "step": 910 }, { "epoch": 1.33, "learning_rate": 1.8734987990392313e-07, "logits/chosen": -2.821174144744873, "logits/rejected": -2.6474757194519043, "logps/chosen": -171.9628143310547, "logps/rejected": -232.91439819335938, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -1.7939846515655518, "rewards/margins": 12.49173355102539, "rewards/rejected": -14.285717964172363, "step": 920 }, { "epoch": 1.34, "learning_rate": 1.8334667734187348e-07, "logits/chosen": -2.755138397216797, "logits/rejected": -2.5319771766662598, "logps/chosen": -180.15707397460938, "logps/rejected": -481.31512451171875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.5247740745544434, "rewards/margins": 36.42875289916992, "rewards/rejected": -38.95352554321289, "step": 930 }, { "epoch": 1.35, "learning_rate": 1.7934347477982384e-07, "logits/chosen": -2.872758388519287, "logits/rejected": -2.609778881072998, "logps/chosen": -157.4242401123047, "logps/rejected": -223.9275360107422, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.7862883806228638, "rewards/margins": 13.082984924316406, "rewards/rejected": -13.869272232055664, "step": 940 }, { "epoch": 1.37, "learning_rate": 1.753402722177742e-07, "logits/chosen": -2.79685115814209, "logits/rejected": -2.5169830322265625, "logps/chosen": -157.01585388183594, "logps/rejected": -436.61602783203125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.8217869997024536, "rewards/margins": 34.775390625, "rewards/rejected": -35.59718322753906, "step": 950 }, { "epoch": 1.38, "learning_rate": 1.7133706965572455e-07, "logits/chosen": -2.896519660949707, "logits/rejected": -2.5488381385803223, "logps/chosen": -175.59397888183594, "logps/rejected": -218.03466796875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -1.972507119178772, "rewards/margins": 11.009244918823242, "rewards/rejected": -12.981752395629883, "step": 960 }, { "epoch": 1.4, "learning_rate": 1.673338670936749e-07, "logits/chosen": -2.8404831886291504, "logits/rejected": -2.5836331844329834, "logps/chosen": -184.15939331054688, "logps/rejected": -292.54693603515625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.556754469871521, "rewards/margins": 17.97411346435547, "rewards/rejected": -19.530866622924805, "step": 970 }, { "epoch": 1.41, "learning_rate": 1.633306645316253e-07, "logits/chosen": -2.78204345703125, "logits/rejected": -2.52099609375, "logps/chosen": -180.96939086914062, "logps/rejected": -375.949462890625, "loss": 0.0131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8274238109588623, "rewards/margins": 26.82219886779785, "rewards/rejected": -28.64962387084961, "step": 980 }, { "epoch": 1.43, "learning_rate": 1.5932746196957568e-07, "logits/chosen": -2.8116698265075684, "logits/rejected": -2.564847707748413, "logps/chosen": -159.66482543945312, "logps/rejected": -267.16583251953125, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -1.7409461736679077, "rewards/margins": 17.136062622070312, "rewards/rejected": -18.87700843811035, "step": 990 }, { "epoch": 1.44, "learning_rate": 1.5532425940752604e-07, "logits/chosen": -2.7542591094970703, "logits/rejected": -2.5157008171081543, "logps/chosen": -183.57919311523438, "logps/rejected": -581.8060913085938, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -3.2423195838928223, "rewards/margins": 46.37403869628906, "rewards/rejected": -49.616355895996094, "step": 1000 }, { "epoch": 1.44, "eval_logits/chosen": -2.633044958114624, "eval_logits/rejected": -2.436069965362549, "eval_logps/chosen": -164.32315063476562, "eval_logps/rejected": -331.22625732421875, "eval_loss": 0.022912979125976562, "eval_rewards/accuracies": 0.9960317611694336, "eval_rewards/chosen": -1.6108430624008179, "eval_rewards/margins": 22.94175910949707, "eval_rewards/rejected": -24.55260467529297, "eval_runtime": 924.6056, "eval_samples_per_second": 2.163, "eval_steps_per_second": 0.068, "step": 1000 }, { "epoch": 1.46, "learning_rate": 1.513210568454764e-07, "logits/chosen": -2.764820098876953, "logits/rejected": -2.5704541206359863, "logps/chosen": -163.67437744140625, "logps/rejected": -395.3125915527344, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.4941201210021973, "rewards/margins": 27.328876495361328, "rewards/rejected": -29.8229923248291, "step": 1010 }, { "epoch": 1.47, "learning_rate": 1.4731785428342675e-07, "logits/chosen": -2.8067145347595215, "logits/rejected": -2.502478837966919, "logps/chosen": -146.5375518798828, "logps/rejected": -257.7701110839844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.222342848777771, "rewards/margins": 17.490814208984375, "rewards/rejected": -18.713157653808594, "step": 1020 }, { "epoch": 1.48, "learning_rate": 1.433146517213771e-07, "logits/chosen": -2.8830108642578125, "logits/rejected": -2.599515438079834, "logps/chosen": -164.00958251953125, "logps/rejected": -202.6780242919922, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.6045265197753906, "rewards/margins": 10.405461311340332, "rewards/rejected": -12.009988784790039, "step": 1030 }, { "epoch": 1.5, "learning_rate": 1.3931144915932746e-07, "logits/chosen": -2.8134148120880127, "logits/rejected": -2.5562379360198975, "logps/chosen": -167.71530151367188, "logps/rejected": -212.62570190429688, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -1.176457166671753, "rewards/margins": 11.668563842773438, "rewards/rejected": -12.84502124786377, "step": 1040 }, { "epoch": 1.51, "learning_rate": 1.3530824659727782e-07, "logits/chosen": -2.564943790435791, "logits/rejected": -2.4214589595794678, "logps/chosen": -144.50045776367188, "logps/rejected": -427.9203186035156, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.0412085056304932, "rewards/margins": 33.99787139892578, "rewards/rejected": -35.03908157348633, "step": 1050 }, { "epoch": 1.53, "learning_rate": 1.3130504403522818e-07, "logits/chosen": -2.8820528984069824, "logits/rejected": -2.634192943572998, "logps/chosen": -147.8389129638672, "logps/rejected": -352.130859375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.7727874517440796, "rewards/margins": 26.576446533203125, "rewards/rejected": -27.3492374420166, "step": 1060 }, { "epoch": 1.54, "learning_rate": 1.2730184147317853e-07, "logits/chosen": -2.7832694053649902, "logits/rejected": -2.5845096111297607, "logps/chosen": -170.01785278320312, "logps/rejected": -473.90863037109375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -1.1985461711883545, "rewards/margins": 36.935890197753906, "rewards/rejected": -38.13444137573242, "step": 1070 }, { "epoch": 1.56, "learning_rate": 1.232986389111289e-07, "logits/chosen": -2.7855515480041504, "logits/rejected": -2.5294415950775146, "logps/chosen": -158.38758850097656, "logps/rejected": -244.84619140625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.8522005081176758, "rewards/margins": 15.829713821411133, "rewards/rejected": -16.681913375854492, "step": 1080 }, { "epoch": 1.57, "learning_rate": 1.1929543634907927e-07, "logits/chosen": -2.898603916168213, "logits/rejected": -2.705836772918701, "logps/chosen": -159.56637573242188, "logps/rejected": -371.81805419921875, "loss": 0.0088, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5499234199523926, "rewards/margins": 27.168338775634766, "rewards/rejected": -28.71826171875, "step": 1090 }, { "epoch": 1.59, "learning_rate": 1.1529223378702962e-07, "logits/chosen": -2.733206272125244, "logits/rejected": -2.5612919330596924, "logps/chosen": -172.57748413085938, "logps/rejected": -559.9437255859375, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -1.0921106338500977, "rewards/margins": 45.248077392578125, "rewards/rejected": -46.340187072753906, "step": 1100 }, { "epoch": 1.6, "learning_rate": 1.1128903122497999e-07, "logits/chosen": -2.8048062324523926, "logits/rejected": -2.533332347869873, "logps/chosen": -145.4043731689453, "logps/rejected": -206.748291015625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.4871163368225098, "rewards/margins": 12.425850868225098, "rewards/rejected": -13.91296672821045, "step": 1110 }, { "epoch": 1.61, "learning_rate": 1.0728582866293035e-07, "logits/chosen": -2.8180341720581055, "logits/rejected": -2.671854257583618, "logps/chosen": -164.4940948486328, "logps/rejected": -299.62310791015625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.2362325191497803, "rewards/margins": 19.141382217407227, "rewards/rejected": -20.377614974975586, "step": 1120 }, { "epoch": 1.63, "learning_rate": 1.032826261008807e-07, "logits/chosen": -2.7941746711730957, "logits/rejected": -2.4854462146759033, "logps/chosen": -170.0912628173828, "logps/rejected": -517.0354614257812, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.6481285095214844, "rewards/margins": 41.76611328125, "rewards/rejected": -43.414241790771484, "step": 1130 }, { "epoch": 1.64, "learning_rate": 9.927942353883106e-08, "logits/chosen": -2.7304294109344482, "logits/rejected": -2.458939790725708, "logps/chosen": -147.95701599121094, "logps/rejected": -289.230224609375, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.3893832266330719, "rewards/margins": 20.953866958618164, "rewards/rejected": -21.343250274658203, "step": 1140 }, { "epoch": 1.66, "learning_rate": 9.527622097678143e-08, "logits/chosen": -2.786956310272217, "logits/rejected": -2.565520763397217, "logps/chosen": -172.35365295410156, "logps/rejected": -397.92059326171875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -1.4105862379074097, "rewards/margins": 28.689733505249023, "rewards/rejected": -30.10032081604004, "step": 1150 }, { "epoch": 1.67, "learning_rate": 9.127301841473179e-08, "logits/chosen": -2.803377151489258, "logits/rejected": -2.601539134979248, "logps/chosen": -152.02151489257812, "logps/rejected": -315.120849609375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.8650063276290894, "rewards/margins": 22.168514251708984, "rewards/rejected": -23.033519744873047, "step": 1160 }, { "epoch": 1.69, "learning_rate": 8.726981585268214e-08, "logits/chosen": -2.8182191848754883, "logits/rejected": -2.570002555847168, "logps/chosen": -149.609619140625, "logps/rejected": -231.5469512939453, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5421051979064941, "rewards/margins": 14.738876342773438, "rewards/rejected": -15.280984878540039, "step": 1170 }, { "epoch": 1.7, "learning_rate": 8.32666132906325e-08, "logits/chosen": -2.7290663719177246, "logits/rejected": -2.519298791885376, "logps/chosen": -149.35226440429688, "logps/rejected": -345.97222900390625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.2555046081542969, "rewards/margins": 24.839550018310547, "rewards/rejected": -26.09505271911621, "step": 1180 }, { "epoch": 1.71, "learning_rate": 7.926341072858286e-08, "logits/chosen": -2.8426907062530518, "logits/rejected": -2.560478448867798, "logps/chosen": -147.8146209716797, "logps/rejected": -243.75064086914062, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.8054535984992981, "rewards/margins": 15.806452751159668, "rewards/rejected": -16.61190414428711, "step": 1190 }, { "epoch": 1.73, "learning_rate": 7.526020816653323e-08, "logits/chosen": -2.674760341644287, "logits/rejected": -2.367633819580078, "logps/chosen": -151.23182678222656, "logps/rejected": -421.35333251953125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.7838943004608154, "rewards/margins": 32.74809646606445, "rewards/rejected": -35.5319938659668, "step": 1200 }, { "epoch": 1.74, "learning_rate": 7.125700560448359e-08, "logits/chosen": -2.726081371307373, "logits/rejected": -2.4065871238708496, "logps/chosen": -150.547119140625, "logps/rejected": -347.12457275390625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.623246431350708, "rewards/margins": 26.384979248046875, "rewards/rejected": -27.008224487304688, "step": 1210 }, { "epoch": 1.76, "learning_rate": 6.725380304243394e-08, "logits/chosen": -2.7504703998565674, "logits/rejected": -2.449279308319092, "logps/chosen": -158.4932861328125, "logps/rejected": -315.344482421875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.7700117826461792, "rewards/margins": 22.627595901489258, "rewards/rejected": -23.397607803344727, "step": 1220 }, { "epoch": 1.77, "learning_rate": 6.32506004803843e-08, "logits/chosen": -2.7155704498291016, "logits/rejected": -2.4125099182128906, "logps/chosen": -149.95974731445312, "logps/rejected": -408.9559631347656, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -0.43645238876342773, "rewards/margins": 32.31442642211914, "rewards/rejected": -32.750877380371094, "step": 1230 }, { "epoch": 1.79, "learning_rate": 5.9247397918334664e-08, "logits/chosen": -2.7009987831115723, "logits/rejected": -2.5183584690093994, "logps/chosen": -175.1053466796875, "logps/rejected": -374.5074157714844, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.2859503030776978, "rewards/margins": 26.71976089477539, "rewards/rejected": -28.005706787109375, "step": 1240 }, { "epoch": 1.8, "learning_rate": 5.524419535628502e-08, "logits/chosen": -2.724604368209839, "logits/rejected": -2.4586219787597656, "logps/chosen": -204.1591033935547, "logps/rejected": -626.8215942382812, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.939435958862305, "rewards/margins": 48.43818283081055, "rewards/rejected": -53.37761688232422, "step": 1250 }, { "epoch": 1.82, "learning_rate": 5.1240992794235385e-08, "logits/chosen": -2.8005754947662354, "logits/rejected": -2.5299735069274902, "logps/chosen": -153.12969970703125, "logps/rejected": -334.99786376953125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.5083599090576172, "rewards/margins": 24.945537567138672, "rewards/rejected": -25.45389747619629, "step": 1260 }, { "epoch": 1.83, "learning_rate": 4.723779023218575e-08, "logits/chosen": -2.721325159072876, "logits/rejected": -2.4367713928222656, "logps/chosen": -154.41665649414062, "logps/rejected": -428.780517578125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -1.7341115474700928, "rewards/margins": 34.29324722290039, "rewards/rejected": -36.02735137939453, "step": 1270 }, { "epoch": 1.84, "learning_rate": 4.323458767013611e-08, "logits/chosen": -2.7661736011505127, "logits/rejected": -2.489382028579712, "logps/chosen": -153.54531860351562, "logps/rejected": -436.8839416503906, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.38232460618019104, "rewards/margins": 35.33488082885742, "rewards/rejected": -35.71720504760742, "step": 1280 }, { "epoch": 1.86, "learning_rate": 3.923138510808647e-08, "logits/chosen": -2.6737539768218994, "logits/rejected": -2.4625658988952637, "logps/chosen": -154.32327270507812, "logps/rejected": -682.4031982421875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -1.4486008882522583, "rewards/margins": 58.11639404296875, "rewards/rejected": -59.56499481201172, "step": 1290 }, { "epoch": 1.87, "learning_rate": 3.5228182546036826e-08, "logits/chosen": -2.5800118446350098, "logits/rejected": -2.395155191421509, "logps/chosen": -140.3927764892578, "logps/rejected": -396.32806396484375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.859173595905304, "rewards/margins": 30.5933780670166, "rewards/rejected": -31.452550888061523, "step": 1300 }, { "epoch": 1.89, "learning_rate": 3.122497998398719e-08, "logits/chosen": -2.6436543464660645, "logits/rejected": -2.371372699737549, "logps/chosen": -123.04959869384766, "logps/rejected": -331.83624267578125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.4394907057285309, "rewards/margins": 26.18316650390625, "rewards/rejected": -26.622655868530273, "step": 1310 }, { "epoch": 1.9, "learning_rate": 2.722177742193755e-08, "logits/chosen": -2.7385857105255127, "logits/rejected": -2.4508581161499023, "logps/chosen": -175.63052368164062, "logps/rejected": -294.7503356933594, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -1.2934653759002686, "rewards/margins": 19.555923461914062, "rewards/rejected": -20.849384307861328, "step": 1320 }, { "epoch": 1.92, "learning_rate": 2.3218574859887907e-08, "logits/chosen": -2.794159412384033, "logits/rejected": -2.5210330486297607, "logps/chosen": -155.2234344482422, "logps/rejected": -240.988037109375, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -0.8834775686264038, "rewards/margins": 14.685577392578125, "rewards/rejected": -15.569055557250977, "step": 1330 }, { "epoch": 1.93, "learning_rate": 1.9215372297838268e-08, "logits/chosen": -2.6487419605255127, "logits/rejected": -2.3688254356384277, "logps/chosen": -140.86117553710938, "logps/rejected": -280.75250244140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.05746353790163994, "rewards/margins": 20.066030502319336, "rewards/rejected": -20.008569717407227, "step": 1340 }, { "epoch": 1.95, "learning_rate": 1.521216973578863e-08, "logits/chosen": -2.875211715698242, "logits/rejected": -2.490218162536621, "logps/chosen": -172.50912475585938, "logps/rejected": -273.8616943359375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.1841375827789307, "rewards/margins": 17.2589054107666, "rewards/rejected": -18.443042755126953, "step": 1350 }, { "epoch": 1.96, "learning_rate": 1.120896717373899e-08, "logits/chosen": -2.7332968711853027, "logits/rejected": -2.491285800933838, "logps/chosen": -150.06475830078125, "logps/rejected": -376.63824462890625, "loss": 0.0102, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1961749792099, "rewards/margins": 28.852294921875, "rewards/rejected": -30.048471450805664, "step": 1360 }, { "epoch": 1.97, "learning_rate": 7.205764611689351e-09, "logits/chosen": -2.6668992042541504, "logits/rejected": -2.389853000640869, "logps/chosen": -170.96156311035156, "logps/rejected": -512.9969482421875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.6127735376358032, "rewards/margins": 40.84336471557617, "rewards/rejected": -41.456138610839844, "step": 1370 }, { "epoch": 1.99, "learning_rate": 3.2025620496397115e-09, "logits/chosen": -2.7764010429382324, "logits/rejected": -2.528985023498535, "logps/chosen": -173.30990600585938, "logps/rejected": -363.98358154296875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.7063196897506714, "rewards/margins": 26.459331512451172, "rewards/rejected": -27.165653228759766, "step": 1380 }, { "epoch": 2.0, "step": 1388, "total_flos": 0.0, "train_loss": 0.04402416471998484, "train_runtime": 16535.8816, "train_samples_per_second": 1.341, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 1388, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }