{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 2907, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.7182130584192438e-09, "logits/chosen": -2.7645790576934814, "logits/rejected": -2.8125059604644775, "logps/chosen": -113.67314910888672, "logps/rejected": -132.0498504638672, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.718213058419244e-08, "logits/chosen": -2.9990971088409424, "logits/rejected": -3.0227837562561035, "logps/chosen": -281.044921875, "logps/rejected": -247.3936309814453, "loss": 0.6906, "rewards/accuracies": 0.5, "rewards/chosen": 0.004584211856126785, "rewards/margins": 0.00850469246506691, "rewards/rejected": -0.0039204806089401245, "step": 10 }, { "epoch": 0.02, "learning_rate": 3.436426116838488e-08, "logits/chosen": -2.88598895072937, "logits/rejected": -2.917177200317383, "logps/chosen": -359.26177978515625, "logps/rejected": -298.42877197265625, "loss": 0.6845, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.05283154919743538, "rewards/margins": 0.018162177875638008, "rewards/rejected": 0.03466937318444252, "step": 20 }, { "epoch": 0.03, "learning_rate": 5.154639175257731e-08, "logits/chosen": -2.9570868015289307, "logits/rejected": -2.9609949588775635, "logps/chosen": -326.2544860839844, "logps/rejected": -289.9393615722656, "loss": 0.6564, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1515582799911499, "rewards/margins": 0.05830109864473343, "rewards/rejected": 0.09325718879699707, "step": 30 }, { "epoch": 0.04, "learning_rate": 6.872852233676976e-08, "logits/chosen": -2.9531846046447754, "logits/rejected": -2.955566883087158, "logps/chosen": -376.5739440917969, "logps/rejected": -331.3490295410156, "loss": 0.6444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3529122769832611, "rewards/margins": 0.1296483278274536, "rewards/rejected": 0.2232639044523239, "step": 40 }, { "epoch": 0.05, "learning_rate": 8.59106529209622e-08, "logits/chosen": -2.8878796100616455, "logits/rejected": -2.9229512214660645, "logps/chosen": -427.5284118652344, "logps/rejected": -266.94415283203125, "loss": 0.6204, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5041142702102661, "rewards/margins": 0.29676300287246704, "rewards/rejected": 0.20735123753547668, "step": 50 }, { "epoch": 0.06, "learning_rate": 1.0309278350515462e-07, "logits/chosen": -2.92777943611145, "logits/rejected": -2.9371728897094727, "logps/chosen": -318.0414123535156, "logps/rejected": -265.1334228515625, "loss": 0.6105, "rewards/accuracies": 0.75, "rewards/chosen": 0.38802462816238403, "rewards/margins": 0.31535086035728455, "rewards/rejected": 0.0726737454533577, "step": 60 }, { "epoch": 0.07, "learning_rate": 1.202749140893471e-07, "logits/chosen": -2.9851737022399902, "logits/rejected": -3.0005269050598145, "logps/chosen": -404.5143127441406, "logps/rejected": -300.8736572265625, "loss": 0.6069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5573440790176392, "rewards/margins": 0.41277560591697693, "rewards/rejected": 0.14456847310066223, "step": 70 }, { "epoch": 0.08, "learning_rate": 1.3745704467353952e-07, "logits/chosen": -2.96441912651062, "logits/rejected": -2.968987464904785, "logps/chosen": -314.7437438964844, "logps/rejected": -254.7586669921875, "loss": 0.5569, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.45461219549179077, "rewards/margins": 0.5670984983444214, "rewards/rejected": -0.1124863252043724, "step": 80 }, { "epoch": 0.09, "learning_rate": 1.5463917525773197e-07, "logits/chosen": -3.021219491958618, "logits/rejected": -3.0178027153015137, "logps/chosen": -308.31585693359375, "logps/rejected": -257.63250732421875, "loss": 0.5296, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.5257282257080078, "rewards/margins": 0.7057730555534363, "rewards/rejected": -0.18004484474658966, "step": 90 }, { "epoch": 0.1, "learning_rate": 1.718213058419244e-07, "logits/chosen": -2.8890886306762695, "logits/rejected": -2.896449327468872, "logps/chosen": -375.84564208984375, "logps/rejected": -241.34219360351562, "loss": 0.5504, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.6213200092315674, "rewards/margins": 0.8095133900642395, "rewards/rejected": -0.18819323182106018, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": -2.936641216278076, "eval_logits/rejected": -2.935973882675171, "eval_logps/chosen": -361.9043884277344, "eval_logps/rejected": -293.7761535644531, "eval_loss": 0.5406630635261536, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": 0.5287383794784546, "eval_rewards/margins": 0.7097563743591309, "eval_rewards/rejected": -0.18101799488067627, "eval_runtime": 163.7175, "eval_samples_per_second": 12.216, "eval_steps_per_second": 0.385, "step": 100 }, { "epoch": 0.11, "learning_rate": 1.8900343642611682e-07, "logits/chosen": -2.917739152908325, "logits/rejected": -2.8890061378479004, "logps/chosen": -334.1250305175781, "logps/rejected": -331.29571533203125, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4206802248954773, "rewards/margins": 0.6734089851379395, "rewards/rejected": -0.25272876024246216, "step": 110 }, { "epoch": 0.12, "learning_rate": 2.0618556701030925e-07, "logits/chosen": -2.9665865898132324, "logits/rejected": -2.970818519592285, "logps/chosen": -386.2568664550781, "logps/rejected": -280.7279357910156, "loss": 0.5533, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3308308720588684, "rewards/margins": 0.6611676216125488, "rewards/rejected": -0.33033671975135803, "step": 120 }, { "epoch": 0.13, "learning_rate": 2.2336769759450173e-07, "logits/chosen": -3.012545347213745, "logits/rejected": -2.9925591945648193, "logps/chosen": -353.75469970703125, "logps/rejected": -290.1478576660156, "loss": 0.5447, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47041910886764526, "rewards/margins": 0.7254467010498047, "rewards/rejected": -0.2550275921821594, "step": 130 }, { "epoch": 0.14, "learning_rate": 2.405498281786942e-07, "logits/chosen": -2.9241251945495605, "logits/rejected": -2.9576869010925293, "logps/chosen": -329.7611389160156, "logps/rejected": -265.63006591796875, "loss": 0.5113, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4418914318084717, "rewards/margins": 0.7908871173858643, "rewards/rejected": -0.3489956259727478, "step": 140 }, { "epoch": 0.15, "learning_rate": 2.5773195876288655e-07, "logits/chosen": -2.9687321186065674, "logits/rejected": -2.9832406044006348, "logps/chosen": -331.42669677734375, "logps/rejected": -269.3779296875, "loss": 0.5387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.38882407546043396, "rewards/margins": 0.8327142000198364, "rewards/rejected": -0.4438902735710144, "step": 150 }, { "epoch": 0.17, "learning_rate": 2.7491408934707903e-07, "logits/chosen": -2.9920172691345215, "logits/rejected": -3.013425827026367, "logps/chosen": -383.51934814453125, "logps/rejected": -297.9476318359375, "loss": 0.5083, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6878620982170105, "rewards/margins": 1.0685365200042725, "rewards/rejected": -0.3806745111942291, "step": 160 }, { "epoch": 0.18, "learning_rate": 2.9209621993127146e-07, "logits/chosen": -2.9233837127685547, "logits/rejected": -2.9321510791778564, "logps/chosen": -339.95745849609375, "logps/rejected": -280.793701171875, "loss": 0.5131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5272036194801331, "rewards/margins": 0.8315987586975098, "rewards/rejected": -0.3043951690196991, "step": 170 }, { "epoch": 0.19, "learning_rate": 3.0927835051546394e-07, "logits/chosen": -2.9920477867126465, "logits/rejected": -2.9811954498291016, "logps/chosen": -265.2094421386719, "logps/rejected": -254.6926727294922, "loss": 0.504, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.177840456366539, "rewards/margins": 0.8774341344833374, "rewards/rejected": -0.699593722820282, "step": 180 }, { "epoch": 0.2, "learning_rate": 3.2646048109965636e-07, "logits/chosen": -3.0027570724487305, "logits/rejected": -2.987896203994751, "logps/chosen": -330.7102966308594, "logps/rejected": -239.6572723388672, "loss": 0.5611, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5821124315261841, "rewards/margins": 1.141722559928894, "rewards/rejected": -0.5596100687980652, "step": 190 }, { "epoch": 0.21, "learning_rate": 3.436426116838488e-07, "logits/chosen": -3.048879384994507, "logits/rejected": -2.9993340969085693, "logps/chosen": -266.72430419921875, "logps/rejected": -187.27467346191406, "loss": 0.541, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2961394786834717, "rewards/margins": 0.8773609399795532, "rewards/rejected": -0.5812214612960815, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": -2.980220317840576, "eval_logits/rejected": -2.9785656929016113, "eval_logps/chosen": -360.50030517578125, "eval_logps/rejected": -297.53515625, "eval_loss": 0.5220658779144287, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": 0.6691505908966064, "eval_rewards/margins": 1.2260682582855225, "eval_rewards/rejected": -0.5569177269935608, "eval_runtime": 163.6147, "eval_samples_per_second": 12.224, "eval_steps_per_second": 0.385, "step": 200 }, { "epoch": 0.22, "learning_rate": 3.608247422680412e-07, "logits/chosen": -2.9797844886779785, "logits/rejected": -2.9449918270111084, "logps/chosen": -364.26287841796875, "logps/rejected": -251.58901977539062, "loss": 0.4772, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5583639740943909, "rewards/margins": 1.3936102390289307, "rewards/rejected": -0.835246205329895, "step": 210 }, { "epoch": 0.23, "learning_rate": 3.7800687285223364e-07, "logits/chosen": -2.9557044506073, "logits/rejected": -2.9637341499328613, "logps/chosen": -261.53216552734375, "logps/rejected": -271.5208740234375, "loss": 0.5707, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.367598295211792, "rewards/margins": 1.1545054912567139, "rewards/rejected": -0.7869071960449219, "step": 220 }, { "epoch": 0.24, "learning_rate": 3.9518900343642607e-07, "logits/chosen": -3.0034899711608887, "logits/rejected": -2.991698980331421, "logps/chosen": -308.8106689453125, "logps/rejected": -278.55950927734375, "loss": 0.5827, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4888441562652588, "rewards/margins": 1.2474777698516846, "rewards/rejected": -0.7586336731910706, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.123711340206185e-07, "logits/chosen": -3.0844597816467285, "logits/rejected": -3.0464837551116943, "logps/chosen": -385.8021545410156, "logps/rejected": -253.19869995117188, "loss": 0.4898, "rewards/accuracies": 0.75, "rewards/chosen": 0.5026682019233704, "rewards/margins": 1.1020526885986328, "rewards/rejected": -0.5993844270706177, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.2955326460481097e-07, "logits/chosen": -3.0472395420074463, "logits/rejected": -3.0599236488342285, "logps/chosen": -341.8814697265625, "logps/rejected": -295.29437255859375, "loss": 0.5395, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.24311120808124542, "rewards/margins": 0.9385444521903992, "rewards/rejected": -0.6954333186149597, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.4673539518900345e-07, "logits/chosen": -3.0285518169403076, "logits/rejected": -3.0690500736236572, "logps/chosen": -353.20074462890625, "logps/rejected": -244.77041625976562, "loss": 0.6312, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.22555696964263916, "rewards/margins": 0.8821722269058228, "rewards/rejected": -1.107729196548462, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.639175257731959e-07, "logits/chosen": -3.0879526138305664, "logits/rejected": -3.0506978034973145, "logps/chosen": -354.5426025390625, "logps/rejected": -279.86773681640625, "loss": 0.571, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26413029432296753, "rewards/margins": 0.9725528955459595, "rewards/rejected": -0.7084226012229919, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.810996563573884e-07, "logits/chosen": -3.0588438510894775, "logits/rejected": -3.0481762886047363, "logps/chosen": -339.85675048828125, "logps/rejected": -285.8063049316406, "loss": 0.6383, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.30532822012901306, "rewards/margins": 1.10079026222229, "rewards/rejected": -0.7954620122909546, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.982817869415807e-07, "logits/chosen": -3.0889270305633545, "logits/rejected": -3.0666940212249756, "logps/chosen": -335.5870666503906, "logps/rejected": -258.51641845703125, "loss": 0.5611, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7204562425613403, "rewards/margins": 1.0493910312652588, "rewards/rejected": -0.32893460988998413, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.982798165137615e-07, "logits/chosen": -3.0340983867645264, "logits/rejected": -3.0090713500976562, "logps/chosen": -281.38751220703125, "logps/rejected": -289.4985656738281, "loss": 0.6034, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.268043577671051, "rewards/margins": 0.5827728509902954, "rewards/rejected": -0.3147292733192444, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": -3.035973072052002, "eval_logits/rejected": -3.0234200954437256, "eval_logps/chosen": -359.8170166015625, "eval_logps/rejected": -296.5441589355469, "eval_loss": 0.5459412932395935, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": 0.737476110458374, "eval_rewards/margins": 1.1952924728393555, "eval_rewards/rejected": -0.45781639218330383, "eval_runtime": 164.3219, "eval_samples_per_second": 12.171, "eval_steps_per_second": 0.383, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.963685015290519e-07, "logits/chosen": -3.1333563327789307, "logits/rejected": -3.0529465675354004, "logps/chosen": -394.2475280761719, "logps/rejected": -328.84796142578125, "loss": 0.5995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5251134634017944, "rewards/margins": 0.8031543493270874, "rewards/rejected": -0.27804094552993774, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.944571865443424e-07, "logits/chosen": -3.115387201309204, "logits/rejected": -3.104794502258301, "logps/chosen": -299.5379943847656, "logps/rejected": -227.14413452148438, "loss": 0.5504, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5498681664466858, "rewards/margins": 1.550806999206543, "rewards/rejected": -1.0009387731552124, "step": 320 }, { "epoch": 0.34, "learning_rate": 4.92545871559633e-07, "logits/chosen": -3.1059436798095703, "logits/rejected": -3.110661029815674, "logps/chosen": -405.8400573730469, "logps/rejected": -290.01934814453125, "loss": 0.5355, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5642995834350586, "rewards/margins": 1.1246757507324219, "rewards/rejected": -0.5603762269020081, "step": 330 }, { "epoch": 0.35, "learning_rate": 4.906345565749235e-07, "logits/chosen": -3.0694103240966797, "logits/rejected": -3.075610876083374, "logps/chosen": -301.7900695800781, "logps/rejected": -273.09100341796875, "loss": 0.6496, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.26571425795555115, "rewards/margins": 0.967176079750061, "rewards/rejected": -0.701461672782898, "step": 340 }, { "epoch": 0.36, "learning_rate": 4.88723241590214e-07, "logits/chosen": -3.078815460205078, "logits/rejected": -3.097691059112549, "logps/chosen": -373.6755065917969, "logps/rejected": -278.1918640136719, "loss": 0.5251, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5501624941825867, "rewards/margins": 1.1470292806625366, "rewards/rejected": -0.5968667268753052, "step": 350 }, { "epoch": 0.37, "learning_rate": 4.868119266055046e-07, "logits/chosen": -3.101353168487549, "logits/rejected": -3.1290316581726074, "logps/chosen": -370.21112060546875, "logps/rejected": -328.2227783203125, "loss": 0.5218, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.4693407118320465, "rewards/margins": 1.2798802852630615, "rewards/rejected": -0.8105396032333374, "step": 360 }, { "epoch": 0.38, "learning_rate": 4.849006116207951e-07, "logits/chosen": -3.108405113220215, "logits/rejected": -3.108668804168701, "logps/chosen": -357.5787048339844, "logps/rejected": -308.5846252441406, "loss": 0.5781, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5667105317115784, "rewards/margins": 1.561586618423462, "rewards/rejected": -0.9948760271072388, "step": 370 }, { "epoch": 0.39, "learning_rate": 4.829892966360856e-07, "logits/chosen": -3.1076834201812744, "logits/rejected": -3.139901638031006, "logps/chosen": -372.7229919433594, "logps/rejected": -321.50347900390625, "loss": 0.5748, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.595112681388855, "rewards/margins": 1.7188622951507568, "rewards/rejected": -1.1237497329711914, "step": 380 }, { "epoch": 0.4, "learning_rate": 4.810779816513762e-07, "logits/chosen": -3.0231597423553467, "logits/rejected": -3.055475950241089, "logps/chosen": -308.81109619140625, "logps/rejected": -280.67572021484375, "loss": 0.594, "rewards/accuracies": 0.6875, "rewards/chosen": 0.022154245525598526, "rewards/margins": 1.1715147495269775, "rewards/rejected": -1.1493604183197021, "step": 390 }, { "epoch": 0.41, "learning_rate": 4.791666666666667e-07, "logits/chosen": -2.9821434020996094, "logits/rejected": -2.990657329559326, "logps/chosen": -350.4073791503906, "logps/rejected": -234.08291625976562, "loss": 0.5944, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6456303000450134, "rewards/margins": 1.8281257152557373, "rewards/rejected": -1.182495355606079, "step": 400 }, { "epoch": 0.41, "eval_logits/chosen": -2.96209979057312, "eval_logits/rejected": -2.963911294937134, "eval_logps/chosen": -362.2125549316406, "eval_logps/rejected": -300.90362548828125, "eval_loss": 0.5573462247848511, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": 0.49792128801345825, "eval_rewards/margins": 1.391687273979187, "eval_rewards/rejected": -0.8937660455703735, "eval_runtime": 163.7646, "eval_samples_per_second": 12.213, "eval_steps_per_second": 0.385, "step": 400 }, { "epoch": 0.42, "learning_rate": 4.772553516819572e-07, "logits/chosen": -2.9685988426208496, "logits/rejected": -2.9469170570373535, "logps/chosen": -359.9443054199219, "logps/rejected": -339.13482666015625, "loss": 0.7753, "rewards/accuracies": 0.75, "rewards/chosen": 0.5800348520278931, "rewards/margins": 1.4965015649795532, "rewards/rejected": -0.9164667129516602, "step": 410 }, { "epoch": 0.43, "learning_rate": 4.753440366972477e-07, "logits/chosen": -3.039097785949707, "logits/rejected": -3.0352489948272705, "logps/chosen": -279.19451904296875, "logps/rejected": -275.61077880859375, "loss": 0.5719, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.08171078562736511, "rewards/margins": 0.8535135388374329, "rewards/rejected": -0.7718027234077454, "step": 420 }, { "epoch": 0.44, "learning_rate": 4.7343272171253825e-07, "logits/chosen": -3.0542099475860596, "logits/rejected": -3.048107624053955, "logps/chosen": -304.2041015625, "logps/rejected": -275.24664306640625, "loss": 0.5521, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6358417272567749, "rewards/margins": 1.4337527751922607, "rewards/rejected": -0.7979112863540649, "step": 430 }, { "epoch": 0.45, "learning_rate": 4.715214067278288e-07, "logits/chosen": -2.9832911491394043, "logits/rejected": -2.9696083068847656, "logps/chosen": -351.0896911621094, "logps/rejected": -278.2879333496094, "loss": 0.5257, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.4420256018638611, "rewards/margins": 1.773047685623169, "rewards/rejected": -1.331022024154663, "step": 440 }, { "epoch": 0.46, "learning_rate": 4.696100917431192e-07, "logits/chosen": -3.115874767303467, "logits/rejected": -3.0773837566375732, "logps/chosen": -392.2452392578125, "logps/rejected": -324.62640380859375, "loss": 0.5536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2861310839653015, "rewards/margins": 0.8551927804946899, "rewards/rejected": -0.5690616369247437, "step": 450 }, { "epoch": 0.47, "learning_rate": 4.6769877675840974e-07, "logits/chosen": -3.0585522651672363, "logits/rejected": -3.089534282684326, "logps/chosen": -310.84967041015625, "logps/rejected": -287.9058532714844, "loss": 0.5614, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2514137625694275, "rewards/margins": 1.2147700786590576, "rewards/rejected": -0.9633563160896301, "step": 460 }, { "epoch": 0.49, "learning_rate": 4.6578746177370027e-07, "logits/chosen": -3.0050368309020996, "logits/rejected": -3.0113613605499268, "logps/chosen": -243.838623046875, "logps/rejected": -224.61404418945312, "loss": 0.5769, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10031839460134506, "rewards/margins": 1.2319433689117432, "rewards/rejected": -1.1316249370574951, "step": 470 }, { "epoch": 0.5, "learning_rate": 4.638761467889908e-07, "logits/chosen": -3.01200795173645, "logits/rejected": -2.9829325675964355, "logps/chosen": -353.6679992675781, "logps/rejected": -299.7701416015625, "loss": 0.5141, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6005850434303284, "rewards/margins": 2.408433437347412, "rewards/rejected": -1.807848334312439, "step": 480 }, { "epoch": 0.51, "learning_rate": 4.6196483180428133e-07, "logits/chosen": -3.038440227508545, "logits/rejected": -3.0429458618164062, "logps/chosen": -330.0135192871094, "logps/rejected": -262.1318359375, "loss": 0.5292, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4290197491645813, "rewards/margins": 1.5280876159667969, "rewards/rejected": -1.0990678071975708, "step": 490 }, { "epoch": 0.52, "learning_rate": 4.600535168195718e-07, "logits/chosen": -3.0223565101623535, "logits/rejected": -3.0170624256134033, "logps/chosen": -259.1560363769531, "logps/rejected": -268.68365478515625, "loss": 0.5512, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08852599561214447, "rewards/margins": 1.076027750968933, "rewards/rejected": -0.9875017404556274, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": -3.0406343936920166, "eval_logits/rejected": -3.0485074520111084, "eval_logps/chosen": -362.83642578125, "eval_logps/rejected": -302.1329650878906, "eval_loss": 0.5256651043891907, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": 0.4355368912220001, "eval_rewards/margins": 1.452234148979187, "eval_rewards/rejected": -1.0166972875595093, "eval_runtime": 164.1914, "eval_samples_per_second": 12.181, "eval_steps_per_second": 0.384, "step": 500 }, { "epoch": 0.53, "learning_rate": 4.5814220183486234e-07, "logits/chosen": -2.971991777420044, "logits/rejected": -2.9626731872558594, "logps/chosen": -387.75872802734375, "logps/rejected": -341.24224853515625, "loss": 0.5611, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0966944545507431, "rewards/margins": 0.9770743250846863, "rewards/rejected": -1.0737688541412354, "step": 510 }, { "epoch": 0.54, "learning_rate": 4.562308868501529e-07, "logits/chosen": -2.97809100151062, "logits/rejected": -3.0156943798065186, "logps/chosen": -325.83837890625, "logps/rejected": -321.0384826660156, "loss": 0.5693, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.27487924695014954, "rewards/margins": 1.646512746810913, "rewards/rejected": -1.371633529663086, "step": 520 }, { "epoch": 0.55, "learning_rate": 4.543195718654434e-07, "logits/chosen": -3.0082881450653076, "logits/rejected": -3.003408193588257, "logps/chosen": -274.6020812988281, "logps/rejected": -240.13998413085938, "loss": 0.5953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0502743124961853, "rewards/margins": 0.5772665739059448, "rewards/rejected": -0.6275408864021301, "step": 530 }, { "epoch": 0.56, "learning_rate": 4.5240825688073394e-07, "logits/chosen": -3.0475857257843018, "logits/rejected": -3.0587058067321777, "logps/chosen": -345.28802490234375, "logps/rejected": -276.25018310546875, "loss": 0.559, "rewards/accuracies": 0.875, "rewards/chosen": 0.6526178121566772, "rewards/margins": 1.6864182949066162, "rewards/rejected": -1.033800482749939, "step": 540 }, { "epoch": 0.57, "learning_rate": 4.504969418960244e-07, "logits/chosen": -2.9780993461608887, "logits/rejected": -3.0339550971984863, "logps/chosen": -318.60699462890625, "logps/rejected": -363.83966064453125, "loss": 0.5182, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3249278664588928, "rewards/margins": 1.6138547658920288, "rewards/rejected": -1.2889269590377808, "step": 550 }, { "epoch": 0.58, "learning_rate": 4.4858562691131495e-07, "logits/chosen": -3.0293617248535156, "logits/rejected": -3.0541815757751465, "logps/chosen": -355.3965759277344, "logps/rejected": -341.19097900390625, "loss": 0.5655, "rewards/accuracies": 0.75, "rewards/chosen": 0.15244658291339874, "rewards/margins": 1.1955846548080444, "rewards/rejected": -1.043138027191162, "step": 560 }, { "epoch": 0.59, "learning_rate": 4.466743119266055e-07, "logits/chosen": -2.955909252166748, "logits/rejected": -2.966557502746582, "logps/chosen": -339.918701171875, "logps/rejected": -312.85992431640625, "loss": 0.5342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.24385514855384827, "rewards/margins": 1.2561490535736084, "rewards/rejected": -1.5000044107437134, "step": 570 }, { "epoch": 0.6, "learning_rate": 4.44762996941896e-07, "logits/chosen": -2.9457859992980957, "logits/rejected": -2.921659231185913, "logps/chosen": -361.46905517578125, "logps/rejected": -314.6666259765625, "loss": 0.5347, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.31199535727500916, "rewards/margins": 1.5148388147354126, "rewards/rejected": -1.202843427658081, "step": 580 }, { "epoch": 0.61, "learning_rate": 4.4285168195718655e-07, "logits/chosen": -2.9674103260040283, "logits/rejected": -2.9832658767700195, "logps/chosen": -279.5147705078125, "logps/rejected": -283.4952697753906, "loss": 0.5475, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.18109655380249023, "rewards/margins": 1.081386685371399, "rewards/rejected": -1.2624832391738892, "step": 590 }, { "epoch": 0.62, "learning_rate": 4.40940366972477e-07, "logits/chosen": -3.038327932357788, "logits/rejected": -3.081512928009033, "logps/chosen": -282.9052429199219, "logps/rejected": -260.5687255859375, "loss": 0.5879, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0032925487030297518, "rewards/margins": 0.903986930847168, "rewards/rejected": -0.9006943702697754, "step": 600 }, { "epoch": 0.62, "eval_logits/chosen": -2.9869041442871094, "eval_logits/rejected": -2.991122007369995, "eval_logps/chosen": -362.4848327636719, "eval_logps/rejected": -301.2572021484375, "eval_loss": 0.5287741422653198, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": 0.47069627046585083, "eval_rewards/margins": 1.3998188972473145, "eval_rewards/rejected": -0.9291225075721741, "eval_runtime": 164.0279, "eval_samples_per_second": 12.193, "eval_steps_per_second": 0.384, "step": 600 }, { "epoch": 0.63, "learning_rate": 4.3902905198776756e-07, "logits/chosen": -3.0266683101654053, "logits/rejected": -3.0426414012908936, "logps/chosen": -345.2246398925781, "logps/rejected": -280.60711669921875, "loss": 0.5172, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.3867380917072296, "rewards/margins": 1.5311682224273682, "rewards/rejected": -1.14443039894104, "step": 610 }, { "epoch": 0.64, "learning_rate": 4.371177370030581e-07, "logits/chosen": -3.0024008750915527, "logits/rejected": -3.0336501598358154, "logps/chosen": -340.01483154296875, "logps/rejected": -288.4037170410156, "loss": 0.5674, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.09759467095136642, "rewards/margins": 1.4280248880386353, "rewards/rejected": -1.330430030822754, "step": 620 }, { "epoch": 0.65, "learning_rate": 4.352064220183486e-07, "logits/chosen": -3.073171377182007, "logits/rejected": -3.0693984031677246, "logps/chosen": -268.47442626953125, "logps/rejected": -253.87173461914062, "loss": 0.6129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08323542028665543, "rewards/margins": 1.263319969177246, "rewards/rejected": -1.180084466934204, "step": 630 }, { "epoch": 0.66, "learning_rate": 4.3329510703363915e-07, "logits/chosen": -3.1394124031066895, "logits/rejected": -3.147449493408203, "logps/chosen": -316.50323486328125, "logps/rejected": -256.6443786621094, "loss": 0.5405, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0077073900029063225, "rewards/margins": 1.4150127172470093, "rewards/rejected": -1.407305359840393, "step": 640 }, { "epoch": 0.67, "learning_rate": 4.313837920489297e-07, "logits/chosen": -3.015110731124878, "logits/rejected": -3.0439746379852295, "logps/chosen": -309.4215087890625, "logps/rejected": -278.88934326171875, "loss": 0.5239, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3628634512424469, "rewards/margins": 1.4821045398712158, "rewards/rejected": -1.1192409992218018, "step": 650 }, { "epoch": 0.68, "learning_rate": 4.2947247706422016e-07, "logits/chosen": -3.0345845222473145, "logits/rejected": -2.997607469558716, "logps/chosen": -311.18719482421875, "logps/rejected": -289.7060852050781, "loss": 0.5288, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4103531241416931, "rewards/margins": 1.0822376012802124, "rewards/rejected": -1.4925907850265503, "step": 660 }, { "epoch": 0.69, "learning_rate": 4.275611620795107e-07, "logits/chosen": -3.0080935955047607, "logits/rejected": -3.015535593032837, "logps/chosen": -377.9685974121094, "logps/rejected": -297.92169189453125, "loss": 0.5683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14489376544952393, "rewards/margins": 1.1130046844482422, "rewards/rejected": -0.9681110382080078, "step": 670 }, { "epoch": 0.7, "learning_rate": 4.2564984709480123e-07, "logits/chosen": -3.030597448348999, "logits/rejected": -3.059508800506592, "logps/chosen": -368.32635498046875, "logps/rejected": -274.53619384765625, "loss": 0.6557, "rewards/accuracies": 0.75, "rewards/chosen": 0.3344075083732605, "rewards/margins": 1.3792588710784912, "rewards/rejected": -1.0448510646820068, "step": 680 }, { "epoch": 0.71, "learning_rate": 4.2373853211009176e-07, "logits/chosen": -3.0355846881866455, "logits/rejected": -3.0617101192474365, "logps/chosen": -310.24530029296875, "logps/rejected": -280.7437438964844, "loss": 0.5629, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.31564414501190186, "rewards/margins": 1.532622218132019, "rewards/rejected": -1.2169779539108276, "step": 690 }, { "epoch": 0.72, "learning_rate": 4.2182721712538224e-07, "logits/chosen": -3.029533863067627, "logits/rejected": -3.05369234085083, "logps/chosen": -370.49945068359375, "logps/rejected": -285.1793212890625, "loss": 0.6773, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3230968117713928, "rewards/margins": 1.0617311000823975, "rewards/rejected": -0.7386342287063599, "step": 700 }, { "epoch": 0.72, "eval_logits/chosen": -3.041776180267334, "eval_logits/rejected": -3.0563852787017822, "eval_logps/chosen": -366.7193603515625, "eval_logps/rejected": -301.1505432128906, "eval_loss": 0.585310697555542, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": 0.047242674976587296, "eval_rewards/margins": 0.9657005667686462, "eval_rewards/rejected": -0.9184578657150269, "eval_runtime": 164.258, "eval_samples_per_second": 12.176, "eval_steps_per_second": 0.384, "step": 700 }, { "epoch": 0.73, "learning_rate": 4.199159021406727e-07, "logits/chosen": -2.963630199432373, "logits/rejected": -3.031212329864502, "logps/chosen": -297.58990478515625, "logps/rejected": -283.17572021484375, "loss": 0.6067, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13306304812431335, "rewards/margins": 1.0036863088607788, "rewards/rejected": -0.8706234097480774, "step": 710 }, { "epoch": 0.74, "learning_rate": 4.1800458715596325e-07, "logits/chosen": -3.0382869243621826, "logits/rejected": -3.0224924087524414, "logps/chosen": -373.01947021484375, "logps/rejected": -315.932861328125, "loss": 0.6166, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.028559958562254906, "rewards/margins": 0.9609702825546265, "rewards/rejected": -0.9324103593826294, "step": 720 }, { "epoch": 0.75, "learning_rate": 4.160932721712538e-07, "logits/chosen": -3.0072388648986816, "logits/rejected": -3.0005228519439697, "logps/chosen": -340.4766540527344, "logps/rejected": -306.3741149902344, "loss": 0.6079, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.09662823379039764, "rewards/margins": 1.1530828475952148, "rewards/rejected": -1.0564546585083008, "step": 730 }, { "epoch": 0.76, "learning_rate": 4.141819571865443e-07, "logits/chosen": -2.9518847465515137, "logits/rejected": -2.9550204277038574, "logps/chosen": -325.9070739746094, "logps/rejected": -244.12588500976562, "loss": 0.564, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1386883705854416, "rewards/margins": 1.7188549041748047, "rewards/rejected": -1.5801665782928467, "step": 740 }, { "epoch": 0.77, "learning_rate": 4.1227064220183485e-07, "logits/chosen": -2.9738943576812744, "logits/rejected": -3.009288787841797, "logps/chosen": -306.73614501953125, "logps/rejected": -284.35089111328125, "loss": 0.5213, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16552898287773132, "rewards/margins": 1.8094953298568726, "rewards/rejected": -1.6439664363861084, "step": 750 }, { "epoch": 0.78, "learning_rate": 4.103593272171253e-07, "logits/chosen": -2.9576098918914795, "logits/rejected": -2.9751369953155518, "logps/chosen": -336.5853576660156, "logps/rejected": -326.5455017089844, "loss": 0.5703, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07825515419244766, "rewards/margins": 1.1398742198944092, "rewards/rejected": -1.0616191625595093, "step": 760 }, { "epoch": 0.79, "learning_rate": 4.0844801223241586e-07, "logits/chosen": -3.0122196674346924, "logits/rejected": -2.9879307746887207, "logps/chosen": -350.8817138671875, "logps/rejected": -298.84307861328125, "loss": 0.5197, "rewards/accuracies": 0.75, "rewards/chosen": 0.03767753392457962, "rewards/margins": 1.09770929813385, "rewards/rejected": -1.0600318908691406, "step": 770 }, { "epoch": 0.8, "learning_rate": 4.065366972477064e-07, "logits/chosen": -2.9043805599212646, "logits/rejected": -2.9711837768554688, "logps/chosen": -379.1385803222656, "logps/rejected": -296.9505920410156, "loss": 0.5669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3532111942768097, "rewards/margins": 1.7610466480255127, "rewards/rejected": -1.407835602760315, "step": 780 }, { "epoch": 0.82, "learning_rate": 4.046253822629969e-07, "logits/chosen": -2.9510416984558105, "logits/rejected": -2.961275100708008, "logps/chosen": -330.33673095703125, "logps/rejected": -288.71173095703125, "loss": 0.5177, "rewards/accuracies": 0.75, "rewards/chosen": 0.301142156124115, "rewards/margins": 1.2936238050460815, "rewards/rejected": -0.9924817085266113, "step": 790 }, { "epoch": 0.83, "learning_rate": 4.0271406727828745e-07, "logits/chosen": -2.991361141204834, "logits/rejected": -2.974353790283203, "logps/chosen": -322.8855895996094, "logps/rejected": -271.7654113769531, "loss": 0.5263, "rewards/accuracies": 0.625, "rewards/chosen": 0.06158037111163139, "rewards/margins": 1.1184431314468384, "rewards/rejected": -1.056862711906433, "step": 800 }, { "epoch": 0.83, "eval_logits/chosen": -2.963681936264038, "eval_logits/rejected": -2.966184139251709, "eval_logps/chosen": -364.9457702636719, "eval_logps/rejected": -303.87957763671875, "eval_loss": 0.5150811076164246, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": 0.22460374236106873, "eval_rewards/margins": 1.4159626960754395, "eval_rewards/rejected": -1.191359043121338, "eval_runtime": 163.931, "eval_samples_per_second": 12.2, "eval_steps_per_second": 0.384, "step": 800 }, { "epoch": 0.84, "learning_rate": 4.00802752293578e-07, "logits/chosen": -2.974116802215576, "logits/rejected": -2.9998645782470703, "logps/chosen": -330.64910888671875, "logps/rejected": -294.6690368652344, "loss": 0.5031, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18794824182987213, "rewards/margins": 1.2928552627563477, "rewards/rejected": -1.1049071550369263, "step": 810 }, { "epoch": 0.85, "learning_rate": 3.9889143730886847e-07, "logits/chosen": -2.9930388927459717, "logits/rejected": -2.983773946762085, "logps/chosen": -376.36212158203125, "logps/rejected": -305.14111328125, "loss": 0.5357, "rewards/accuracies": 0.75, "rewards/chosen": 0.27564454078674316, "rewards/margins": 1.5530188083648682, "rewards/rejected": -1.277374267578125, "step": 820 }, { "epoch": 0.86, "learning_rate": 3.96980122324159e-07, "logits/chosen": -3.0382747650146484, "logits/rejected": -3.0700857639312744, "logps/chosen": -313.2106018066406, "logps/rejected": -256.5130310058594, "loss": 0.556, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03291673585772514, "rewards/margins": 1.2974836826324463, "rewards/rejected": -1.3304002285003662, "step": 830 }, { "epoch": 0.87, "learning_rate": 3.9506880733944953e-07, "logits/chosen": -3.0589098930358887, "logits/rejected": -3.058842897415161, "logps/chosen": -304.68658447265625, "logps/rejected": -276.25177001953125, "loss": 0.5578, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.22373457252979279, "rewards/margins": 1.489611268043518, "rewards/rejected": -1.7133458852767944, "step": 840 }, { "epoch": 0.88, "learning_rate": 3.9315749235474006e-07, "logits/chosen": -3.037079334259033, "logits/rejected": -3.0386836528778076, "logps/chosen": -347.38897705078125, "logps/rejected": -334.3331298828125, "loss": 0.5433, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.15273378789424896, "rewards/margins": 1.5688612461090088, "rewards/rejected": -1.7215951681137085, "step": 850 }, { "epoch": 0.89, "learning_rate": 3.912461773700306e-07, "logits/chosen": -2.9914333820343018, "logits/rejected": -3.013286828994751, "logps/chosen": -361.6410217285156, "logps/rejected": -342.3985900878906, "loss": 0.5464, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03575097769498825, "rewards/margins": 1.2531265020370483, "rewards/rejected": -1.2888776063919067, "step": 860 }, { "epoch": 0.9, "learning_rate": 3.8933486238532107e-07, "logits/chosen": -2.9541945457458496, "logits/rejected": -2.979830265045166, "logps/chosen": -424.258544921875, "logps/rejected": -299.7648620605469, "loss": 0.581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.013498688116669655, "rewards/margins": 1.3451616764068604, "rewards/rejected": -1.331662893295288, "step": 870 }, { "epoch": 0.91, "learning_rate": 3.874235474006116e-07, "logits/chosen": -2.9790916442871094, "logits/rejected": -2.987037181854248, "logps/chosen": -364.68048095703125, "logps/rejected": -290.4891052246094, "loss": 0.58, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.007329714484512806, "rewards/margins": 1.4278209209442139, "rewards/rejected": -1.4351506233215332, "step": 880 }, { "epoch": 0.92, "learning_rate": 3.8551223241590214e-07, "logits/chosen": -2.986210823059082, "logits/rejected": -2.9739222526550293, "logps/chosen": -300.7494812011719, "logps/rejected": -278.2732849121094, "loss": 0.5741, "rewards/accuracies": 0.75, "rewards/chosen": -0.04843021556735039, "rewards/margins": 1.3019744157791138, "rewards/rejected": -1.3504045009613037, "step": 890 }, { "epoch": 0.93, "learning_rate": 3.8360091743119267e-07, "logits/chosen": -2.9838929176330566, "logits/rejected": -2.9902117252349854, "logps/chosen": -306.20025634765625, "logps/rejected": -289.5735168457031, "loss": 0.5366, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.058762937784194946, "rewards/margins": 1.2602014541625977, "rewards/rejected": -1.2014386653900146, "step": 900 }, { "epoch": 0.93, "eval_logits/chosen": -2.9907381534576416, "eval_logits/rejected": -2.982360363006592, "eval_logps/chosen": -364.6807556152344, "eval_logps/rejected": -302.8385314941406, "eval_loss": 0.5133689641952515, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": 0.25110283493995667, "eval_rewards/margins": 1.338356614112854, "eval_rewards/rejected": -1.0872538089752197, "eval_runtime": 164.3114, "eval_samples_per_second": 12.172, "eval_steps_per_second": 0.383, "step": 900 }, { "epoch": 0.94, "learning_rate": 3.816896024464832e-07, "logits/chosen": -3.0253748893737793, "logits/rejected": -2.9562289714813232, "logps/chosen": -281.73016357421875, "logps/rejected": -248.2506866455078, "loss": 0.5377, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.28038138151168823, "rewards/margins": 1.607208251953125, "rewards/rejected": -1.326826810836792, "step": 910 }, { "epoch": 0.95, "learning_rate": 3.797782874617737e-07, "logits/chosen": -2.9797048568725586, "logits/rejected": -2.932326555252075, "logps/chosen": -333.2131042480469, "logps/rejected": -267.63128662109375, "loss": 0.4959, "rewards/accuracies": 0.6875, "rewards/chosen": 0.26507893204689026, "rewards/margins": 1.226858139038086, "rewards/rejected": -0.9617794156074524, "step": 920 }, { "epoch": 0.96, "learning_rate": 3.778669724770642e-07, "logits/chosen": -2.9677836894989014, "logits/rejected": -2.9711012840270996, "logps/chosen": -301.1932067871094, "logps/rejected": -239.915771484375, "loss": 0.5646, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.06536471843719482, "rewards/margins": 1.3896596431732178, "rewards/rejected": -1.3242948055267334, "step": 930 }, { "epoch": 0.97, "learning_rate": 3.7595565749235474e-07, "logits/chosen": -3.003399133682251, "logits/rejected": -2.9879281520843506, "logps/chosen": -351.9979553222656, "logps/rejected": -264.519775390625, "loss": 0.585, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3593365252017975, "rewards/margins": 1.1577335596084595, "rewards/rejected": -0.7983969449996948, "step": 940 }, { "epoch": 0.98, "learning_rate": 3.740443425076452e-07, "logits/chosen": -2.9658942222595215, "logits/rejected": -2.982341766357422, "logps/chosen": -336.6238708496094, "logps/rejected": -299.588134765625, "loss": 0.5176, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2032889872789383, "rewards/margins": 1.3534172773361206, "rewards/rejected": -1.1501282453536987, "step": 950 }, { "epoch": 0.99, "learning_rate": 3.7213302752293575e-07, "logits/chosen": -2.951019763946533, "logits/rejected": -2.985151767730713, "logps/chosen": -338.8179931640625, "logps/rejected": -283.80328369140625, "loss": 0.538, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.049173761159181595, "rewards/margins": 0.8662222623825073, "rewards/rejected": -0.8170484304428101, "step": 960 }, { "epoch": 1.0, "learning_rate": 3.702217125382263e-07, "logits/chosen": -2.9252991676330566, "logits/rejected": -2.937505006790161, "logps/chosen": -354.7286682128906, "logps/rejected": -312.35333251953125, "loss": 0.4865, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2835424542427063, "rewards/margins": 1.7692314386367798, "rewards/rejected": -1.4856891632080078, "step": 970 }, { "epoch": 1.01, "learning_rate": 3.6831039755351677e-07, "logits/chosen": -2.975984573364258, "logits/rejected": -2.9734318256378174, "logps/chosen": -319.2844543457031, "logps/rejected": -303.3651428222656, "loss": 0.1133, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7135050296783447, "rewards/margins": 4.7504682540893555, "rewards/rejected": -3.0369625091552734, "step": 980 }, { "epoch": 1.02, "learning_rate": 3.663990825688073e-07, "logits/chosen": -2.899402141571045, "logits/rejected": -2.89802885055542, "logps/chosen": -288.45123291015625, "logps/rejected": -316.5885314941406, "loss": 0.1405, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7678325176239014, "rewards/margins": 5.483719348907471, "rewards/rejected": -3.715886354446411, "step": 990 }, { "epoch": 1.03, "learning_rate": 3.6448776758409783e-07, "logits/chosen": -2.8675971031188965, "logits/rejected": -2.89615797996521, "logps/chosen": -325.03863525390625, "logps/rejected": -333.436767578125, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 1.79555344581604, "rewards/margins": 4.759924411773682, "rewards/rejected": -2.9643709659576416, "step": 1000 }, { "epoch": 1.03, "eval_logits/chosen": -2.92020583152771, "eval_logits/rejected": -2.9095799922943115, "eval_logps/chosen": -364.1185302734375, "eval_logps/rejected": -306.2866516113281, "eval_loss": 0.5107486248016357, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": 0.307327002286911, "eval_rewards/margins": 1.739391803741455, "eval_rewards/rejected": -1.4320647716522217, "eval_runtime": 164.3142, "eval_samples_per_second": 12.172, "eval_steps_per_second": 0.383, "step": 1000 }, { "epoch": 1.04, "learning_rate": 3.6257645259938836e-07, "logits/chosen": -2.8848228454589844, "logits/rejected": -2.9434664249420166, "logps/chosen": -304.1281433105469, "logps/rejected": -323.9388732910156, "loss": 0.0912, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5237512588500977, "rewards/margins": 5.300021171569824, "rewards/rejected": -3.7762699127197266, "step": 1010 }, { "epoch": 1.05, "learning_rate": 3.606651376146789e-07, "logits/chosen": -2.818145275115967, "logits/rejected": -2.773864269256592, "logps/chosen": -315.73687744140625, "logps/rejected": -252.3991241455078, "loss": 0.1072, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2391575574874878, "rewards/margins": 4.052863121032715, "rewards/rejected": -2.8137052059173584, "step": 1020 }, { "epoch": 1.06, "learning_rate": 3.5875382262996937e-07, "logits/chosen": -2.8956587314605713, "logits/rejected": -2.88509202003479, "logps/chosen": -332.889404296875, "logps/rejected": -375.0550231933594, "loss": 0.0854, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7886276245117188, "rewards/margins": 4.777144432067871, "rewards/rejected": -2.9885172843933105, "step": 1030 }, { "epoch": 1.07, "learning_rate": 3.568425076452599e-07, "logits/chosen": -2.9382426738739014, "logits/rejected": -2.9390716552734375, "logps/chosen": -339.12451171875, "logps/rejected": -315.15625, "loss": 0.099, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.3939841985702515, "rewards/margins": 4.764640808105469, "rewards/rejected": -3.370656967163086, "step": 1040 }, { "epoch": 1.08, "learning_rate": 3.5493119266055044e-07, "logits/chosen": -2.8407671451568604, "logits/rejected": -2.821763753890991, "logps/chosen": -336.37298583984375, "logps/rejected": -257.6861267089844, "loss": 0.1132, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3384716510772705, "rewards/margins": 4.965681076049805, "rewards/rejected": -3.627209424972534, "step": 1050 }, { "epoch": 1.09, "learning_rate": 3.5301987767584097e-07, "logits/chosen": -2.8167824745178223, "logits/rejected": -2.810854434967041, "logps/chosen": -323.439208984375, "logps/rejected": -342.47991943359375, "loss": 0.2041, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7954254150390625, "rewards/margins": 4.722014427185059, "rewards/rejected": -3.9265894889831543, "step": 1060 }, { "epoch": 1.1, "learning_rate": 3.511085626911315e-07, "logits/chosen": -2.940957546234131, "logits/rejected": -3.0021321773529053, "logps/chosen": -366.2899475097656, "logps/rejected": -343.2218933105469, "loss": 0.3299, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4886529445648193, "rewards/margins": 5.359461307525635, "rewards/rejected": -3.870807647705078, "step": 1070 }, { "epoch": 1.11, "learning_rate": 3.49197247706422e-07, "logits/chosen": -2.933786392211914, "logits/rejected": -2.906247615814209, "logps/chosen": -261.5579833984375, "logps/rejected": -276.83026123046875, "loss": 0.149, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.48736995458602905, "rewards/margins": 4.608451843261719, "rewards/rejected": -4.121081829071045, "step": 1080 }, { "epoch": 1.12, "learning_rate": 3.472859327217125e-07, "logits/chosen": -2.8349316120147705, "logits/rejected": -2.9043667316436768, "logps/chosen": -364.6941833496094, "logps/rejected": -376.5315856933594, "loss": 0.1092, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2117881774902344, "rewards/margins": 5.32895565032959, "rewards/rejected": -4.117166996002197, "step": 1090 }, { "epoch": 1.14, "learning_rate": 3.4537461773700304e-07, "logits/chosen": -2.985729694366455, "logits/rejected": -2.8761606216430664, "logps/chosen": -258.05841064453125, "logps/rejected": -241.39053344726562, "loss": 0.1114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6233962774276733, "rewards/margins": 4.293813228607178, "rewards/rejected": -3.6704165935516357, "step": 1100 }, { "epoch": 1.14, "eval_logits/chosen": -2.9666378498077393, "eval_logits/rejected": -2.95609450340271, "eval_logps/chosen": -365.8598327636719, "eval_logps/rejected": -310.414794921875, "eval_loss": 0.534447431564331, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": 0.1331927627325058, "eval_rewards/margins": 1.9780747890472412, "eval_rewards/rejected": -1.8448821306228638, "eval_runtime": 164.1399, "eval_samples_per_second": 12.185, "eval_steps_per_second": 0.384, "step": 1100 }, { "epoch": 1.15, "learning_rate": 3.434633027522936e-07, "logits/chosen": -2.9507124423980713, "logits/rejected": -2.9483211040496826, "logps/chosen": -338.0868835449219, "logps/rejected": -325.01483154296875, "loss": 0.1007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6764055490493774, "rewards/margins": 5.684920310974121, "rewards/rejected": -4.008514404296875, "step": 1110 }, { "epoch": 1.16, "learning_rate": 3.415519877675841e-07, "logits/chosen": -2.976590156555176, "logits/rejected": -3.025784730911255, "logps/chosen": -277.34710693359375, "logps/rejected": -323.576171875, "loss": 0.1131, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.841607928276062, "rewards/margins": 4.658609867095947, "rewards/rejected": -3.8170018196105957, "step": 1120 }, { "epoch": 1.17, "learning_rate": 3.3964067278287464e-07, "logits/chosen": -3.0445570945739746, "logits/rejected": -3.0413312911987305, "logps/chosen": -337.9605407714844, "logps/rejected": -288.26666259765625, "loss": 0.1463, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5894230604171753, "rewards/margins": 5.3310723304748535, "rewards/rejected": -3.7416489124298096, "step": 1130 }, { "epoch": 1.18, "learning_rate": 3.377293577981651e-07, "logits/chosen": -2.9471421241760254, "logits/rejected": -2.9865708351135254, "logps/chosen": -288.2189025878906, "logps/rejected": -309.2388610839844, "loss": 0.0934, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4304211139678955, "rewards/margins": 5.832246780395508, "rewards/rejected": -4.401825428009033, "step": 1140 }, { "epoch": 1.19, "learning_rate": 3.3581804281345565e-07, "logits/chosen": -2.9803059101104736, "logits/rejected": -2.9711978435516357, "logps/chosen": -337.70697021484375, "logps/rejected": -298.4077453613281, "loss": 0.0967, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4117494821548462, "rewards/margins": 5.474527359008789, "rewards/rejected": -4.062777519226074, "step": 1150 }, { "epoch": 1.2, "learning_rate": 3.339067278287462e-07, "logits/chosen": -2.8603241443634033, "logits/rejected": -2.8709046840667725, "logps/chosen": -312.73504638671875, "logps/rejected": -306.9026794433594, "loss": 0.0785, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3839662075042725, "rewards/margins": 5.900813102722168, "rewards/rejected": -4.516847133636475, "step": 1160 }, { "epoch": 1.21, "learning_rate": 3.319954128440367e-07, "logits/chosen": -2.975525379180908, "logits/rejected": -2.9611260890960693, "logps/chosen": -325.86163330078125, "logps/rejected": -285.2755432128906, "loss": 0.0992, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0602104663848877, "rewards/margins": 4.849926948547363, "rewards/rejected": -3.7897167205810547, "step": 1170 }, { "epoch": 1.22, "learning_rate": 3.3008409785932725e-07, "logits/chosen": -2.877586841583252, "logits/rejected": -2.821748971939087, "logps/chosen": -324.6281433105469, "logps/rejected": -323.02301025390625, "loss": 0.0892, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1957648992538452, "rewards/margins": 5.952631950378418, "rewards/rejected": -4.756867408752441, "step": 1180 }, { "epoch": 1.23, "learning_rate": 3.2817278287461773e-07, "logits/chosen": -2.9451098442077637, "logits/rejected": -2.9684863090515137, "logps/chosen": -279.90216064453125, "logps/rejected": -338.3842468261719, "loss": 0.1045, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2665460109710693, "rewards/margins": 5.084068298339844, "rewards/rejected": -3.8175220489501953, "step": 1190 }, { "epoch": 1.24, "learning_rate": 3.262614678899082e-07, "logits/chosen": -2.889819383621216, "logits/rejected": -2.9235751628875732, "logps/chosen": -303.02838134765625, "logps/rejected": -356.177734375, "loss": 0.1338, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8252049684524536, "rewards/margins": 5.507418155670166, "rewards/rejected": -4.6822123527526855, "step": 1200 }, { "epoch": 1.24, "eval_logits/chosen": -2.9508416652679443, "eval_logits/rejected": -2.9459922313690186, "eval_logps/chosen": -368.0057678222656, "eval_logps/rejected": -313.3835144042969, "eval_loss": 0.534950315952301, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -0.0813969075679779, "eval_rewards/margins": 2.0603599548339844, "eval_rewards/rejected": -2.141756772994995, "eval_runtime": 164.0736, "eval_samples_per_second": 12.19, "eval_steps_per_second": 0.384, "step": 1200 }, { "epoch": 1.25, "learning_rate": 3.2435015290519874e-07, "logits/chosen": -2.916611671447754, "logits/rejected": -2.927777051925659, "logps/chosen": -283.2217712402344, "logps/rejected": -297.02850341796875, "loss": 0.0893, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0804212093353271, "rewards/margins": 5.549715518951416, "rewards/rejected": -4.469293594360352, "step": 1210 }, { "epoch": 1.26, "learning_rate": 3.2243883792048927e-07, "logits/chosen": -2.977875232696533, "logits/rejected": -2.986704111099243, "logps/chosen": -335.274658203125, "logps/rejected": -380.4412536621094, "loss": 0.1303, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4366910457611084, "rewards/margins": 5.666425704956055, "rewards/rejected": -4.229735374450684, "step": 1220 }, { "epoch": 1.27, "learning_rate": 3.205275229357798e-07, "logits/chosen": -2.868638753890991, "logits/rejected": -2.8948395252227783, "logps/chosen": -387.9947204589844, "logps/rejected": -389.3511657714844, "loss": 0.1117, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3278411626815796, "rewards/margins": 5.698910236358643, "rewards/rejected": -4.371068954467773, "step": 1230 }, { "epoch": 1.28, "learning_rate": 3.186162079510703e-07, "logits/chosen": -2.9128642082214355, "logits/rejected": -2.91692852973938, "logps/chosen": -351.5616149902344, "logps/rejected": -373.9852600097656, "loss": 0.1466, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3745101690292358, "rewards/margins": 5.025930404663086, "rewards/rejected": -3.6514201164245605, "step": 1240 }, { "epoch": 1.29, "learning_rate": 3.167048929663608e-07, "logits/chosen": -2.955967426300049, "logits/rejected": -2.923954486846924, "logps/chosen": -278.7707824707031, "logps/rejected": -281.9942321777344, "loss": 0.1003, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.0149275064468384, "rewards/margins": 5.51505184173584, "rewards/rejected": -4.500124931335449, "step": 1250 }, { "epoch": 1.3, "learning_rate": 3.1479357798165134e-07, "logits/chosen": -2.9661002159118652, "logits/rejected": -2.948564052581787, "logps/chosen": -339.5476989746094, "logps/rejected": -321.3616638183594, "loss": 0.0984, "rewards/accuracies": 0.9375, "rewards/chosen": 1.038356065750122, "rewards/margins": 5.16934061050415, "rewards/rejected": -4.130984306335449, "step": 1260 }, { "epoch": 1.31, "learning_rate": 3.128822629969419e-07, "logits/chosen": -2.862750291824341, "logits/rejected": -2.8853306770324707, "logps/chosen": -350.9757995605469, "logps/rejected": -333.6067199707031, "loss": 0.1195, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.147101640701294, "rewards/margins": 5.550149917602539, "rewards/rejected": -4.403048038482666, "step": 1270 }, { "epoch": 1.32, "learning_rate": 3.109709480122324e-07, "logits/chosen": -2.9103734493255615, "logits/rejected": -2.9115426540374756, "logps/chosen": -286.4703063964844, "logps/rejected": -298.028076171875, "loss": 0.1039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1303044557571411, "rewards/margins": 5.467093467712402, "rewards/rejected": -4.336789131164551, "step": 1280 }, { "epoch": 1.33, "learning_rate": 3.0905963302752294e-07, "logits/chosen": -2.7934536933898926, "logits/rejected": -2.880432605743408, "logps/chosen": -312.3811340332031, "logps/rejected": -316.52215576171875, "loss": 0.1168, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9917739629745483, "rewards/margins": 5.529503345489502, "rewards/rejected": -4.537729263305664, "step": 1290 }, { "epoch": 1.34, "learning_rate": 3.071483180428134e-07, "logits/chosen": -2.9836788177490234, "logits/rejected": -2.9340128898620605, "logps/chosen": -382.5011291503906, "logps/rejected": -295.3705749511719, "loss": 0.0979, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5801069736480713, "rewards/margins": 5.381975173950195, "rewards/rejected": -3.801867723464966, "step": 1300 }, { "epoch": 1.34, "eval_logits/chosen": -2.9200918674468994, "eval_logits/rejected": -2.9171833992004395, "eval_logps/chosen": -368.1370849609375, "eval_logps/rejected": -314.4656982421875, "eval_loss": 0.5474238991737366, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -0.09453116357326508, "eval_rewards/margins": 2.1554412841796875, "eval_rewards/rejected": -2.249972343444824, "eval_runtime": 164.7724, "eval_samples_per_second": 12.138, "eval_steps_per_second": 0.382, "step": 1300 }, { "epoch": 1.35, "learning_rate": 3.0523700305810395e-07, "logits/chosen": -2.9245269298553467, "logits/rejected": -2.9436841011047363, "logps/chosen": -330.574951171875, "logps/rejected": -342.8641662597656, "loss": 0.1043, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3053131103515625, "rewards/margins": 5.542339324951172, "rewards/rejected": -4.237026214599609, "step": 1310 }, { "epoch": 1.36, "learning_rate": 3.033256880733945e-07, "logits/chosen": -2.9248242378234863, "logits/rejected": -2.935176372528076, "logps/chosen": -286.57171630859375, "logps/rejected": -278.96746826171875, "loss": 0.104, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7352786660194397, "rewards/margins": 5.138430118560791, "rewards/rejected": -4.403151035308838, "step": 1320 }, { "epoch": 1.37, "learning_rate": 3.01414373088685e-07, "logits/chosen": -2.8516454696655273, "logits/rejected": -2.7985987663269043, "logps/chosen": -344.3554382324219, "logps/rejected": -341.85986328125, "loss": 0.1138, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1177431344985962, "rewards/margins": 5.806307792663574, "rewards/rejected": -4.688565254211426, "step": 1330 }, { "epoch": 1.38, "learning_rate": 2.9950305810397555e-07, "logits/chosen": -2.906580686569214, "logits/rejected": -2.97481369972229, "logps/chosen": -335.2439880371094, "logps/rejected": -320.96929931640625, "loss": 0.1256, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8055500984191895, "rewards/margins": 6.211544990539551, "rewards/rejected": -5.405994892120361, "step": 1340 }, { "epoch": 1.39, "learning_rate": 2.9759174311926603e-07, "logits/chosen": -2.9623026847839355, "logits/rejected": -2.9445879459381104, "logps/chosen": -323.3135070800781, "logps/rejected": -329.90496826171875, "loss": 0.1101, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9566561579704285, "rewards/margins": 7.040016174316406, "rewards/rejected": -6.083359718322754, "step": 1350 }, { "epoch": 1.4, "learning_rate": 2.9568042813455656e-07, "logits/chosen": -2.8847999572753906, "logits/rejected": -2.8803889751434326, "logps/chosen": -342.5070495605469, "logps/rejected": -269.67431640625, "loss": 0.1087, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4027617573738098, "rewards/margins": 4.224934101104736, "rewards/rejected": -3.8221726417541504, "step": 1360 }, { "epoch": 1.41, "learning_rate": 2.937691131498471e-07, "logits/chosen": -2.949441909790039, "logits/rejected": -2.9045028686523438, "logps/chosen": -338.4786376953125, "logps/rejected": -334.69189453125, "loss": 0.1096, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9296592473983765, "rewards/margins": 5.884530067443848, "rewards/rejected": -4.954870700836182, "step": 1370 }, { "epoch": 1.42, "learning_rate": 2.918577981651376e-07, "logits/chosen": -2.932290554046631, "logits/rejected": -2.9427378177642822, "logps/chosen": -280.7291564941406, "logps/rejected": -314.51953125, "loss": 0.1177, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7406275868415833, "rewards/margins": 5.4294962882995605, "rewards/rejected": -4.688868522644043, "step": 1380 }, { "epoch": 1.43, "learning_rate": 2.8994648318042816e-07, "logits/chosen": -2.9206976890563965, "logits/rejected": -2.9712460041046143, "logps/chosen": -358.7654113769531, "logps/rejected": -313.7131652832031, "loss": 0.112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8611236810684204, "rewards/margins": 5.0603203773498535, "rewards/rejected": -4.199196815490723, "step": 1390 }, { "epoch": 1.44, "learning_rate": 2.8803516819571863e-07, "logits/chosen": -2.979775905609131, "logits/rejected": -2.9886953830718994, "logps/chosen": -379.1478576660156, "logps/rejected": -358.28179931640625, "loss": 0.1366, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4315125942230225, "rewards/margins": 6.2576189041137695, "rewards/rejected": -4.826106071472168, "step": 1400 }, { "epoch": 1.44, "eval_logits/chosen": -2.9143617153167725, "eval_logits/rejected": -2.9134304523468018, "eval_logps/chosen": -371.9402770996094, "eval_logps/rejected": -315.933837890625, "eval_loss": 0.5439518094062805, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -0.47485068440437317, "eval_rewards/margins": 1.921934962272644, "eval_rewards/rejected": -2.3967857360839844, "eval_runtime": 165.1605, "eval_samples_per_second": 12.109, "eval_steps_per_second": 0.381, "step": 1400 }, { "epoch": 1.46, "learning_rate": 2.8612385321100917e-07, "logits/chosen": -2.8306632041931152, "logits/rejected": -2.9071240425109863, "logps/chosen": -294.634033203125, "logps/rejected": -327.87896728515625, "loss": 0.1281, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9478427171707153, "rewards/margins": 5.661940097808838, "rewards/rejected": -4.714097499847412, "step": 1410 }, { "epoch": 1.47, "learning_rate": 2.842125382262997e-07, "logits/chosen": -2.9503073692321777, "logits/rejected": -2.9379420280456543, "logps/chosen": -308.3216247558594, "logps/rejected": -308.57574462890625, "loss": 0.1361, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2659790515899658, "rewards/margins": 5.583965301513672, "rewards/rejected": -4.317985534667969, "step": 1420 }, { "epoch": 1.48, "learning_rate": 2.8230122324159023e-07, "logits/chosen": -2.8611526489257812, "logits/rejected": -2.9008944034576416, "logps/chosen": -375.9707946777344, "logps/rejected": -374.29913330078125, "loss": 0.1194, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3691354990005493, "rewards/margins": 6.1049299240112305, "rewards/rejected": -4.735795021057129, "step": 1430 }, { "epoch": 1.49, "learning_rate": 2.8038990825688076e-07, "logits/chosen": -2.994868516921997, "logits/rejected": -2.9603443145751953, "logps/chosen": -263.36474609375, "logps/rejected": -250.1201934814453, "loss": 0.1098, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.8430646657943726, "rewards/margins": 4.541081428527832, "rewards/rejected": -3.698017120361328, "step": 1440 }, { "epoch": 1.5, "learning_rate": 2.784785932721712e-07, "logits/chosen": -2.9225330352783203, "logits/rejected": -2.925787925720215, "logps/chosen": -319.07574462890625, "logps/rejected": -327.4895324707031, "loss": 0.1336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1656725406646729, "rewards/margins": 5.621832847595215, "rewards/rejected": -4.456160068511963, "step": 1450 }, { "epoch": 1.51, "learning_rate": 2.765672782874617e-07, "logits/chosen": -2.896669864654541, "logits/rejected": -2.895914316177368, "logps/chosen": -331.0616760253906, "logps/rejected": -286.6056213378906, "loss": 0.1367, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3613170385360718, "rewards/margins": 5.62969970703125, "rewards/rejected": -4.268383026123047, "step": 1460 }, { "epoch": 1.52, "learning_rate": 2.7465596330275225e-07, "logits/chosen": -2.955178737640381, "logits/rejected": -2.9608724117279053, "logps/chosen": -350.20703125, "logps/rejected": -255.1401824951172, "loss": 0.104, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.233802080154419, "rewards/margins": 5.684638023376465, "rewards/rejected": -4.450836658477783, "step": 1470 }, { "epoch": 1.53, "learning_rate": 2.727446483180428e-07, "logits/chosen": -2.9283223152160645, "logits/rejected": -2.952641010284424, "logps/chosen": -313.20306396484375, "logps/rejected": -316.35333251953125, "loss": 0.1125, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9838566780090332, "rewards/margins": 5.377806186676025, "rewards/rejected": -4.393948554992676, "step": 1480 }, { "epoch": 1.54, "learning_rate": 2.708333333333333e-07, "logits/chosen": -2.850559949874878, "logits/rejected": -2.833322048187256, "logps/chosen": -341.3831481933594, "logps/rejected": -314.9398498535156, "loss": 0.0943, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7250410914421082, "rewards/margins": 5.533560276031494, "rewards/rejected": -4.808518409729004, "step": 1490 }, { "epoch": 1.55, "learning_rate": 2.6892201834862385e-07, "logits/chosen": -2.937903642654419, "logits/rejected": -2.9050183296203613, "logps/chosen": -326.52691650390625, "logps/rejected": -302.30694580078125, "loss": 0.1042, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6643081903457642, "rewards/margins": 5.094948768615723, "rewards/rejected": -4.43064022064209, "step": 1500 }, { "epoch": 1.55, "eval_logits/chosen": -2.930583953857422, "eval_logits/rejected": -2.936053991317749, "eval_logps/chosen": -372.2054138183594, "eval_logps/rejected": -318.7686462402344, "eval_loss": 0.552377462387085, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -0.5013648867607117, "eval_rewards/margins": 2.178898334503174, "eval_rewards/rejected": -2.6802632808685303, "eval_runtime": 167.7329, "eval_samples_per_second": 11.924, "eval_steps_per_second": 0.376, "step": 1500 }, { "epoch": 1.56, "learning_rate": 2.6701070336391433e-07, "logits/chosen": -2.90950345993042, "logits/rejected": -2.87695050239563, "logps/chosen": -378.1885070800781, "logps/rejected": -322.77337646484375, "loss": 0.0924, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4327830076217651, "rewards/margins": 5.619394779205322, "rewards/rejected": -4.186612129211426, "step": 1510 }, { "epoch": 1.57, "learning_rate": 2.6509938837920486e-07, "logits/chosen": -2.849907398223877, "logits/rejected": -2.8833765983581543, "logps/chosen": -289.51605224609375, "logps/rejected": -320.0068054199219, "loss": 0.0975, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8192381858825684, "rewards/margins": 4.99267053604126, "rewards/rejected": -4.173432350158691, "step": 1520 }, { "epoch": 1.58, "learning_rate": 2.631880733944954e-07, "logits/chosen": -2.964118719100952, "logits/rejected": -2.984459400177002, "logps/chosen": -303.44866943359375, "logps/rejected": -299.94482421875, "loss": 0.105, "rewards/accuracies": 0.9375, "rewards/chosen": 0.37635737657546997, "rewards/margins": 4.797235488891602, "rewards/rejected": -4.420877933502197, "step": 1530 }, { "epoch": 1.59, "learning_rate": 2.612767584097859e-07, "logits/chosen": -2.9242002964019775, "logits/rejected": -2.9575366973876953, "logps/chosen": -308.75616455078125, "logps/rejected": -282.21380615234375, "loss": 0.1016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4492263793945312, "rewards/margins": 5.21218204498291, "rewards/rejected": -3.7629554271698, "step": 1540 }, { "epoch": 1.6, "learning_rate": 2.5936544342507646e-07, "logits/chosen": -2.902669668197632, "logits/rejected": -2.932953357696533, "logps/chosen": -306.1797790527344, "logps/rejected": -315.36700439453125, "loss": 0.1412, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.0359132289886475, "rewards/margins": 5.480694770812988, "rewards/rejected": -4.444781303405762, "step": 1550 }, { "epoch": 1.61, "learning_rate": 2.5745412844036693e-07, "logits/chosen": -2.862687110900879, "logits/rejected": -2.9322876930236816, "logps/chosen": -361.41583251953125, "logps/rejected": -309.0520935058594, "loss": 0.1228, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3139979839324951, "rewards/margins": 5.389029026031494, "rewards/rejected": -4.07503080368042, "step": 1560 }, { "epoch": 1.62, "learning_rate": 2.5554281345565747e-07, "logits/chosen": -2.937886953353882, "logits/rejected": -2.9431166648864746, "logps/chosen": -312.0148620605469, "logps/rejected": -329.90863037109375, "loss": 0.1005, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7804427146911621, "rewards/margins": 5.692571640014648, "rewards/rejected": -4.912128925323486, "step": 1570 }, { "epoch": 1.63, "learning_rate": 2.53631498470948e-07, "logits/chosen": -2.972658634185791, "logits/rejected": -2.9699690341949463, "logps/chosen": -347.6422424316406, "logps/rejected": -312.10858154296875, "loss": 0.0857, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3729878664016724, "rewards/margins": 5.7400736808776855, "rewards/rejected": -4.367085933685303, "step": 1580 }, { "epoch": 1.64, "learning_rate": 2.5172018348623853e-07, "logits/chosen": -2.8885016441345215, "logits/rejected": -2.9170756340026855, "logps/chosen": -355.39813232421875, "logps/rejected": -336.2825927734375, "loss": 0.0852, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1573688983917236, "rewards/margins": 5.676226615905762, "rewards/rejected": -4.518857479095459, "step": 1590 }, { "epoch": 1.65, "learning_rate": 2.4980886850152906e-07, "logits/chosen": -2.895519256591797, "logits/rejected": -2.851107120513916, "logps/chosen": -341.70904541015625, "logps/rejected": -305.86480712890625, "loss": 0.1313, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.0455483198165894, "rewards/margins": 5.084899425506592, "rewards/rejected": -4.039351463317871, "step": 1600 }, { "epoch": 1.65, "eval_logits/chosen": -2.8998661041259766, "eval_logits/rejected": -2.9059910774230957, "eval_logps/chosen": -369.42547607421875, "eval_logps/rejected": -313.8333435058594, "eval_loss": 0.5333446264266968, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.22337232530117035, "eval_rewards/margins": 1.9633642435073853, "eval_rewards/rejected": -2.1867363452911377, "eval_runtime": 167.9925, "eval_samples_per_second": 11.905, "eval_steps_per_second": 0.375, "step": 1600 }, { "epoch": 1.66, "learning_rate": 2.478975535168196e-07, "logits/chosen": -2.892448663711548, "logits/rejected": -2.8929343223571777, "logps/chosen": -341.67431640625, "logps/rejected": -287.61383056640625, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 0.8163628578186035, "rewards/margins": 4.475451469421387, "rewards/rejected": -3.659088611602783, "step": 1610 }, { "epoch": 1.67, "learning_rate": 2.459862385321101e-07, "logits/chosen": -2.9265084266662598, "logits/rejected": -2.9548892974853516, "logps/chosen": -382.5340576171875, "logps/rejected": -347.4888916015625, "loss": 0.1018, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6915686130523682, "rewards/margins": 6.067580223083496, "rewards/rejected": -4.376010894775391, "step": 1620 }, { "epoch": 1.68, "learning_rate": 2.440749235474006e-07, "logits/chosen": -2.8851680755615234, "logits/rejected": -2.903552532196045, "logps/chosen": -342.8496398925781, "logps/rejected": -317.72845458984375, "loss": 0.1005, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.0220366716384888, "rewards/margins": 5.634666442871094, "rewards/rejected": -4.6126298904418945, "step": 1630 }, { "epoch": 1.69, "learning_rate": 2.421636085626911e-07, "logits/chosen": -2.8142755031585693, "logits/rejected": -2.8399770259857178, "logps/chosen": -344.5146484375, "logps/rejected": -301.95928955078125, "loss": 0.0981, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6624820232391357, "rewards/margins": 6.2719035148620605, "rewards/rejected": -4.6094207763671875, "step": 1640 }, { "epoch": 1.7, "learning_rate": 2.402522935779816e-07, "logits/chosen": -2.8734793663024902, "logits/rejected": -2.876209259033203, "logps/chosen": -339.1289367675781, "logps/rejected": -351.3002014160156, "loss": 0.1207, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1800963878631592, "rewards/margins": 5.746790409088135, "rewards/rejected": -4.566694736480713, "step": 1650 }, { "epoch": 1.71, "learning_rate": 2.3834097859327215e-07, "logits/chosen": -2.922632932662964, "logits/rejected": -2.973679780960083, "logps/chosen": -285.7434997558594, "logps/rejected": -304.81536865234375, "loss": 0.1184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9965537786483765, "rewards/margins": 5.077877998352051, "rewards/rejected": -4.081325054168701, "step": 1660 }, { "epoch": 1.72, "learning_rate": 2.3642966360856268e-07, "logits/chosen": -2.9541144371032715, "logits/rejected": -2.929344654083252, "logps/chosen": -326.902587890625, "logps/rejected": -306.6372985839844, "loss": 0.0972, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7346267700195312, "rewards/margins": 5.666862964630127, "rewards/rejected": -3.9322357177734375, "step": 1670 }, { "epoch": 1.73, "learning_rate": 2.345183486238532e-07, "logits/chosen": -2.857109785079956, "logits/rejected": -2.8801960945129395, "logps/chosen": -308.47369384765625, "logps/rejected": -375.78692626953125, "loss": 0.0899, "rewards/accuracies": 0.9375, "rewards/chosen": 1.088503122329712, "rewards/margins": 5.432967185974121, "rewards/rejected": -4.344464302062988, "step": 1680 }, { "epoch": 1.74, "learning_rate": 2.3260703363914372e-07, "logits/chosen": -2.9087703227996826, "logits/rejected": -2.9551265239715576, "logps/chosen": -381.01959228515625, "logps/rejected": -338.1856994628906, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 1.9901115894317627, "rewards/margins": 5.308182716369629, "rewards/rejected": -3.318070888519287, "step": 1690 }, { "epoch": 1.75, "learning_rate": 2.3069571865443425e-07, "logits/chosen": -2.9682905673980713, "logits/rejected": -2.9819796085357666, "logps/chosen": -277.3031921386719, "logps/rejected": -325.71649169921875, "loss": 0.1629, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5376863479614258, "rewards/margins": 5.757152557373047, "rewards/rejected": -5.219466209411621, "step": 1700 }, { "epoch": 1.75, "eval_logits/chosen": -2.909576654434204, "eval_logits/rejected": -2.9181904792785645, "eval_logps/chosen": -371.09588623046875, "eval_logps/rejected": -319.5571594238281, "eval_loss": 0.5655122399330139, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.3904118835926056, "eval_rewards/margins": 2.3687071800231934, "eval_rewards/rejected": -2.7591187953948975, "eval_runtime": 164.0305, "eval_samples_per_second": 12.193, "eval_steps_per_second": 0.384, "step": 1700 }, { "epoch": 1.76, "learning_rate": 2.2878440366972476e-07, "logits/chosen": -2.9808902740478516, "logits/rejected": -2.9869067668914795, "logps/chosen": -326.5906677246094, "logps/rejected": -384.11944580078125, "loss": 0.0867, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5069032311439514, "rewards/margins": 5.799986362457275, "rewards/rejected": -5.2930827140808105, "step": 1710 }, { "epoch": 1.78, "learning_rate": 2.268730886850153e-07, "logits/chosen": -2.8385868072509766, "logits/rejected": -2.9057114124298096, "logps/chosen": -325.4120178222656, "logps/rejected": -309.59136962890625, "loss": 0.0989, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1605224609375, "rewards/margins": 5.591654300689697, "rewards/rejected": -4.431131362915039, "step": 1720 }, { "epoch": 1.79, "learning_rate": 2.249617737003058e-07, "logits/chosen": -2.8839237689971924, "logits/rejected": -2.8796088695526123, "logps/chosen": -352.14886474609375, "logps/rejected": -371.3978576660156, "loss": 0.1089, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0480362176895142, "rewards/margins": 5.741724967956543, "rewards/rejected": -4.693687915802002, "step": 1730 }, { "epoch": 1.8, "learning_rate": 2.2305045871559633e-07, "logits/chosen": -2.883430242538452, "logits/rejected": -2.8605690002441406, "logps/chosen": -345.23272705078125, "logps/rejected": -331.52325439453125, "loss": 0.1311, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.180057168006897, "rewards/margins": 5.705449104309082, "rewards/rejected": -4.525391578674316, "step": 1740 }, { "epoch": 1.81, "learning_rate": 2.2113914373088686e-07, "logits/chosen": -2.962857484817505, "logits/rejected": -2.9696333408355713, "logps/chosen": -377.6351623535156, "logps/rejected": -362.8825378417969, "loss": 0.1067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8095831871032715, "rewards/margins": 5.408170700073242, "rewards/rejected": -4.598587512969971, "step": 1750 }, { "epoch": 1.82, "learning_rate": 2.1922782874617736e-07, "logits/chosen": -2.9029316902160645, "logits/rejected": -2.9539952278137207, "logps/chosen": -289.267822265625, "logps/rejected": -366.2077941894531, "loss": 0.0929, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.47237950563430786, "rewards/margins": 6.199611186981201, "rewards/rejected": -5.727231502532959, "step": 1760 }, { "epoch": 1.83, "learning_rate": 2.1731651376146787e-07, "logits/chosen": -2.8582608699798584, "logits/rejected": -2.8988289833068848, "logps/chosen": -277.98406982421875, "logps/rejected": -328.0066833496094, "loss": 0.1048, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7530109286308289, "rewards/margins": 5.7497334480285645, "rewards/rejected": -4.99672269821167, "step": 1770 }, { "epoch": 1.84, "learning_rate": 2.154051987767584e-07, "logits/chosen": -2.9023542404174805, "logits/rejected": -2.9242827892303467, "logps/chosen": -314.0538330078125, "logps/rejected": -299.74420166015625, "loss": 0.12, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7526110410690308, "rewards/margins": 5.213059425354004, "rewards/rejected": -4.460447311401367, "step": 1780 }, { "epoch": 1.85, "learning_rate": 2.134938837920489e-07, "logits/chosen": -2.8436591625213623, "logits/rejected": -2.8463809490203857, "logps/chosen": -255.3565673828125, "logps/rejected": -273.94464111328125, "loss": 0.0818, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.06976697593927383, "rewards/margins": 4.849638938903809, "rewards/rejected": -4.919405937194824, "step": 1790 }, { "epoch": 1.86, "learning_rate": 2.1158256880733944e-07, "logits/chosen": -2.9030632972717285, "logits/rejected": -2.8941729068756104, "logps/chosen": -367.6248779296875, "logps/rejected": -302.12249755859375, "loss": 0.0993, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.4400938153266907, "rewards/margins": 5.627293586730957, "rewards/rejected": -5.187199592590332, "step": 1800 }, { "epoch": 1.86, "eval_logits/chosen": -2.8477160930633545, "eval_logits/rejected": -2.8601999282836914, "eval_logps/chosen": -374.3083801269531, "eval_logps/rejected": -321.6667785644531, "eval_loss": 0.5605445504188538, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": -0.711660623550415, "eval_rewards/margins": 2.2584221363067627, "eval_rewards/rejected": -2.9700827598571777, "eval_runtime": 164.7388, "eval_samples_per_second": 12.14, "eval_steps_per_second": 0.382, "step": 1800 }, { "epoch": 1.87, "learning_rate": 2.0967125382262994e-07, "logits/chosen": -2.8688273429870605, "logits/rejected": -2.868739366531372, "logps/chosen": -337.7546081542969, "logps/rejected": -312.27569580078125, "loss": 0.1163, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7751233577728271, "rewards/margins": 5.873146057128906, "rewards/rejected": -5.098022937774658, "step": 1810 }, { "epoch": 1.88, "learning_rate": 2.0775993883792048e-07, "logits/chosen": -2.8374381065368652, "logits/rejected": -2.8085215091705322, "logps/chosen": -352.53192138671875, "logps/rejected": -316.3230895996094, "loss": 0.0933, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.9547752141952515, "rewards/margins": 5.203994274139404, "rewards/rejected": -4.249218940734863, "step": 1820 }, { "epoch": 1.89, "learning_rate": 2.05848623853211e-07, "logits/chosen": -2.874891757965088, "logits/rejected": -2.839573621749878, "logps/chosen": -366.4833679199219, "logps/rejected": -319.9959411621094, "loss": 0.0966, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.212294101715088, "rewards/margins": 5.914790630340576, "rewards/rejected": -4.702496528625488, "step": 1830 }, { "epoch": 1.9, "learning_rate": 2.0393730886850151e-07, "logits/chosen": -2.8277204036712646, "logits/rejected": -2.878105640411377, "logps/chosen": -378.3955383300781, "logps/rejected": -314.2088623046875, "loss": 0.0863, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7747803926467896, "rewards/margins": 5.967954158782959, "rewards/rejected": -5.193174362182617, "step": 1840 }, { "epoch": 1.91, "learning_rate": 2.0202599388379205e-07, "logits/chosen": -2.8658251762390137, "logits/rejected": -2.8985071182250977, "logps/chosen": -339.0852355957031, "logps/rejected": -384.46112060546875, "loss": 0.0786, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9408707618713379, "rewards/margins": 5.965841770172119, "rewards/rejected": -5.024971008300781, "step": 1850 }, { "epoch": 1.92, "learning_rate": 2.0011467889908258e-07, "logits/chosen": -2.829246997833252, "logits/rejected": -2.8732194900512695, "logps/chosen": -381.65655517578125, "logps/rejected": -284.0471496582031, "loss": 0.0945, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5882245898246765, "rewards/margins": 5.461816787719727, "rewards/rejected": -4.873592376708984, "step": 1860 }, { "epoch": 1.93, "learning_rate": 1.9820336391437308e-07, "logits/chosen": -2.8398656845092773, "logits/rejected": -2.8620615005493164, "logps/chosen": -309.2004089355469, "logps/rejected": -296.1297302246094, "loss": 0.093, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8088359832763672, "rewards/margins": 5.4316887855529785, "rewards/rejected": -4.6228532791137695, "step": 1870 }, { "epoch": 1.94, "learning_rate": 1.9629204892966362e-07, "logits/chosen": -2.941847324371338, "logits/rejected": -2.950911283493042, "logps/chosen": -329.76617431640625, "logps/rejected": -295.0538635253906, "loss": 0.1113, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2022342681884766, "rewards/margins": 5.600251197814941, "rewards/rejected": -4.398016929626465, "step": 1880 }, { "epoch": 1.95, "learning_rate": 1.943807339449541e-07, "logits/chosen": -2.8697052001953125, "logits/rejected": -2.901094913482666, "logps/chosen": -311.559326171875, "logps/rejected": -333.4175720214844, "loss": 0.0948, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7903985977172852, "rewards/margins": 5.597433090209961, "rewards/rejected": -4.807034015655518, "step": 1890 }, { "epoch": 1.96, "learning_rate": 1.9246941896024463e-07, "logits/chosen": -2.939120054244995, "logits/rejected": -2.9861233234405518, "logps/chosen": -320.5481262207031, "logps/rejected": -347.7875061035156, "loss": 0.1116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6647524833679199, "rewards/margins": 6.036587238311768, "rewards/rejected": -5.3718342781066895, "step": 1900 }, { "epoch": 1.96, "eval_logits/chosen": -2.914954900741577, "eval_logits/rejected": -2.927724599838257, "eval_logps/chosen": -373.57073974609375, "eval_logps/rejected": -319.2250061035156, "eval_loss": 0.5649252533912659, "eval_rewards/accuracies": 0.7539682388305664, "eval_rewards/chosen": -0.6378985047340393, "eval_rewards/margins": 2.088006019592285, "eval_rewards/rejected": -2.7259042263031006, "eval_runtime": 164.2377, "eval_samples_per_second": 12.177, "eval_steps_per_second": 0.384, "step": 1900 }, { "epoch": 1.97, "learning_rate": 1.9055810397553516e-07, "logits/chosen": -2.9238085746765137, "logits/rejected": -2.9308090209960938, "logps/chosen": -313.63665771484375, "logps/rejected": -304.2153625488281, "loss": 0.1214, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6549821496009827, "rewards/margins": 4.487866401672363, "rewards/rejected": -3.8328843116760254, "step": 1910 }, { "epoch": 1.98, "learning_rate": 1.8864678899082566e-07, "logits/chosen": -2.8484818935394287, "logits/rejected": -2.866534948348999, "logps/chosen": -347.75689697265625, "logps/rejected": -279.4710693359375, "loss": 0.1082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3496394753456116, "rewards/margins": 4.312170505523682, "rewards/rejected": -3.9625308513641357, "step": 1920 }, { "epoch": 1.99, "learning_rate": 1.867354740061162e-07, "logits/chosen": -2.9492716789245605, "logits/rejected": -2.956796169281006, "logps/chosen": -307.85845947265625, "logps/rejected": -332.1622619628906, "loss": 0.1061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4475575387477875, "rewards/margins": 5.942025184631348, "rewards/rejected": -5.494467735290527, "step": 1930 }, { "epoch": 2.0, "learning_rate": 1.8482415902140673e-07, "logits/chosen": -2.923053503036499, "logits/rejected": -2.920959949493408, "logps/chosen": -331.311767578125, "logps/rejected": -320.19586181640625, "loss": 0.0801, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.0848143100738525, "rewards/margins": 5.605216979980469, "rewards/rejected": -4.520401954650879, "step": 1940 }, { "epoch": 2.01, "learning_rate": 1.8291284403669723e-07, "logits/chosen": -2.881058692932129, "logits/rejected": -2.93363618850708, "logps/chosen": -309.11212158203125, "logps/rejected": -322.50665283203125, "loss": 0.0254, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7099040746688843, "rewards/margins": 6.735787868499756, "rewards/rejected": -5.025883674621582, "step": 1950 }, { "epoch": 2.02, "learning_rate": 1.8100152905198777e-07, "logits/chosen": -2.7668607234954834, "logits/rejected": -2.7822773456573486, "logps/chosen": -351.9031677246094, "logps/rejected": -415.9180603027344, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 1.090543508529663, "rewards/margins": 7.8234100341796875, "rewards/rejected": -6.7328667640686035, "step": 1960 }, { "epoch": 2.03, "learning_rate": 1.7909021406727827e-07, "logits/chosen": -2.9600331783294678, "logits/rejected": -2.8843834400177, "logps/chosen": -309.39642333984375, "logps/rejected": -306.4966735839844, "loss": 0.0282, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9620206952095032, "rewards/margins": 6.899697303771973, "rewards/rejected": -5.937676429748535, "step": 1970 }, { "epoch": 2.04, "learning_rate": 1.771788990825688e-07, "logits/chosen": -2.923687696456909, "logits/rejected": -2.9661598205566406, "logps/chosen": -330.7653503417969, "logps/rejected": -352.5653076171875, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 1.7405881881713867, "rewards/margins": 8.00406265258789, "rewards/rejected": -6.263474941253662, "step": 1980 }, { "epoch": 2.05, "learning_rate": 1.7526758409785934e-07, "logits/chosen": -2.9299581050872803, "logits/rejected": -2.8949360847473145, "logps/chosen": -362.6274719238281, "logps/rejected": -363.09149169921875, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 1.0967247486114502, "rewards/margins": 7.466977119445801, "rewards/rejected": -6.37025260925293, "step": 1990 }, { "epoch": 2.06, "learning_rate": 1.7335626911314984e-07, "logits/chosen": -2.845986843109131, "logits/rejected": -2.8671188354492188, "logps/chosen": -274.60870361328125, "logps/rejected": -295.59478759765625, "loss": 0.0193, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.014614641666412354, "rewards/margins": 6.743406772613525, "rewards/rejected": -6.758021354675293, "step": 2000 }, { "epoch": 2.06, "eval_logits/chosen": -2.882474422454834, "eval_logits/rejected": -2.8919453620910645, "eval_logps/chosen": -376.60406494140625, "eval_logps/rejected": -329.82745361328125, "eval_loss": 0.6121558547019958, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": -0.941230058670044, "eval_rewards/margins": 2.8449153900146484, "eval_rewards/rejected": -3.7861454486846924, "eval_runtime": 164.9655, "eval_samples_per_second": 12.124, "eval_steps_per_second": 0.382, "step": 2000 }, { "epoch": 2.07, "learning_rate": 1.7144495412844037e-07, "logits/chosen": -2.9446756839752197, "logits/rejected": -2.953831911087036, "logps/chosen": -353.67376708984375, "logps/rejected": -347.7017822265625, "loss": 0.0214, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4430414140224457, "rewards/margins": 7.570870399475098, "rewards/rejected": -7.127829074859619, "step": 2010 }, { "epoch": 2.08, "learning_rate": 1.6953363914373088e-07, "logits/chosen": -2.940734386444092, "logits/rejected": -2.9746463298797607, "logps/chosen": -348.05328369140625, "logps/rejected": -333.2148742675781, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 1.2261013984680176, "rewards/margins": 8.234363555908203, "rewards/rejected": -7.008261680603027, "step": 2020 }, { "epoch": 2.09, "learning_rate": 1.6762232415902138e-07, "logits/chosen": -2.875319719314575, "logits/rejected": -2.855180263519287, "logps/chosen": -306.70050048828125, "logps/rejected": -349.5177917480469, "loss": 0.0203, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3287349343299866, "rewards/margins": 7.007230281829834, "rewards/rejected": -6.678494930267334, "step": 2030 }, { "epoch": 2.11, "learning_rate": 1.6571100917431192e-07, "logits/chosen": -2.9315755367279053, "logits/rejected": -2.930187702178955, "logps/chosen": -306.041259765625, "logps/rejected": -305.6824951171875, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 0.5696347951889038, "rewards/margins": 7.535808563232422, "rewards/rejected": -6.9661736488342285, "step": 2040 }, { "epoch": 2.12, "learning_rate": 1.6379969418960242e-07, "logits/chosen": -2.8848538398742676, "logits/rejected": -2.905867338180542, "logps/chosen": -389.7286682128906, "logps/rejected": -386.9409484863281, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 1.3652772903442383, "rewards/margins": 8.069811820983887, "rewards/rejected": -6.704535484313965, "step": 2050 }, { "epoch": 2.13, "learning_rate": 1.6188837920489295e-07, "logits/chosen": -2.8828773498535156, "logits/rejected": -2.8783280849456787, "logps/chosen": -359.57666015625, "logps/rejected": -339.35345458984375, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 0.8870366811752319, "rewards/margins": 8.021839141845703, "rewards/rejected": -7.13480281829834, "step": 2060 }, { "epoch": 2.14, "learning_rate": 1.5997706422018349e-07, "logits/chosen": -2.88275408744812, "logits/rejected": -2.929903984069824, "logps/chosen": -322.4759216308594, "logps/rejected": -423.30682373046875, "loss": 0.0171, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5736116170883179, "rewards/margins": 8.667892456054688, "rewards/rejected": -7.0942816734313965, "step": 2070 }, { "epoch": 2.15, "learning_rate": 1.58065749235474e-07, "logits/chosen": -2.90217661857605, "logits/rejected": -2.8910233974456787, "logps/chosen": -413.54522705078125, "logps/rejected": -385.43341064453125, "loss": 0.0127, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9447764158248901, "rewards/margins": 7.688788414001465, "rewards/rejected": -6.744012355804443, "step": 2080 }, { "epoch": 2.16, "learning_rate": 1.5615443425076452e-07, "logits/chosen": -2.7480947971343994, "logits/rejected": -2.747185230255127, "logps/chosen": -354.87493896484375, "logps/rejected": -351.3457946777344, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 0.513919472694397, "rewards/margins": 8.175249099731445, "rewards/rejected": -7.661329746246338, "step": 2090 }, { "epoch": 2.17, "learning_rate": 1.5424311926605506e-07, "logits/chosen": -2.8367042541503906, "logits/rejected": -2.8408215045928955, "logps/chosen": -298.4134826660156, "logps/rejected": -300.4717712402344, "loss": 0.0175, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2804573178291321, "rewards/margins": 7.449028968811035, "rewards/rejected": -7.729485511779785, "step": 2100 }, { "epoch": 2.17, "eval_logits/chosen": -2.839301586151123, "eval_logits/rejected": -2.8474462032318115, "eval_logps/chosen": -383.2186279296875, "eval_logps/rejected": -338.7977294921875, "eval_loss": 0.6523212790489197, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -1.6026798486709595, "eval_rewards/margins": 3.080495834350586, "eval_rewards/rejected": -4.683175563812256, "eval_runtime": 165.5125, "eval_samples_per_second": 12.084, "eval_steps_per_second": 0.381, "step": 2100 }, { "epoch": 2.18, "learning_rate": 1.5233180428134556e-07, "logits/chosen": -2.9054439067840576, "logits/rejected": -2.913278102874756, "logps/chosen": -323.6388244628906, "logps/rejected": -323.73419189453125, "loss": 0.0138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8671594858169556, "rewards/margins": 7.387481689453125, "rewards/rejected": -6.520320892333984, "step": 2110 }, { "epoch": 2.19, "learning_rate": 1.504204892966361e-07, "logits/chosen": -2.800830364227295, "logits/rejected": -2.8197312355041504, "logps/chosen": -359.3259582519531, "logps/rejected": -394.8112487792969, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 0.4949137568473816, "rewards/margins": 9.693056106567383, "rewards/rejected": -9.198141098022461, "step": 2120 }, { "epoch": 2.2, "learning_rate": 1.485091743119266e-07, "logits/chosen": -2.890476942062378, "logits/rejected": -2.925356388092041, "logps/chosen": -315.17742919921875, "logps/rejected": -378.8518371582031, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 0.27211472392082214, "rewards/margins": 8.301239967346191, "rewards/rejected": -8.02912425994873, "step": 2130 }, { "epoch": 2.21, "learning_rate": 1.465978593272171e-07, "logits/chosen": -2.820862054824829, "logits/rejected": -2.8192131519317627, "logps/chosen": -233.12344360351562, "logps/rejected": -238.68014526367188, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -0.23832440376281738, "rewards/margins": 7.234049320220947, "rewards/rejected": -7.472373008728027, "step": 2140 }, { "epoch": 2.22, "learning_rate": 1.4468654434250764e-07, "logits/chosen": -2.7812180519104004, "logits/rejected": -2.839566946029663, "logps/chosen": -400.56396484375, "logps/rejected": -418.9078063964844, "loss": 0.0183, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.306671380996704, "rewards/margins": 9.309846878051758, "rewards/rejected": -8.003175735473633, "step": 2150 }, { "epoch": 2.23, "learning_rate": 1.4277522935779814e-07, "logits/chosen": -2.857119083404541, "logits/rejected": -2.8069121837615967, "logps/chosen": -346.87091064453125, "logps/rejected": -364.4837341308594, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 0.42278409004211426, "rewards/margins": 7.95212459564209, "rewards/rejected": -7.5293402671813965, "step": 2160 }, { "epoch": 2.24, "learning_rate": 1.4086391437308867e-07, "logits/chosen": -2.828322172164917, "logits/rejected": -2.872556209564209, "logps/chosen": -371.8916015625, "logps/rejected": -404.73162841796875, "loss": 0.0183, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5918793082237244, "rewards/margins": 8.069284439086914, "rewards/rejected": -7.477405548095703, "step": 2170 }, { "epoch": 2.25, "learning_rate": 1.389525993883792e-07, "logits/chosen": -2.8225388526916504, "logits/rejected": -2.8491692543029785, "logps/chosen": -293.30047607421875, "logps/rejected": -313.1904296875, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -0.3963487148284912, "rewards/margins": 7.192727565765381, "rewards/rejected": -7.589076042175293, "step": 2180 }, { "epoch": 2.26, "learning_rate": 1.370412844036697e-07, "logits/chosen": -2.8727283477783203, "logits/rejected": -2.850238561630249, "logps/chosen": -306.93695068359375, "logps/rejected": -345.2283020019531, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 0.09160284698009491, "rewards/margins": 7.823256492614746, "rewards/rejected": -7.731653690338135, "step": 2190 }, { "epoch": 2.27, "learning_rate": 1.3512996941896024e-07, "logits/chosen": -2.8500583171844482, "logits/rejected": -2.8594961166381836, "logps/chosen": -283.15771484375, "logps/rejected": -311.3097839355469, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 0.03272407129406929, "rewards/margins": 8.4783353805542, "rewards/rejected": -8.445611953735352, "step": 2200 }, { "epoch": 2.27, "eval_logits/chosen": -2.806851387023926, "eval_logits/rejected": -2.812812328338623, "eval_logps/chosen": -386.09039306640625, "eval_logps/rejected": -342.2704162597656, "eval_loss": 0.6702452898025513, "eval_rewards/accuracies": 0.7420634627342224, "eval_rewards/chosen": -1.8898613452911377, "eval_rewards/margins": 3.1405844688415527, "eval_rewards/rejected": -5.0304460525512695, "eval_runtime": 165.1336, "eval_samples_per_second": 12.111, "eval_steps_per_second": 0.382, "step": 2200 }, { "epoch": 2.28, "learning_rate": 1.3321865443425075e-07, "logits/chosen": -2.7931952476501465, "logits/rejected": -2.8073198795318604, "logps/chosen": -338.2393493652344, "logps/rejected": -352.142333984375, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -0.13923540711402893, "rewards/margins": 8.090972900390625, "rewards/rejected": -8.230208396911621, "step": 2210 }, { "epoch": 2.29, "learning_rate": 1.3130733944954128e-07, "logits/chosen": -2.74983811378479, "logits/rejected": -2.748617649078369, "logps/chosen": -358.42401123046875, "logps/rejected": -402.30328369140625, "loss": 0.0129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6026802062988281, "rewards/margins": 9.088810920715332, "rewards/rejected": -7.4861297607421875, "step": 2220 }, { "epoch": 2.3, "learning_rate": 1.293960244648318e-07, "logits/chosen": -2.8457603454589844, "logits/rejected": -2.8344614505767822, "logps/chosen": -365.7544860839844, "logps/rejected": -347.2682189941406, "loss": 0.0163, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.001628613448701799, "rewards/margins": 8.202213287353516, "rewards/rejected": -8.203841209411621, "step": 2230 }, { "epoch": 2.31, "learning_rate": 1.2748470948012232e-07, "logits/chosen": -2.8093724250793457, "logits/rejected": -2.81803822517395, "logps/chosen": -340.55352783203125, "logps/rejected": -407.7304992675781, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 0.47526517510414124, "rewards/margins": 8.480849266052246, "rewards/rejected": -8.005583763122559, "step": 2240 }, { "epoch": 2.32, "learning_rate": 1.2557339449541285e-07, "logits/chosen": -2.8672242164611816, "logits/rejected": -2.855675220489502, "logps/chosen": -343.7786865234375, "logps/rejected": -365.4543151855469, "loss": 0.0212, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.35550713539123535, "rewards/margins": 8.239429473876953, "rewards/rejected": -7.8839240074157715, "step": 2250 }, { "epoch": 2.33, "learning_rate": 1.2366207951070336e-07, "logits/chosen": -2.8647074699401855, "logits/rejected": -2.8598999977111816, "logps/chosen": -335.4911193847656, "logps/rejected": -369.7025146484375, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -0.1035120040178299, "rewards/margins": 8.297709465026855, "rewards/rejected": -8.401222229003906, "step": 2260 }, { "epoch": 2.34, "learning_rate": 1.217507645259939e-07, "logits/chosen": -2.8038744926452637, "logits/rejected": -2.8534445762634277, "logps/chosen": -327.49005126953125, "logps/rejected": -348.63116455078125, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 0.261627733707428, "rewards/margins": 8.056116104125977, "rewards/rejected": -7.794488430023193, "step": 2270 }, { "epoch": 2.35, "learning_rate": 1.198394495412844e-07, "logits/chosen": -2.879183053970337, "logits/rejected": -2.9233028888702393, "logps/chosen": -337.91790771484375, "logps/rejected": -346.1882019042969, "loss": 0.0176, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.12253670394420624, "rewards/margins": 7.606545925140381, "rewards/rejected": -7.4840087890625, "step": 2280 }, { "epoch": 2.36, "learning_rate": 1.1792813455657493e-07, "logits/chosen": -2.7606253623962402, "logits/rejected": -2.8114898204803467, "logps/chosen": -337.6861877441406, "logps/rejected": -345.7854309082031, "loss": 0.0258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3526005446910858, "rewards/margins": 8.258612632751465, "rewards/rejected": -7.906012058258057, "step": 2290 }, { "epoch": 2.37, "learning_rate": 1.1601681957186543e-07, "logits/chosen": -2.8984854221343994, "logits/rejected": -2.912468910217285, "logps/chosen": -334.9092102050781, "logps/rejected": -334.67669677734375, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 0.7173303961753845, "rewards/margins": 7.802558898925781, "rewards/rejected": -7.085227966308594, "step": 2300 }, { "epoch": 2.37, "eval_logits/chosen": -2.8489737510681152, "eval_logits/rejected": -2.854724407196045, "eval_logps/chosen": -383.90655517578125, "eval_logps/rejected": -339.3347473144531, "eval_loss": 0.6559089422225952, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -1.6714773178100586, "eval_rewards/margins": 3.0653984546661377, "eval_rewards/rejected": -4.736875534057617, "eval_runtime": 164.8339, "eval_samples_per_second": 12.133, "eval_steps_per_second": 0.382, "step": 2300 }, { "epoch": 2.38, "learning_rate": 1.1410550458715595e-07, "logits/chosen": -2.8347411155700684, "logits/rejected": -2.851090908050537, "logps/chosen": -329.1361999511719, "logps/rejected": -359.9030456542969, "loss": 0.0278, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2210700958967209, "rewards/margins": 7.436942100524902, "rewards/rejected": -7.215872287750244, "step": 2310 }, { "epoch": 2.39, "learning_rate": 1.1219418960244648e-07, "logits/chosen": -2.8228423595428467, "logits/rejected": -2.841404438018799, "logps/chosen": -282.3636169433594, "logps/rejected": -409.47979736328125, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.2855607867240906, "rewards/margins": 8.16025161743164, "rewards/rejected": -8.445813179016113, "step": 2320 }, { "epoch": 2.4, "learning_rate": 1.10282874617737e-07, "logits/chosen": -2.8471851348876953, "logits/rejected": -2.8798093795776367, "logps/chosen": -295.41900634765625, "logps/rejected": -340.5544738769531, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -0.3259337544441223, "rewards/margins": 8.084188461303711, "rewards/rejected": -8.410122871398926, "step": 2330 }, { "epoch": 2.41, "learning_rate": 1.0837155963302752e-07, "logits/chosen": -2.636784076690674, "logits/rejected": -2.740302562713623, "logps/chosen": -287.13702392578125, "logps/rejected": -391.1552429199219, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 0.528149425983429, "rewards/margins": 9.060527801513672, "rewards/rejected": -8.532378196716309, "step": 2340 }, { "epoch": 2.43, "learning_rate": 1.0646024464831804e-07, "logits/chosen": -2.753213882446289, "logits/rejected": -2.822252035140991, "logps/chosen": -369.473388671875, "logps/rejected": -360.38983154296875, "loss": 0.0221, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3261668086051941, "rewards/margins": 8.502967834472656, "rewards/rejected": -8.829133033752441, "step": 2350 }, { "epoch": 2.44, "learning_rate": 1.0454892966360856e-07, "logits/chosen": -2.782691717147827, "logits/rejected": -2.868027448654175, "logps/chosen": -333.0803527832031, "logps/rejected": -355.0961608886719, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -0.5834169983863831, "rewards/margins": 8.312009811401367, "rewards/rejected": -8.895425796508789, "step": 2360 }, { "epoch": 2.45, "learning_rate": 1.0263761467889908e-07, "logits/chosen": -2.770711898803711, "logits/rejected": -2.796137809753418, "logps/chosen": -336.739990234375, "logps/rejected": -372.0965576171875, "loss": 0.0151, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.35953274369239807, "rewards/margins": 8.809865951538086, "rewards/rejected": -8.450332641601562, "step": 2370 }, { "epoch": 2.46, "learning_rate": 1.007262996941896e-07, "logits/chosen": -2.7547390460968018, "logits/rejected": -2.7793593406677246, "logps/chosen": -335.936279296875, "logps/rejected": -330.6647033691406, "loss": 0.0167, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.02794322930276394, "rewards/margins": 8.719170570373535, "rewards/rejected": -8.747113227844238, "step": 2380 }, { "epoch": 2.47, "learning_rate": 9.881498470948011e-08, "logits/chosen": -2.846524477005005, "logits/rejected": -2.799567222595215, "logps/chosen": -343.198486328125, "logps/rejected": -335.6533508300781, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.013063406571745872, "rewards/margins": 8.952108383178711, "rewards/rejected": -8.965171813964844, "step": 2390 }, { "epoch": 2.48, "learning_rate": 9.690366972477065e-08, "logits/chosen": -2.85577392578125, "logits/rejected": -2.8093135356903076, "logps/chosen": -333.2208251953125, "logps/rejected": -358.0810241699219, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.41089487075805664, "rewards/margins": 8.724878311157227, "rewards/rejected": -9.135773658752441, "step": 2400 }, { "epoch": 2.48, "eval_logits/chosen": -2.835172653198242, "eval_logits/rejected": -2.839359760284424, "eval_logps/chosen": -386.6546936035156, "eval_logps/rejected": -343.19000244140625, "eval_loss": 0.6733575463294983, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -1.946290373802185, "eval_rewards/margins": 3.1761116981506348, "eval_rewards/rejected": -5.122402191162109, "eval_runtime": 165.3843, "eval_samples_per_second": 12.093, "eval_steps_per_second": 0.381, "step": 2400 }, { "epoch": 2.49, "learning_rate": 9.499235474006116e-08, "logits/chosen": -2.846043109893799, "logits/rejected": -2.8555102348327637, "logps/chosen": -376.3670349121094, "logps/rejected": -341.2032470703125, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -0.03427610173821449, "rewards/margins": 8.654411315917969, "rewards/rejected": -8.688688278198242, "step": 2410 }, { "epoch": 2.5, "learning_rate": 9.308103975535168e-08, "logits/chosen": -2.8411316871643066, "logits/rejected": -2.8570432662963867, "logps/chosen": -373.59844970703125, "logps/rejected": -401.067138671875, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 1.0549451112747192, "rewards/margins": 8.842530250549316, "rewards/rejected": -7.787585258483887, "step": 2420 }, { "epoch": 2.51, "learning_rate": 9.116972477064219e-08, "logits/chosen": -2.895292282104492, "logits/rejected": -2.854443073272705, "logps/chosen": -345.359375, "logps/rejected": -408.4029846191406, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -0.30679136514663696, "rewards/margins": 7.920645713806152, "rewards/rejected": -8.227437019348145, "step": 2430 }, { "epoch": 2.52, "learning_rate": 8.925840978593272e-08, "logits/chosen": -2.835501194000244, "logits/rejected": -2.896915912628174, "logps/chosen": -264.5487365722656, "logps/rejected": -387.1824951171875, "loss": 0.019, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3389735221862793, "rewards/margins": 9.347002029418945, "rewards/rejected": -9.008028030395508, "step": 2440 }, { "epoch": 2.53, "learning_rate": 8.734709480122324e-08, "logits/chosen": -2.806790828704834, "logits/rejected": -2.8148555755615234, "logps/chosen": -308.4158630371094, "logps/rejected": -376.0751953125, "loss": 0.0166, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.38033682107925415, "rewards/margins": 8.450287818908691, "rewards/rejected": -8.8306245803833, "step": 2450 }, { "epoch": 2.54, "learning_rate": 8.543577981651376e-08, "logits/chosen": -2.7967381477355957, "logits/rejected": -2.792023181915283, "logps/chosen": -455.0721740722656, "logps/rejected": -405.89501953125, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 0.7061554789543152, "rewards/margins": 9.382705688476562, "rewards/rejected": -8.676549911499023, "step": 2460 }, { "epoch": 2.55, "learning_rate": 8.352446483180428e-08, "logits/chosen": -2.8607754707336426, "logits/rejected": -2.8268520832061768, "logps/chosen": -331.96820068359375, "logps/rejected": -321.39422607421875, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 0.4299864172935486, "rewards/margins": 8.559895515441895, "rewards/rejected": -8.129908561706543, "step": 2470 }, { "epoch": 2.56, "learning_rate": 8.161314984709481e-08, "logits/chosen": -2.8822827339172363, "logits/rejected": -2.893578052520752, "logps/chosen": -339.42449951171875, "logps/rejected": -356.1263427734375, "loss": 0.0083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2752775549888611, "rewards/margins": 8.704290390014648, "rewards/rejected": -8.4290132522583, "step": 2480 }, { "epoch": 2.57, "learning_rate": 7.970183486238531e-08, "logits/chosen": -2.828721284866333, "logits/rejected": -2.833087205886841, "logps/chosen": -328.60418701171875, "logps/rejected": -360.6470642089844, "loss": 0.0176, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.30725544691085815, "rewards/margins": 8.530462265014648, "rewards/rejected": -8.223207473754883, "step": 2490 }, { "epoch": 2.58, "learning_rate": 7.779051987767583e-08, "logits/chosen": -2.8711142539978027, "logits/rejected": -2.892519950866699, "logps/chosen": -370.29339599609375, "logps/rejected": -355.296875, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 0.0712406188249588, "rewards/margins": 8.155640602111816, "rewards/rejected": -8.084399223327637, "step": 2500 }, { "epoch": 2.58, "eval_logits/chosen": -2.8333258628845215, "eval_logits/rejected": -2.8368897438049316, "eval_logps/chosen": -388.3058776855469, "eval_logps/rejected": -347.57440185546875, "eval_loss": 0.6890397667884827, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -2.1114044189453125, "eval_rewards/margins": 3.4494407176971436, "eval_rewards/rejected": -5.560845375061035, "eval_runtime": 164.7492, "eval_samples_per_second": 12.14, "eval_steps_per_second": 0.382, "step": 2500 }, { "epoch": 2.59, "learning_rate": 7.587920489296635e-08, "logits/chosen": -2.855881690979004, "logits/rejected": -2.8854427337646484, "logps/chosen": -351.69769287109375, "logps/rejected": -358.4553527832031, "loss": 0.0164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2860868573188782, "rewards/margins": 8.237478256225586, "rewards/rejected": -8.523565292358398, "step": 2510 }, { "epoch": 2.6, "learning_rate": 7.396788990825688e-08, "logits/chosen": -2.8813681602478027, "logits/rejected": -2.9079108238220215, "logps/chosen": -322.7754821777344, "logps/rejected": -327.5832824707031, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -0.06775089353322983, "rewards/margins": 8.004460334777832, "rewards/rejected": -8.072211265563965, "step": 2520 }, { "epoch": 2.61, "learning_rate": 7.20565749235474e-08, "logits/chosen": -2.810084819793701, "logits/rejected": -2.815389394760132, "logps/chosen": -325.9468688964844, "logps/rejected": -330.6631164550781, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -0.12425418943166733, "rewards/margins": 8.769124984741211, "rewards/rejected": -8.893379211425781, "step": 2530 }, { "epoch": 2.62, "learning_rate": 7.014525993883792e-08, "logits/chosen": -2.7919540405273438, "logits/rejected": -2.7934675216674805, "logps/chosen": -353.1927185058594, "logps/rejected": -365.3847351074219, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 0.2072305679321289, "rewards/margins": 8.390886306762695, "rewards/rejected": -8.18365478515625, "step": 2540 }, { "epoch": 2.63, "learning_rate": 6.823394495412843e-08, "logits/chosen": -2.7678780555725098, "logits/rejected": -2.765697479248047, "logps/chosen": -358.8880615234375, "logps/rejected": -376.55706787109375, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 0.11311036348342896, "rewards/margins": 9.944357872009277, "rewards/rejected": -9.83124828338623, "step": 2550 }, { "epoch": 2.64, "learning_rate": 6.632262996941895e-08, "logits/chosen": -2.834345817565918, "logits/rejected": -2.7858288288116455, "logps/chosen": -336.33648681640625, "logps/rejected": -366.84991455078125, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -0.12319626659154892, "rewards/margins": 9.35567569732666, "rewards/rejected": -9.478872299194336, "step": 2560 }, { "epoch": 2.65, "learning_rate": 6.441131498470948e-08, "logits/chosen": -2.847996950149536, "logits/rejected": -2.863615036010742, "logps/chosen": -304.58502197265625, "logps/rejected": -352.5277404785156, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.37131237983703613, "rewards/margins": 8.316202163696289, "rewards/rejected": -8.687514305114746, "step": 2570 }, { "epoch": 2.66, "learning_rate": 6.25e-08, "logits/chosen": -2.8685457706451416, "logits/rejected": -2.876739501953125, "logps/chosen": -394.1883850097656, "logps/rejected": -382.19287109375, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 0.5528956055641174, "rewards/margins": 9.068865776062012, "rewards/rejected": -8.515970230102539, "step": 2580 }, { "epoch": 2.67, "learning_rate": 6.058868501529052e-08, "logits/chosen": -2.9075653553009033, "logits/rejected": -2.8715763092041016, "logps/chosen": -366.0291442871094, "logps/rejected": -358.59381103515625, "loss": 0.0202, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.14853370189666748, "rewards/margins": 8.612794876098633, "rewards/rejected": -8.464262008666992, "step": 2590 }, { "epoch": 2.68, "learning_rate": 5.8677370030581035e-08, "logits/chosen": -2.797910213470459, "logits/rejected": -2.840148687362671, "logps/chosen": -331.3750305175781, "logps/rejected": -344.34332275390625, "loss": 0.011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.22872698307037354, "rewards/margins": 8.751152038574219, "rewards/rejected": -8.979879379272461, "step": 2600 }, { "epoch": 2.68, "eval_logits/chosen": -2.8258047103881836, "eval_logits/rejected": -2.8298983573913574, "eval_logps/chosen": -390.2113952636719, "eval_logps/rejected": -350.03887939453125, "eval_loss": 0.6998910307884216, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -2.301961660385132, "eval_rewards/margins": 3.5053274631500244, "eval_rewards/rejected": -5.807290077209473, "eval_runtime": 164.7101, "eval_samples_per_second": 12.143, "eval_steps_per_second": 0.382, "step": 2600 }, { "epoch": 2.69, "learning_rate": 5.6766055045871554e-08, "logits/chosen": -2.837218761444092, "logits/rejected": -2.8603646755218506, "logps/chosen": -325.1515197753906, "logps/rejected": -377.93707275390625, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7425957918167114, "rewards/margins": 9.372137069702148, "rewards/rejected": -8.629541397094727, "step": 2610 }, { "epoch": 2.7, "learning_rate": 5.485474006116208e-08, "logits/chosen": -2.859614372253418, "logits/rejected": -2.907731294631958, "logps/chosen": -323.7126159667969, "logps/rejected": -337.7955627441406, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 0.9508829116821289, "rewards/margins": 9.375367164611816, "rewards/rejected": -8.424482345581055, "step": 2620 }, { "epoch": 2.71, "learning_rate": 5.294342507645259e-08, "logits/chosen": -2.8355846405029297, "logits/rejected": -2.8445563316345215, "logps/chosen": -387.78021240234375, "logps/rejected": -341.34332275390625, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -0.06057599186897278, "rewards/margins": 8.586808204650879, "rewards/rejected": -8.647383689880371, "step": 2630 }, { "epoch": 2.72, "learning_rate": 5.1032110091743117e-08, "logits/chosen": -2.8416004180908203, "logits/rejected": -2.8135132789611816, "logps/chosen": -294.2474670410156, "logps/rejected": -342.1490173339844, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -1.0905003547668457, "rewards/margins": 7.913638114929199, "rewards/rejected": -9.004137992858887, "step": 2640 }, { "epoch": 2.73, "learning_rate": 4.9120795107033635e-08, "logits/chosen": -2.808621406555176, "logits/rejected": -2.8129184246063232, "logps/chosen": -361.94146728515625, "logps/rejected": -372.2251892089844, "loss": 0.0108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09845595061779022, "rewards/margins": 9.0397367477417, "rewards/rejected": -9.138191223144531, "step": 2650 }, { "epoch": 2.75, "learning_rate": 4.7209480122324154e-08, "logits/chosen": -2.857626438140869, "logits/rejected": -2.854701042175293, "logps/chosen": -334.3382568359375, "logps/rejected": -397.60504150390625, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.4237436354160309, "rewards/margins": 8.734308242797852, "rewards/rejected": -9.158050537109375, "step": 2660 }, { "epoch": 2.76, "learning_rate": 4.529816513761467e-08, "logits/chosen": -2.833742618560791, "logits/rejected": -2.848910093307495, "logps/chosen": -376.8042297363281, "logps/rejected": -431.098388671875, "loss": 0.0185, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09423612058162689, "rewards/margins": 8.842924118041992, "rewards/rejected": -8.937159538269043, "step": 2670 }, { "epoch": 2.77, "learning_rate": 4.33868501529052e-08, "logits/chosen": -2.8585665225982666, "logits/rejected": -2.888023614883423, "logps/chosen": -299.4255065917969, "logps/rejected": -347.65032958984375, "loss": 0.0227, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.39737778902053833, "rewards/margins": 8.897387504577637, "rewards/rejected": -9.294764518737793, "step": 2680 }, { "epoch": 2.78, "learning_rate": 4.147553516819572e-08, "logits/chosen": -2.7752485275268555, "logits/rejected": -2.7679455280303955, "logps/chosen": -288.93524169921875, "logps/rejected": -362.3262634277344, "loss": 0.0138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.21344709396362305, "rewards/margins": 8.969237327575684, "rewards/rejected": -9.182684898376465, "step": 2690 }, { "epoch": 2.79, "learning_rate": 3.9564220183486236e-08, "logits/chosen": -2.771638870239258, "logits/rejected": -2.7894372940063477, "logps/chosen": -358.27276611328125, "logps/rejected": -337.4700012207031, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.08803452551364899, "rewards/margins": 8.622003555297852, "rewards/rejected": -8.710036277770996, "step": 2700 }, { "epoch": 2.79, "eval_logits/chosen": -2.817159414291382, "eval_logits/rejected": -2.820690393447876, "eval_logps/chosen": -389.5738525390625, "eval_logps/rejected": -348.8511962890625, "eval_loss": 0.6951248645782471, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -2.238208055496216, "eval_rewards/margins": 3.4503118991851807, "eval_rewards/rejected": -5.6885199546813965, "eval_runtime": 164.1407, "eval_samples_per_second": 12.185, "eval_steps_per_second": 0.384, "step": 2700 }, { "epoch": 2.8, "learning_rate": 3.7652905198776755e-08, "logits/chosen": -2.8291115760803223, "logits/rejected": -2.812997817993164, "logps/chosen": -361.16973876953125, "logps/rejected": -371.3473205566406, "loss": 0.0109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2673804759979248, "rewards/margins": 8.214799880981445, "rewards/rejected": -8.482179641723633, "step": 2710 }, { "epoch": 2.81, "learning_rate": 3.574159021406728e-08, "logits/chosen": -2.8169431686401367, "logits/rejected": -2.780579090118408, "logps/chosen": -340.25567626953125, "logps/rejected": -452.1532287597656, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4266482889652252, "rewards/margins": 8.901152610778809, "rewards/rejected": -9.327801704406738, "step": 2720 }, { "epoch": 2.82, "learning_rate": 3.383027522935779e-08, "logits/chosen": -2.852733850479126, "logits/rejected": -2.8627407550811768, "logps/chosen": -345.10504150390625, "logps/rejected": -381.2701416015625, "loss": 0.0144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6709401607513428, "rewards/margins": 9.368196487426758, "rewards/rejected": -8.697256088256836, "step": 2730 }, { "epoch": 2.83, "learning_rate": 3.191896024464832e-08, "logits/chosen": -2.847033977508545, "logits/rejected": -2.880303382873535, "logps/chosen": -344.7592468261719, "logps/rejected": -366.505615234375, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 0.2561042606830597, "rewards/margins": 9.626019477844238, "rewards/rejected": -9.369915008544922, "step": 2740 }, { "epoch": 2.84, "learning_rate": 3.0007645259938836e-08, "logits/chosen": -2.829150676727295, "logits/rejected": -2.8306522369384766, "logps/chosen": -299.6269836425781, "logps/rejected": -362.5341796875, "loss": 0.0191, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.312942773103714, "rewards/margins": 10.283103942871094, "rewards/rejected": -9.970161437988281, "step": 2750 }, { "epoch": 2.85, "learning_rate": 2.809633027522936e-08, "logits/chosen": -2.815882444381714, "logits/rejected": -2.7824299335479736, "logps/chosen": -315.24993896484375, "logps/rejected": -347.3058776855469, "loss": 0.0207, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3008100688457489, "rewards/margins": 8.677629470825195, "rewards/rejected": -8.978440284729004, "step": 2760 }, { "epoch": 2.86, "learning_rate": 2.6185015290519877e-08, "logits/chosen": -2.8008246421813965, "logits/rejected": -2.7953882217407227, "logps/chosen": -333.69329833984375, "logps/rejected": -373.4231872558594, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -0.08695399761199951, "rewards/margins": 7.9680304527282715, "rewards/rejected": -8.054986000061035, "step": 2770 }, { "epoch": 2.87, "learning_rate": 2.4273700305810396e-08, "logits/chosen": -2.790097951889038, "logits/rejected": -2.827036142349243, "logps/chosen": -378.98236083984375, "logps/rejected": -420.4288635253906, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.11630038917064667, "rewards/margins": 10.843367576599121, "rewards/rejected": -10.727069854736328, "step": 2780 }, { "epoch": 2.88, "learning_rate": 2.2362385321100918e-08, "logits/chosen": -2.7879481315612793, "logits/rejected": -2.7845988273620605, "logps/chosen": -350.80572509765625, "logps/rejected": -345.9007873535156, "loss": 0.018, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.001116895698942244, "rewards/margins": 8.821355819702148, "rewards/rejected": -8.822473526000977, "step": 2790 }, { "epoch": 2.89, "learning_rate": 2.0451070336391437e-08, "logits/chosen": -2.7627055644989014, "logits/rejected": -2.7335832118988037, "logps/chosen": -341.35662841796875, "logps/rejected": -318.8877258300781, "loss": 0.0437, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1377846747636795, "rewards/margins": 8.672611236572266, "rewards/rejected": -8.534826278686523, "step": 2800 }, { "epoch": 2.89, "eval_logits/chosen": -2.8117332458496094, "eval_logits/rejected": -2.8151025772094727, "eval_logps/chosen": -389.4859924316406, "eval_logps/rejected": -348.1217346191406, "eval_loss": 0.6910788416862488, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -2.229426622390747, "eval_rewards/margins": 3.3861491680145264, "eval_rewards/rejected": -5.615575313568115, "eval_runtime": 165.138, "eval_samples_per_second": 12.111, "eval_steps_per_second": 0.381, "step": 2800 }, { "epoch": 2.9, "learning_rate": 1.8539755351681956e-08, "logits/chosen": -2.7365012168884277, "logits/rejected": -2.788407325744629, "logps/chosen": -330.33197021484375, "logps/rejected": -380.91168212890625, "loss": 0.0163, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0645025223493576, "rewards/margins": 8.317387580871582, "rewards/rejected": -8.381890296936035, "step": 2810 }, { "epoch": 2.91, "learning_rate": 1.6628440366972478e-08, "logits/chosen": -2.8033618927001953, "logits/rejected": -2.8255763053894043, "logps/chosen": -373.3817443847656, "logps/rejected": -359.75421142578125, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.008309101685881615, "rewards/margins": 8.045055389404297, "rewards/rejected": -8.03674602508545, "step": 2820 }, { "epoch": 2.92, "learning_rate": 1.4717125382262997e-08, "logits/chosen": -2.854548692703247, "logits/rejected": -2.8665812015533447, "logps/chosen": -339.0101318359375, "logps/rejected": -377.62847900390625, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6132253408432007, "rewards/margins": 8.178349494934082, "rewards/rejected": -8.791574478149414, "step": 2830 }, { "epoch": 2.93, "learning_rate": 1.2805810397553517e-08, "logits/chosen": -2.8801310062408447, "logits/rejected": -2.826385021209717, "logps/chosen": -346.5010681152344, "logps/rejected": -360.3970031738281, "loss": 0.026, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.022508572787046432, "rewards/margins": 8.316872596740723, "rewards/rejected": -8.33938217163086, "step": 2840 }, { "epoch": 2.94, "learning_rate": 1.0894495412844038e-08, "logits/chosen": -2.758545160293579, "logits/rejected": -2.7856967449188232, "logps/chosen": -326.37896728515625, "logps/rejected": -359.3761291503906, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 0.21567471325397491, "rewards/margins": 8.950045585632324, "rewards/rejected": -8.734369277954102, "step": 2850 }, { "epoch": 2.95, "learning_rate": 8.983180428134555e-09, "logits/chosen": -2.8466389179229736, "logits/rejected": -2.8278822898864746, "logps/chosen": -327.270751953125, "logps/rejected": -307.3416748046875, "loss": 0.0158, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5247852802276611, "rewards/margins": 8.214715957641602, "rewards/rejected": -8.739501953125, "step": 2860 }, { "epoch": 2.96, "learning_rate": 7.071865443425076e-09, "logits/chosen": -2.7788777351379395, "logits/rejected": -2.7975292205810547, "logps/chosen": -361.37384033203125, "logps/rejected": -391.8774719238281, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -0.35571426153182983, "rewards/margins": 8.981501579284668, "rewards/rejected": -9.337217330932617, "step": 2870 }, { "epoch": 2.97, "learning_rate": 5.1605504587155965e-09, "logits/chosen": -2.8489837646484375, "logits/rejected": -2.7892398834228516, "logps/chosen": -342.7001953125, "logps/rejected": -358.37469482421875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 0.04198342561721802, "rewards/margins": 8.752424240112305, "rewards/rejected": -8.710439682006836, "step": 2880 }, { "epoch": 2.98, "learning_rate": 3.249235474006116e-09, "logits/chosen": -2.8391404151916504, "logits/rejected": -2.862032175064087, "logps/chosen": -332.72100830078125, "logps/rejected": -362.6523132324219, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 0.023767167702317238, "rewards/margins": 8.220497131347656, "rewards/rejected": -8.19672966003418, "step": 2890 }, { "epoch": 2.99, "learning_rate": 1.3379204892966359e-09, "logits/chosen": -2.854654550552368, "logits/rejected": -2.8116354942321777, "logps/chosen": -330.1148986816406, "logps/rejected": -358.40155029296875, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.3439770042896271, "rewards/margins": 8.63255500793457, "rewards/rejected": -8.976531028747559, "step": 2900 }, { "epoch": 2.99, "eval_logits/chosen": -2.814802408218384, "eval_logits/rejected": -2.818735122680664, "eval_logps/chosen": -389.9677429199219, "eval_logps/rejected": -348.89801025390625, "eval_loss": 0.6909257769584656, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -2.2775967121124268, "eval_rewards/margins": 3.415607452392578, "eval_rewards/rejected": -5.693204402923584, "eval_runtime": 164.8452, "eval_samples_per_second": 12.133, "eval_steps_per_second": 0.382, "step": 2900 }, { "epoch": 3.0, "step": 2907, "total_flos": 0.0, "train_loss": 0.23139607249449978, "train_runtime": 34004.0578, "train_samples_per_second": 5.467, "train_steps_per_second": 0.085 } ], "logging_steps": 10, "max_steps": 2907, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }